commit ddb4c2c2da1dd3f4eed8ce9e71ba3d14ec39a19c Author: ModelHub XC Date: Mon May 25 16:31:18 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Neelectric/Llama-3.1-8B-Instruct_SFT_sciencev00.03 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..6989f0b --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/MoT_science_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_sciencev00.03 +tags: +- generated_from_trainer +- open-r1 +- sft +- trl +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_sciencev00.03 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/MoT_science_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/MoT_science_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_sciencev00.03", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_science/runs/uhw758x4) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.28.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..d8ae1e1 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 3.182940295913485e+19, + "train_loss": 0.8493060141035994, + "train_runtime": 32137.5108, + "train_samples": 145693, + "train_samples_per_second": 13.6, + "train_steps_per_second": 0.85 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: +... + + +... +" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e1d9068 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..cecac7c --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05e4cea7497de82b16d1729f503cd322d79610316668567fbce6f220e11725e +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..150b247 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae4f4bfe83ef1398f9f4404bf3da6be26cb3d575263e09b64f2c1776879882df +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..6b5a69d --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b3cfa24c8cb6fa09824e243a3f736318c90a585c1fb3fbccc04c7d03fad6160 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..12c957c --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3abb573a3ceba4049bd9099e2589c555948aee868a70ffa3577cc92faa8e0b5b +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..d8ae1e1 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 3.182940295913485e+19, + "train_loss": 0.8493060141035994, + "train_runtime": 32137.5108, + "train_samples": 145693, + "train_samples_per_second": 13.6, + "train_steps_per_second": 0.85 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..d0476aa --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,245905 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 27318, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010981770261366133, + "grad_norm": 5.988698959350586, + "learning_rate": 0.0, + "loss": 1.392, + "mean_token_accuracy": 0.6335183382034302, + "num_tokens": 31074.0, + "step": 1 + }, + { + "epoch": 0.00021963540522732265, + "grad_norm": 7.709012985229492, + "learning_rate": 1.8301610541727673e-09, + "loss": 1.5176, + "mean_token_accuracy": 0.6059308052062988, + "num_tokens": 54561.0, + "step": 2 + }, + { + "epoch": 0.000329453107840984, + "grad_norm": 6.730776786804199, + "learning_rate": 3.6603221083455347e-09, + "loss": 1.5323, + "mean_token_accuracy": 0.6023818850517273, + "num_tokens": 83797.0, + "step": 3 + }, + { + "epoch": 0.0004392708104546453, + "grad_norm": 6.558500289916992, + "learning_rate": 5.4904831625183024e-09, + "loss": 1.4906, + "mean_token_accuracy": 0.613278329372406, + "num_tokens": 114053.0, + "step": 4 + }, + { + "epoch": 0.0005490885130683066, + "grad_norm": 6.8331708908081055, + "learning_rate": 7.320644216691069e-09, + "loss": 1.5107, + "mean_token_accuracy": 0.609889030456543, + "num_tokens": 141603.0, + "step": 5 + }, + { + "epoch": 0.000658906215681968, + "grad_norm": 6.662057399749756, + "learning_rate": 9.150805270863836e-09, + "loss": 1.5153, + "mean_token_accuracy": 0.6010513305664062, + "num_tokens": 170995.0, + "step": 6 + }, + { + "epoch": 0.0007687239182956292, + "grad_norm": 7.228508472442627, + "learning_rate": 1.0980966325036605e-08, + "loss": 1.4474, + "mean_token_accuracy": 0.6196508407592773, + "num_tokens": 196275.0, + "step": 7 + }, + { + "epoch": 0.0008785416209092906, + "grad_norm": 7.458967685699463, + "learning_rate": 1.2811127379209372e-08, + "loss": 1.4555, + "mean_token_accuracy": 0.6182136535644531, + "num_tokens": 219328.0, + "step": 8 + }, + { + "epoch": 0.000988359323522952, + "grad_norm": 6.484447002410889, + "learning_rate": 1.4641288433382139e-08, + "loss": 1.4639, + "mean_token_accuracy": 0.615833580493927, + "num_tokens": 251394.0, + "step": 9 + }, + { + "epoch": 0.0010981770261366132, + "grad_norm": 8.234504699707031, + "learning_rate": 1.6471449487554907e-08, + "loss": 1.6103, + "mean_token_accuracy": 0.588140606880188, + "num_tokens": 274193.0, + "step": 10 + }, + { + "epoch": 0.0012079947287502745, + "grad_norm": 6.882067680358887, + "learning_rate": 1.8301610541727673e-08, + "loss": 1.4409, + "mean_token_accuracy": 0.6233946681022644, + "num_tokens": 300826.0, + "step": 11 + }, + { + "epoch": 0.001317812431363936, + "grad_norm": 6.914370536804199, + "learning_rate": 2.013177159590044e-08, + "loss": 1.56, + "mean_token_accuracy": 0.603441596031189, + "num_tokens": 327325.0, + "step": 12 + }, + { + "epoch": 0.0014276301339775973, + "grad_norm": 7.0081071853637695, + "learning_rate": 2.196193265007321e-08, + "loss": 1.5463, + "mean_token_accuracy": 0.5956589579582214, + "num_tokens": 355026.0, + "step": 13 + }, + { + "epoch": 0.0015374478365912585, + "grad_norm": 8.8041410446167, + "learning_rate": 2.3792093704245975e-08, + "loss": 1.5333, + "mean_token_accuracy": 0.6074589490890503, + "num_tokens": 374698.0, + "step": 14 + }, + { + "epoch": 0.0016472655392049198, + "grad_norm": 6.864985942840576, + "learning_rate": 2.5622254758418744e-08, + "loss": 1.5563, + "mean_token_accuracy": 0.5992418527603149, + "num_tokens": 403158.0, + "step": 15 + }, + { + "epoch": 0.0017570832418185812, + "grad_norm": 6.483746528625488, + "learning_rate": 2.7452415812591512e-08, + "loss": 1.3694, + "mean_token_accuracy": 0.6351869702339172, + "num_tokens": 429478.0, + "step": 16 + }, + { + "epoch": 0.0018669009444322424, + "grad_norm": 7.261568546295166, + "learning_rate": 2.9282576866764277e-08, + "loss": 1.4631, + "mean_token_accuracy": 0.6336261034011841, + "num_tokens": 454287.0, + "step": 17 + }, + { + "epoch": 0.001976718647045904, + "grad_norm": 8.020394325256348, + "learning_rate": 3.111273792093704e-08, + "loss": 1.4445, + "mean_token_accuracy": 0.6305864453315735, + "num_tokens": 475824.0, + "step": 18 + }, + { + "epoch": 0.002086536349659565, + "grad_norm": 6.431168556213379, + "learning_rate": 3.2942898975109815e-08, + "loss": 1.5088, + "mean_token_accuracy": 0.6091854572296143, + "num_tokens": 507291.0, + "step": 19 + }, + { + "epoch": 0.0021963540522732263, + "grad_norm": 7.116753101348877, + "learning_rate": 3.477306002928258e-08, + "loss": 1.4815, + "mean_token_accuracy": 0.6053434610366821, + "num_tokens": 532237.0, + "step": 20 + }, + { + "epoch": 0.002306171754886888, + "grad_norm": 8.30763053894043, + "learning_rate": 3.6603221083455345e-08, + "loss": 1.5391, + "mean_token_accuracy": 0.5965453386306763, + "num_tokens": 553465.0, + "step": 21 + }, + { + "epoch": 0.002415989457500549, + "grad_norm": 6.741918087005615, + "learning_rate": 3.843338213762812e-08, + "loss": 1.4741, + "mean_token_accuracy": 0.6187750697135925, + "num_tokens": 581684.0, + "step": 22 + }, + { + "epoch": 0.0025258071601142102, + "grad_norm": 6.339682102203369, + "learning_rate": 4.026354319180088e-08, + "loss": 1.5546, + "mean_token_accuracy": 0.597669780254364, + "num_tokens": 614806.0, + "step": 23 + }, + { + "epoch": 0.002635624862727872, + "grad_norm": 8.204667091369629, + "learning_rate": 4.209370424597365e-08, + "loss": 1.5312, + "mean_token_accuracy": 0.6057168245315552, + "num_tokens": 636138.0, + "step": 24 + }, + { + "epoch": 0.002745442565341533, + "grad_norm": 8.896713256835938, + "learning_rate": 4.392386530014642e-08, + "loss": 1.4681, + "mean_token_accuracy": 0.6225725412368774, + "num_tokens": 655219.0, + "step": 25 + }, + { + "epoch": 0.0028552602679551946, + "grad_norm": 8.047857284545898, + "learning_rate": 4.5754026354319185e-08, + "loss": 1.5803, + "mean_token_accuracy": 0.6006311178207397, + "num_tokens": 677363.0, + "step": 26 + }, + { + "epoch": 0.0029650779705688557, + "grad_norm": 7.266357421875, + "learning_rate": 4.758418740849195e-08, + "loss": 1.4364, + "mean_token_accuracy": 0.6175681948661804, + "num_tokens": 701684.0, + "step": 27 + }, + { + "epoch": 0.003074895673182517, + "grad_norm": 7.91674280166626, + "learning_rate": 4.941434846266472e-08, + "loss": 1.4722, + "mean_token_accuracy": 0.6179271936416626, + "num_tokens": 722995.0, + "step": 28 + }, + { + "epoch": 0.0031847133757961785, + "grad_norm": 6.926092624664307, + "learning_rate": 5.124450951683749e-08, + "loss": 1.4464, + "mean_token_accuracy": 0.6267740726470947, + "num_tokens": 749472.0, + "step": 29 + }, + { + "epoch": 0.0032945310784098397, + "grad_norm": 6.4611945152282715, + "learning_rate": 5.307467057101025e-08, + "loss": 1.4854, + "mean_token_accuracy": 0.6076056361198425, + "num_tokens": 778748.0, + "step": 30 + }, + { + "epoch": 0.003404348781023501, + "grad_norm": 8.22752857208252, + "learning_rate": 5.4904831625183024e-08, + "loss": 1.4845, + "mean_token_accuracy": 0.6203884482383728, + "num_tokens": 798956.0, + "step": 31 + }, + { + "epoch": 0.0035141664836371624, + "grad_norm": 7.043278694152832, + "learning_rate": 5.673499267935579e-08, + "loss": 1.5123, + "mean_token_accuracy": 0.6074115037918091, + "num_tokens": 824562.0, + "step": 32 + }, + { + "epoch": 0.0036239841862508236, + "grad_norm": 7.072328567504883, + "learning_rate": 5.8565153733528555e-08, + "loss": 1.4879, + "mean_token_accuracy": 0.614173173904419, + "num_tokens": 849838.0, + "step": 33 + }, + { + "epoch": 0.0037338018888644848, + "grad_norm": 7.119189739227295, + "learning_rate": 6.039531478770133e-08, + "loss": 1.5053, + "mean_token_accuracy": 0.6172736883163452, + "num_tokens": 873992.0, + "step": 34 + }, + { + "epoch": 0.0038436195914781464, + "grad_norm": 8.041004180908203, + "learning_rate": 6.222547584187409e-08, + "loss": 1.5781, + "mean_token_accuracy": 0.5886217951774597, + "num_tokens": 896497.0, + "step": 35 + }, + { + "epoch": 0.003953437294091808, + "grad_norm": 7.14554500579834, + "learning_rate": 6.405563689604686e-08, + "loss": 1.3644, + "mean_token_accuracy": 0.6379081010818481, + "num_tokens": 920089.0, + "step": 36 + }, + { + "epoch": 0.004063254996705469, + "grad_norm": 6.803360939025879, + "learning_rate": 6.588579795021963e-08, + "loss": 1.5012, + "mean_token_accuracy": 0.6018442511558533, + "num_tokens": 947134.0, + "step": 37 + }, + { + "epoch": 0.00417307269931913, + "grad_norm": 7.518194198608398, + "learning_rate": 6.771595900439239e-08, + "loss": 1.4871, + "mean_token_accuracy": 0.6054000854492188, + "num_tokens": 970334.0, + "step": 38 + }, + { + "epoch": 0.004282890401932792, + "grad_norm": 6.888261318206787, + "learning_rate": 6.954612005856516e-08, + "loss": 1.4975, + "mean_token_accuracy": 0.6066856980323792, + "num_tokens": 997900.0, + "step": 39 + }, + { + "epoch": 0.004392708104546453, + "grad_norm": 7.08450984954834, + "learning_rate": 7.137628111273793e-08, + "loss": 1.5246, + "mean_token_accuracy": 0.600643515586853, + "num_tokens": 1023728.0, + "step": 40 + }, + { + "epoch": 0.004502525807160114, + "grad_norm": 6.933853626251221, + "learning_rate": 7.320644216691069e-08, + "loss": 1.4718, + "mean_token_accuracy": 0.6180598139762878, + "num_tokens": 1050352.0, + "step": 41 + }, + { + "epoch": 0.004612343509773776, + "grad_norm": 6.983968257904053, + "learning_rate": 7.503660322108346e-08, + "loss": 1.3964, + "mean_token_accuracy": 0.6323193311691284, + "num_tokens": 1075825.0, + "step": 42 + }, + { + "epoch": 0.0047221612123874365, + "grad_norm": 7.499317169189453, + "learning_rate": 7.686676427525623e-08, + "loss": 1.4096, + "mean_token_accuracy": 0.625597357749939, + "num_tokens": 1098287.0, + "step": 43 + }, + { + "epoch": 0.004831978915001098, + "grad_norm": 7.56381893157959, + "learning_rate": 7.869692532942899e-08, + "loss": 1.5634, + "mean_token_accuracy": 0.589715838432312, + "num_tokens": 1121975.0, + "step": 44 + }, + { + "epoch": 0.00494179661761476, + "grad_norm": 6.396032333374023, + "learning_rate": 8.052708638360176e-08, + "loss": 1.4761, + "mean_token_accuracy": 0.6242425441741943, + "num_tokens": 1149322.0, + "step": 45 + }, + { + "epoch": 0.0050516143202284204, + "grad_norm": 6.677330017089844, + "learning_rate": 8.235724743777454e-08, + "loss": 1.4601, + "mean_token_accuracy": 0.6125476360321045, + "num_tokens": 1175319.0, + "step": 46 + }, + { + "epoch": 0.005161432022842082, + "grad_norm": 6.6233062744140625, + "learning_rate": 8.41874084919473e-08, + "loss": 1.5557, + "mean_token_accuracy": 0.6010117530822754, + "num_tokens": 1203669.0, + "step": 47 + }, + { + "epoch": 0.005271249725455744, + "grad_norm": 6.890981197357178, + "learning_rate": 8.601756954612007e-08, + "loss": 1.49, + "mean_token_accuracy": 0.6142376065254211, + "num_tokens": 1228422.0, + "step": 48 + }, + { + "epoch": 0.005381067428069404, + "grad_norm": 7.299238204956055, + "learning_rate": 8.784773060029284e-08, + "loss": 1.5516, + "mean_token_accuracy": 0.5993298888206482, + "num_tokens": 1252030.0, + "step": 49 + }, + { + "epoch": 0.005490885130683066, + "grad_norm": 6.336511135101318, + "learning_rate": 8.96778916544656e-08, + "loss": 1.4059, + "mean_token_accuracy": 0.6315608024597168, + "num_tokens": 1279118.0, + "step": 50 + }, + { + "epoch": 0.0056007028332967276, + "grad_norm": 6.340853214263916, + "learning_rate": 9.150805270863837e-08, + "loss": 1.4826, + "mean_token_accuracy": 0.6110865473747253, + "num_tokens": 1304928.0, + "step": 51 + }, + { + "epoch": 0.005710520535910389, + "grad_norm": 6.599939346313477, + "learning_rate": 9.333821376281114e-08, + "loss": 1.4669, + "mean_token_accuracy": 0.6110140085220337, + "num_tokens": 1330302.0, + "step": 52 + }, + { + "epoch": 0.00582033823852405, + "grad_norm": 6.179874897003174, + "learning_rate": 9.51683748169839e-08, + "loss": 1.4888, + "mean_token_accuracy": 0.6132649183273315, + "num_tokens": 1359034.0, + "step": 53 + }, + { + "epoch": 0.0059301559411377115, + "grad_norm": 6.503165245056152, + "learning_rate": 9.699853587115667e-08, + "loss": 1.4806, + "mean_token_accuracy": 0.6135605573654175, + "num_tokens": 1384480.0, + "step": 54 + }, + { + "epoch": 0.006039973643751373, + "grad_norm": 6.71858549118042, + "learning_rate": 9.882869692532944e-08, + "loss": 1.473, + "mean_token_accuracy": 0.6110966801643372, + "num_tokens": 1409834.0, + "step": 55 + }, + { + "epoch": 0.006149791346365034, + "grad_norm": 6.3191399574279785, + "learning_rate": 1.006588579795022e-07, + "loss": 1.4609, + "mean_token_accuracy": 0.6173813939094543, + "num_tokens": 1436384.0, + "step": 56 + }, + { + "epoch": 0.006259609048978695, + "grad_norm": 7.004606246948242, + "learning_rate": 1.0248901903367497e-07, + "loss": 1.4695, + "mean_token_accuracy": 0.6169832944869995, + "num_tokens": 1460088.0, + "step": 57 + }, + { + "epoch": 0.006369426751592357, + "grad_norm": 7.135741233825684, + "learning_rate": 1.0431918008784775e-07, + "loss": 1.4794, + "mean_token_accuracy": 0.6172667741775513, + "num_tokens": 1482660.0, + "step": 58 + }, + { + "epoch": 0.006479244454206018, + "grad_norm": 6.1265058517456055, + "learning_rate": 1.061493411420205e-07, + "loss": 1.588, + "mean_token_accuracy": 0.58420729637146, + "num_tokens": 1512572.0, + "step": 59 + }, + { + "epoch": 0.006589062156819679, + "grad_norm": 6.533516883850098, + "learning_rate": 1.0797950219619328e-07, + "loss": 1.4258, + "mean_token_accuracy": 0.6149400472640991, + "num_tokens": 1537685.0, + "step": 60 + }, + { + "epoch": 0.006698879859433341, + "grad_norm": 6.777708530426025, + "learning_rate": 1.0980966325036605e-07, + "loss": 1.397, + "mean_token_accuracy": 0.6320955753326416, + "num_tokens": 1561543.0, + "step": 61 + }, + { + "epoch": 0.006808697562047002, + "grad_norm": 6.705153465270996, + "learning_rate": 1.1163982430453881e-07, + "loss": 1.3771, + "mean_token_accuracy": 0.6319336891174316, + "num_tokens": 1585568.0, + "step": 62 + }, + { + "epoch": 0.006918515264660663, + "grad_norm": 7.558284282684326, + "learning_rate": 1.1346998535871158e-07, + "loss": 1.5296, + "mean_token_accuracy": 0.5963862538337708, + "num_tokens": 1607199.0, + "step": 63 + }, + { + "epoch": 0.007028332967274325, + "grad_norm": 6.612707138061523, + "learning_rate": 1.1530014641288435e-07, + "loss": 1.462, + "mean_token_accuracy": 0.6124200224876404, + "num_tokens": 1633223.0, + "step": 64 + }, + { + "epoch": 0.007138150669887986, + "grad_norm": 5.737194061279297, + "learning_rate": 1.1713030746705711e-07, + "loss": 1.3872, + "mean_token_accuracy": 0.6237789392471313, + "num_tokens": 1665877.0, + "step": 65 + }, + { + "epoch": 0.007247968372501647, + "grad_norm": 6.967468738555908, + "learning_rate": 1.1896046852122988e-07, + "loss": 1.4374, + "mean_token_accuracy": 0.6198134422302246, + "num_tokens": 1689212.0, + "step": 66 + }, + { + "epoch": 0.007357786075115309, + "grad_norm": 6.040383338928223, + "learning_rate": 1.2079062957540265e-07, + "loss": 1.3972, + "mean_token_accuracy": 0.6238062381744385, + "num_tokens": 1714951.0, + "step": 67 + }, + { + "epoch": 0.0074676037777289695, + "grad_norm": 6.195141315460205, + "learning_rate": 1.2262079062957543e-07, + "loss": 1.3199, + "mean_token_accuracy": 0.6472660899162292, + "num_tokens": 1737497.0, + "step": 68 + }, + { + "epoch": 0.007577421480342631, + "grad_norm": 5.675432205200195, + "learning_rate": 1.2445095168374817e-07, + "loss": 1.448, + "mean_token_accuracy": 0.6143748760223389, + "num_tokens": 1763399.0, + "step": 69 + }, + { + "epoch": 0.007687239182956293, + "grad_norm": 5.261104106903076, + "learning_rate": 1.2628111273792094e-07, + "loss": 1.5242, + "mean_token_accuracy": 0.6017512083053589, + "num_tokens": 1791755.0, + "step": 70 + }, + { + "epoch": 0.007797056885569954, + "grad_norm": 4.870793342590332, + "learning_rate": 1.2811127379209371e-07, + "loss": 1.3308, + "mean_token_accuracy": 0.6450766921043396, + "num_tokens": 1822942.0, + "step": 71 + }, + { + "epoch": 0.007906874588183616, + "grad_norm": 5.957615375518799, + "learning_rate": 1.2994143484626649e-07, + "loss": 1.4677, + "mean_token_accuracy": 0.6093469858169556, + "num_tokens": 1846609.0, + "step": 72 + }, + { + "epoch": 0.008016692290797276, + "grad_norm": 5.253456115722656, + "learning_rate": 1.3177159590043926e-07, + "loss": 1.4174, + "mean_token_accuracy": 0.6193337440490723, + "num_tokens": 1872448.0, + "step": 73 + }, + { + "epoch": 0.008126509993410937, + "grad_norm": 4.768847942352295, + "learning_rate": 1.3360175695461203e-07, + "loss": 1.4228, + "mean_token_accuracy": 0.6125335097312927, + "num_tokens": 1903480.0, + "step": 74 + }, + { + "epoch": 0.008236327696024599, + "grad_norm": 5.235701560974121, + "learning_rate": 1.3543191800878478e-07, + "loss": 1.3567, + "mean_token_accuracy": 0.63885498046875, + "num_tokens": 1928245.0, + "step": 75 + }, + { + "epoch": 0.00834614539863826, + "grad_norm": 5.3026323318481445, + "learning_rate": 1.3726207906295755e-07, + "loss": 1.4121, + "mean_token_accuracy": 0.6134922504425049, + "num_tokens": 1953911.0, + "step": 76 + }, + { + "epoch": 0.008455963101251922, + "grad_norm": 5.069142818450928, + "learning_rate": 1.3909224011713032e-07, + "loss": 1.5061, + "mean_token_accuracy": 0.5967615842819214, + "num_tokens": 1984062.0, + "step": 77 + }, + { + "epoch": 0.008565780803865584, + "grad_norm": 5.069657802581787, + "learning_rate": 1.409224011713031e-07, + "loss": 1.4355, + "mean_token_accuracy": 0.6088422536849976, + "num_tokens": 2010246.0, + "step": 78 + }, + { + "epoch": 0.008675598506479244, + "grad_norm": 4.927698135375977, + "learning_rate": 1.4275256222547586e-07, + "loss": 1.4, + "mean_token_accuracy": 0.6226131319999695, + "num_tokens": 2036371.0, + "step": 79 + }, + { + "epoch": 0.008785416209092905, + "grad_norm": 5.166134834289551, + "learning_rate": 1.4458272327964864e-07, + "loss": 1.3279, + "mean_token_accuracy": 0.6405322551727295, + "num_tokens": 2060143.0, + "step": 80 + }, + { + "epoch": 0.008895233911706567, + "grad_norm": 5.377108097076416, + "learning_rate": 1.4641288433382138e-07, + "loss": 1.441, + "mean_token_accuracy": 0.6136000752449036, + "num_tokens": 2084023.0, + "step": 81 + }, + { + "epoch": 0.009005051614320228, + "grad_norm": 4.697564125061035, + "learning_rate": 1.4824304538799415e-07, + "loss": 1.3316, + "mean_token_accuracy": 0.6402846574783325, + "num_tokens": 2111399.0, + "step": 82 + }, + { + "epoch": 0.00911486931693389, + "grad_norm": 4.600299835205078, + "learning_rate": 1.5007320644216692e-07, + "loss": 1.4341, + "mean_token_accuracy": 0.6095661520957947, + "num_tokens": 2139462.0, + "step": 83 + }, + { + "epoch": 0.009224687019547552, + "grad_norm": 4.790316104888916, + "learning_rate": 1.519033674963397e-07, + "loss": 1.3798, + "mean_token_accuracy": 0.6218881011009216, + "num_tokens": 2164271.0, + "step": 84 + }, + { + "epoch": 0.009334504722161213, + "grad_norm": 4.2401041984558105, + "learning_rate": 1.5373352855051247e-07, + "loss": 1.4086, + "mean_token_accuracy": 0.614416241645813, + "num_tokens": 2197822.0, + "step": 85 + }, + { + "epoch": 0.009444322424774873, + "grad_norm": 4.801418304443359, + "learning_rate": 1.5556368960468524e-07, + "loss": 1.3475, + "mean_token_accuracy": 0.6363457441329956, + "num_tokens": 2222493.0, + "step": 86 + }, + { + "epoch": 0.009554140127388535, + "grad_norm": 4.419833660125732, + "learning_rate": 1.5739385065885799e-07, + "loss": 1.4781, + "mean_token_accuracy": 0.5995503664016724, + "num_tokens": 2253038.0, + "step": 87 + }, + { + "epoch": 0.009663957830002196, + "grad_norm": 4.17234992980957, + "learning_rate": 1.5922401171303076e-07, + "loss": 1.3625, + "mean_token_accuracy": 0.624117374420166, + "num_tokens": 2283312.0, + "step": 88 + }, + { + "epoch": 0.009773775532615858, + "grad_norm": 4.038226127624512, + "learning_rate": 1.6105417276720353e-07, + "loss": 1.3842, + "mean_token_accuracy": 0.6329691410064697, + "num_tokens": 2318888.0, + "step": 89 + }, + { + "epoch": 0.00988359323522952, + "grad_norm": 4.470351219177246, + "learning_rate": 1.628843338213763e-07, + "loss": 1.4307, + "mean_token_accuracy": 0.6228741407394409, + "num_tokens": 2348756.0, + "step": 90 + }, + { + "epoch": 0.009993410937843181, + "grad_norm": 4.209887504577637, + "learning_rate": 1.6471449487554907e-07, + "loss": 1.2804, + "mean_token_accuracy": 0.6521115899085999, + "num_tokens": 2373926.0, + "step": 91 + }, + { + "epoch": 0.010103228640456841, + "grad_norm": 4.340063571929932, + "learning_rate": 1.6654465592972184e-07, + "loss": 1.4012, + "mean_token_accuracy": 0.6166192293167114, + "num_tokens": 2401887.0, + "step": 92 + }, + { + "epoch": 0.010213046343070502, + "grad_norm": 4.9677557945251465, + "learning_rate": 1.683748169838946e-07, + "loss": 1.2384, + "mean_token_accuracy": 0.6485952138900757, + "num_tokens": 2422686.0, + "step": 93 + }, + { + "epoch": 0.010322864045684164, + "grad_norm": 4.19266939163208, + "learning_rate": 1.7020497803806736e-07, + "loss": 1.3547, + "mean_token_accuracy": 0.6289039850234985, + "num_tokens": 2448162.0, + "step": 94 + }, + { + "epoch": 0.010432681748297826, + "grad_norm": 4.360573768615723, + "learning_rate": 1.7203513909224013e-07, + "loss": 1.3612, + "mean_token_accuracy": 0.6214061975479126, + "num_tokens": 2473764.0, + "step": 95 + }, + { + "epoch": 0.010542499450911487, + "grad_norm": 3.96142840385437, + "learning_rate": 1.738653001464129e-07, + "loss": 1.3843, + "mean_token_accuracy": 0.6173025369644165, + "num_tokens": 2503690.0, + "step": 96 + }, + { + "epoch": 0.010652317153525149, + "grad_norm": 4.094654560089111, + "learning_rate": 1.7569546120058568e-07, + "loss": 1.2968, + "mean_token_accuracy": 0.6396679878234863, + "num_tokens": 2529345.0, + "step": 97 + }, + { + "epoch": 0.010762134856138809, + "grad_norm": 4.036545276641846, + "learning_rate": 1.7752562225475845e-07, + "loss": 1.2865, + "mean_token_accuracy": 0.652564287185669, + "num_tokens": 2553970.0, + "step": 98 + }, + { + "epoch": 0.01087195255875247, + "grad_norm": 3.8276896476745605, + "learning_rate": 1.793557833089312e-07, + "loss": 1.3622, + "mean_token_accuracy": 0.6285756826400757, + "num_tokens": 2582078.0, + "step": 99 + }, + { + "epoch": 0.010981770261366132, + "grad_norm": 4.449720859527588, + "learning_rate": 1.8118594436310397e-07, + "loss": 1.302, + "mean_token_accuracy": 0.6415062546730042, + "num_tokens": 2601726.0, + "step": 100 + }, + { + "epoch": 0.011091587963979794, + "grad_norm": 3.620961904525757, + "learning_rate": 1.8301610541727674e-07, + "loss": 1.3936, + "mean_token_accuracy": 0.6272752285003662, + "num_tokens": 2631480.0, + "step": 101 + }, + { + "epoch": 0.011201405666593455, + "grad_norm": 3.9831454753875732, + "learning_rate": 1.848462664714495e-07, + "loss": 1.2598, + "mean_token_accuracy": 0.6489138603210449, + "num_tokens": 2655931.0, + "step": 102 + }, + { + "epoch": 0.011311223369207117, + "grad_norm": 3.74761962890625, + "learning_rate": 1.8667642752562228e-07, + "loss": 1.3106, + "mean_token_accuracy": 0.6348493099212646, + "num_tokens": 2681572.0, + "step": 103 + }, + { + "epoch": 0.011421041071820778, + "grad_norm": 3.9929988384246826, + "learning_rate": 1.8850658857979505e-07, + "loss": 1.3283, + "mean_token_accuracy": 0.639693021774292, + "num_tokens": 2704055.0, + "step": 104 + }, + { + "epoch": 0.011530858774434438, + "grad_norm": 3.4846994876861572, + "learning_rate": 1.903367496339678e-07, + "loss": 1.2829, + "mean_token_accuracy": 0.6468504667282104, + "num_tokens": 2732638.0, + "step": 105 + }, + { + "epoch": 0.0116406764770481, + "grad_norm": 4.159327030181885, + "learning_rate": 1.9216691068814057e-07, + "loss": 1.3482, + "mean_token_accuracy": 0.6292141675949097, + "num_tokens": 2756843.0, + "step": 106 + }, + { + "epoch": 0.011750494179661761, + "grad_norm": 3.6514973640441895, + "learning_rate": 1.9399707174231334e-07, + "loss": 1.2821, + "mean_token_accuracy": 0.6446664929389954, + "num_tokens": 2780503.0, + "step": 107 + }, + { + "epoch": 0.011860311882275423, + "grad_norm": 3.8156514167785645, + "learning_rate": 1.9582723279648612e-07, + "loss": 1.3105, + "mean_token_accuracy": 0.6333107948303223, + "num_tokens": 2803498.0, + "step": 108 + }, + { + "epoch": 0.011970129584889085, + "grad_norm": 3.803229331970215, + "learning_rate": 1.976573938506589e-07, + "loss": 1.2344, + "mean_token_accuracy": 0.662423312664032, + "num_tokens": 2828318.0, + "step": 109 + }, + { + "epoch": 0.012079947287502746, + "grad_norm": 3.984194278717041, + "learning_rate": 1.9948755490483166e-07, + "loss": 1.1838, + "mean_token_accuracy": 0.6767470836639404, + "num_tokens": 2850771.0, + "step": 110 + }, + { + "epoch": 0.012189764990116406, + "grad_norm": 3.7736332416534424, + "learning_rate": 2.013177159590044e-07, + "loss": 1.3213, + "mean_token_accuracy": 0.6344679594039917, + "num_tokens": 2877968.0, + "step": 111 + }, + { + "epoch": 0.012299582692730068, + "grad_norm": 3.8043322563171387, + "learning_rate": 2.0314787701317718e-07, + "loss": 1.3979, + "mean_token_accuracy": 0.6132405996322632, + "num_tokens": 2904426.0, + "step": 112 + }, + { + "epoch": 0.01240940039534373, + "grad_norm": 4.211339950561523, + "learning_rate": 2.0497803806734995e-07, + "loss": 1.2407, + "mean_token_accuracy": 0.6491400003433228, + "num_tokens": 2926267.0, + "step": 113 + }, + { + "epoch": 0.01251921809795739, + "grad_norm": 3.431424856185913, + "learning_rate": 2.0680819912152272e-07, + "loss": 1.3438, + "mean_token_accuracy": 0.6237856149673462, + "num_tokens": 2953034.0, + "step": 114 + }, + { + "epoch": 0.012629035800571052, + "grad_norm": 5.371855735778809, + "learning_rate": 2.086383601756955e-07, + "loss": 1.2759, + "mean_token_accuracy": 0.6410088539123535, + "num_tokens": 2971703.0, + "step": 115 + }, + { + "epoch": 0.012738853503184714, + "grad_norm": 4.0041422843933105, + "learning_rate": 2.1046852122986826e-07, + "loss": 1.2307, + "mean_token_accuracy": 0.6600528955459595, + "num_tokens": 2993674.0, + "step": 116 + }, + { + "epoch": 0.012848671205798374, + "grad_norm": 3.5415241718292236, + "learning_rate": 2.12298682284041e-07, + "loss": 1.2308, + "mean_token_accuracy": 0.6541603803634644, + "num_tokens": 3018473.0, + "step": 117 + }, + { + "epoch": 0.012958488908412035, + "grad_norm": 3.452284097671509, + "learning_rate": 2.1412884333821378e-07, + "loss": 1.2786, + "mean_token_accuracy": 0.6480268239974976, + "num_tokens": 3044514.0, + "step": 118 + }, + { + "epoch": 0.013068306611025697, + "grad_norm": 3.45027756690979, + "learning_rate": 2.1595900439238655e-07, + "loss": 1.3509, + "mean_token_accuracy": 0.6242484450340271, + "num_tokens": 3071031.0, + "step": 119 + }, + { + "epoch": 0.013178124313639359, + "grad_norm": 3.9973561763763428, + "learning_rate": 2.1778916544655933e-07, + "loss": 1.3345, + "mean_token_accuracy": 0.6405109167098999, + "num_tokens": 3095641.0, + "step": 120 + }, + { + "epoch": 0.01328794201625302, + "grad_norm": 3.877866506576538, + "learning_rate": 2.196193265007321e-07, + "loss": 1.156, + "mean_token_accuracy": 0.6718418598175049, + "num_tokens": 3118928.0, + "step": 121 + }, + { + "epoch": 0.013397759718866682, + "grad_norm": 4.140008449554443, + "learning_rate": 2.2144948755490487e-07, + "loss": 1.327, + "mean_token_accuracy": 0.6318884491920471, + "num_tokens": 3143843.0, + "step": 122 + }, + { + "epoch": 0.013507577421480343, + "grad_norm": 4.045380115509033, + "learning_rate": 2.2327964860907761e-07, + "loss": 1.2486, + "mean_token_accuracy": 0.6555131077766418, + "num_tokens": 3166841.0, + "step": 123 + }, + { + "epoch": 0.013617395124094003, + "grad_norm": 4.578455448150635, + "learning_rate": 2.2510980966325039e-07, + "loss": 1.3059, + "mean_token_accuracy": 0.6312591433525085, + "num_tokens": 3193250.0, + "step": 124 + }, + { + "epoch": 0.013727212826707665, + "grad_norm": 4.063391208648682, + "learning_rate": 2.2693997071742316e-07, + "loss": 1.3258, + "mean_token_accuracy": 0.6330127716064453, + "num_tokens": 3220522.0, + "step": 125 + }, + { + "epoch": 0.013837030529321327, + "grad_norm": 4.010960578918457, + "learning_rate": 2.2877013177159593e-07, + "loss": 1.1944, + "mean_token_accuracy": 0.654394268989563, + "num_tokens": 3245474.0, + "step": 126 + }, + { + "epoch": 0.013946848231934988, + "grad_norm": 4.322788238525391, + "learning_rate": 2.306002928257687e-07, + "loss": 1.2389, + "mean_token_accuracy": 0.6425864696502686, + "num_tokens": 3273782.0, + "step": 127 + }, + { + "epoch": 0.01405666593454865, + "grad_norm": 3.9469454288482666, + "learning_rate": 2.3243045387994145e-07, + "loss": 1.2848, + "mean_token_accuracy": 0.6428601741790771, + "num_tokens": 3299360.0, + "step": 128 + }, + { + "epoch": 0.014166483637162311, + "grad_norm": 3.65634822845459, + "learning_rate": 2.3426061493411422e-07, + "loss": 1.2563, + "mean_token_accuracy": 0.6478175520896912, + "num_tokens": 3326954.0, + "step": 129 + }, + { + "epoch": 0.014276301339775971, + "grad_norm": 4.557602882385254, + "learning_rate": 2.36090775988287e-07, + "loss": 1.3674, + "mean_token_accuracy": 0.6192685961723328, + "num_tokens": 3352218.0, + "step": 130 + }, + { + "epoch": 0.014386119042389633, + "grad_norm": 3.6736364364624023, + "learning_rate": 2.3792093704245976e-07, + "loss": 1.165, + "mean_token_accuracy": 0.6670578122138977, + "num_tokens": 3379699.0, + "step": 131 + }, + { + "epoch": 0.014495936745003294, + "grad_norm": 3.247772216796875, + "learning_rate": 2.397510980966325e-07, + "loss": 1.2907, + "mean_token_accuracy": 0.6428329944610596, + "num_tokens": 3410257.0, + "step": 132 + }, + { + "epoch": 0.014605754447616956, + "grad_norm": 4.547119617462158, + "learning_rate": 2.415812591508053e-07, + "loss": 1.2231, + "mean_token_accuracy": 0.6525284647941589, + "num_tokens": 3435993.0, + "step": 133 + }, + { + "epoch": 0.014715572150230618, + "grad_norm": 3.763603687286377, + "learning_rate": 2.4341142020497805e-07, + "loss": 1.2473, + "mean_token_accuracy": 0.6496945023536682, + "num_tokens": 3461380.0, + "step": 134 + }, + { + "epoch": 0.01482538985284428, + "grad_norm": 4.077746868133545, + "learning_rate": 2.4524158125915085e-07, + "loss": 1.2601, + "mean_token_accuracy": 0.6380690932273865, + "num_tokens": 3488051.0, + "step": 135 + }, + { + "epoch": 0.014935207555457939, + "grad_norm": 5.152332305908203, + "learning_rate": 2.470717423133236e-07, + "loss": 1.2531, + "mean_token_accuracy": 0.6508322358131409, + "num_tokens": 3513373.0, + "step": 136 + }, + { + "epoch": 0.0150450252580716, + "grad_norm": 4.302706718444824, + "learning_rate": 2.4890190336749634e-07, + "loss": 1.1471, + "mean_token_accuracy": 0.6711650490760803, + "num_tokens": 3533475.0, + "step": 137 + }, + { + "epoch": 0.015154842960685262, + "grad_norm": 3.5688467025756836, + "learning_rate": 2.5073206442166914e-07, + "loss": 1.3301, + "mean_token_accuracy": 0.6360520720481873, + "num_tokens": 3561234.0, + "step": 138 + }, + { + "epoch": 0.015264660663298924, + "grad_norm": 3.634084939956665, + "learning_rate": 2.525622254758419e-07, + "loss": 1.1949, + "mean_token_accuracy": 0.6635682582855225, + "num_tokens": 3591106.0, + "step": 139 + }, + { + "epoch": 0.015374478365912585, + "grad_norm": 3.486431360244751, + "learning_rate": 2.543923865300147e-07, + "loss": 1.2266, + "mean_token_accuracy": 0.6507273316383362, + "num_tokens": 3617921.0, + "step": 140 + }, + { + "epoch": 0.015484296068526247, + "grad_norm": 4.273600101470947, + "learning_rate": 2.5622254758418743e-07, + "loss": 1.2329, + "mean_token_accuracy": 0.6534054279327393, + "num_tokens": 3638915.0, + "step": 141 + }, + { + "epoch": 0.015594113771139909, + "grad_norm": 4.429962635040283, + "learning_rate": 2.5805270863836023e-07, + "loss": 1.1388, + "mean_token_accuracy": 0.6750228404998779, + "num_tokens": 3659126.0, + "step": 142 + }, + { + "epoch": 0.01570393147375357, + "grad_norm": 3.6751205921173096, + "learning_rate": 2.5988286969253297e-07, + "loss": 1.1156, + "mean_token_accuracy": 0.6828022003173828, + "num_tokens": 3681345.0, + "step": 143 + }, + { + "epoch": 0.015813749176367232, + "grad_norm": 3.1404335498809814, + "learning_rate": 2.617130307467057e-07, + "loss": 1.2833, + "mean_token_accuracy": 0.6417004466056824, + "num_tokens": 3709604.0, + "step": 144 + }, + { + "epoch": 0.01592356687898089, + "grad_norm": 3.3231992721557617, + "learning_rate": 2.635431918008785e-07, + "loss": 1.2744, + "mean_token_accuracy": 0.6437402963638306, + "num_tokens": 3742560.0, + "step": 145 + }, + { + "epoch": 0.01603338458159455, + "grad_norm": 3.2512285709381104, + "learning_rate": 2.6537335285505126e-07, + "loss": 1.2619, + "mean_token_accuracy": 0.6448155641555786, + "num_tokens": 3771269.0, + "step": 146 + }, + { + "epoch": 0.016143202284208215, + "grad_norm": 5.057307720184326, + "learning_rate": 2.6720351390922406e-07, + "loss": 1.2326, + "mean_token_accuracy": 0.6572617292404175, + "num_tokens": 3788970.0, + "step": 147 + }, + { + "epoch": 0.016253019986821875, + "grad_norm": 3.6729414463043213, + "learning_rate": 2.690336749633968e-07, + "loss": 1.1719, + "mean_token_accuracy": 0.669413685798645, + "num_tokens": 3813559.0, + "step": 148 + }, + { + "epoch": 0.016362837689435538, + "grad_norm": 3.0197010040283203, + "learning_rate": 2.7086383601756955e-07, + "loss": 1.2312, + "mean_token_accuracy": 0.6522884368896484, + "num_tokens": 3847572.0, + "step": 149 + }, + { + "epoch": 0.016472655392049198, + "grad_norm": 3.9062142372131348, + "learning_rate": 2.7269399707174235e-07, + "loss": 1.2275, + "mean_token_accuracy": 0.6522775292396545, + "num_tokens": 3868021.0, + "step": 150 + }, + { + "epoch": 0.01658247309466286, + "grad_norm": 3.3160839080810547, + "learning_rate": 2.745241581259151e-07, + "loss": 1.1508, + "mean_token_accuracy": 0.6690583229064941, + "num_tokens": 3893806.0, + "step": 151 + }, + { + "epoch": 0.01669229079727652, + "grad_norm": 3.2638304233551025, + "learning_rate": 2.763543191800879e-07, + "loss": 1.1631, + "mean_token_accuracy": 0.6755173802375793, + "num_tokens": 3921258.0, + "step": 152 + }, + { + "epoch": 0.01680210849989018, + "grad_norm": 3.2134156227111816, + "learning_rate": 2.7818448023426064e-07, + "loss": 1.2333, + "mean_token_accuracy": 0.6480783820152283, + "num_tokens": 3945616.0, + "step": 153 + }, + { + "epoch": 0.016911926202503844, + "grad_norm": 3.229661226272583, + "learning_rate": 2.8001464128843344e-07, + "loss": 1.256, + "mean_token_accuracy": 0.6523908972740173, + "num_tokens": 3975587.0, + "step": 154 + }, + { + "epoch": 0.017021743905117504, + "grad_norm": 3.256577730178833, + "learning_rate": 2.818448023426062e-07, + "loss": 1.2044, + "mean_token_accuracy": 0.658042848110199, + "num_tokens": 4001434.0, + "step": 155 + }, + { + "epoch": 0.017131561607731167, + "grad_norm": 3.8770029544830322, + "learning_rate": 2.8367496339677893e-07, + "loss": 1.1274, + "mean_token_accuracy": 0.6724061369895935, + "num_tokens": 4020682.0, + "step": 156 + }, + { + "epoch": 0.017241379310344827, + "grad_norm": 3.2044966220855713, + "learning_rate": 2.855051244509517e-07, + "loss": 1.2401, + "mean_token_accuracy": 0.6483820676803589, + "num_tokens": 4050676.0, + "step": 157 + }, + { + "epoch": 0.017351197012958487, + "grad_norm": 4.153353214263916, + "learning_rate": 2.8733528550512447e-07, + "loss": 1.0682, + "mean_token_accuracy": 0.6869837045669556, + "num_tokens": 4069433.0, + "step": 158 + }, + { + "epoch": 0.01746101471557215, + "grad_norm": 3.34678316116333, + "learning_rate": 2.8916544655929727e-07, + "loss": 1.2318, + "mean_token_accuracy": 0.647219181060791, + "num_tokens": 4095417.0, + "step": 159 + }, + { + "epoch": 0.01757083241818581, + "grad_norm": 3.135220766067505, + "learning_rate": 2.9099560761347e-07, + "loss": 1.2296, + "mean_token_accuracy": 0.6616467237472534, + "num_tokens": 4122427.0, + "step": 160 + }, + { + "epoch": 0.017680650120799474, + "grad_norm": 3.0098938941955566, + "learning_rate": 2.9282576866764276e-07, + "loss": 1.2755, + "mean_token_accuracy": 0.6450210809707642, + "num_tokens": 4152366.0, + "step": 161 + }, + { + "epoch": 0.017790467823413134, + "grad_norm": 3.5504515171051025, + "learning_rate": 2.9465592972181556e-07, + "loss": 1.2308, + "mean_token_accuracy": 0.6522648930549622, + "num_tokens": 4175678.0, + "step": 162 + }, + { + "epoch": 0.017900285526026797, + "grad_norm": 2.868105173110962, + "learning_rate": 2.964860907759883e-07, + "loss": 1.2133, + "mean_token_accuracy": 0.6661208271980286, + "num_tokens": 4206066.0, + "step": 163 + }, + { + "epoch": 0.018010103228640457, + "grad_norm": 3.975857973098755, + "learning_rate": 2.983162518301611e-07, + "loss": 1.2081, + "mean_token_accuracy": 0.6526422500610352, + "num_tokens": 4227497.0, + "step": 164 + }, + { + "epoch": 0.018119920931254117, + "grad_norm": 3.2163121700286865, + "learning_rate": 3.0014641288433385e-07, + "loss": 1.2266, + "mean_token_accuracy": 0.6544435024261475, + "num_tokens": 4255360.0, + "step": 165 + }, + { + "epoch": 0.01822973863386778, + "grad_norm": 3.5533149242401123, + "learning_rate": 3.0197657393850665e-07, + "loss": 1.2218, + "mean_token_accuracy": 0.652908444404602, + "num_tokens": 4288638.0, + "step": 166 + }, + { + "epoch": 0.01833955633648144, + "grad_norm": 4.03098726272583, + "learning_rate": 3.038067349926794e-07, + "loss": 1.2578, + "mean_token_accuracy": 0.6421217918395996, + "num_tokens": 4314349.0, + "step": 167 + }, + { + "epoch": 0.018449374039095103, + "grad_norm": 4.491618633270264, + "learning_rate": 3.0563689604685214e-07, + "loss": 1.1862, + "mean_token_accuracy": 0.6581841707229614, + "num_tokens": 4333756.0, + "step": 168 + }, + { + "epoch": 0.018559191741708763, + "grad_norm": 4.11190938949585, + "learning_rate": 3.0746705710102494e-07, + "loss": 1.2233, + "mean_token_accuracy": 0.6476842164993286, + "num_tokens": 4355666.0, + "step": 169 + }, + { + "epoch": 0.018669009444322426, + "grad_norm": 3.7382419109344482, + "learning_rate": 3.092972181551977e-07, + "loss": 1.1851, + "mean_token_accuracy": 0.6663314700126648, + "num_tokens": 4377860.0, + "step": 170 + }, + { + "epoch": 0.018778827146936086, + "grad_norm": 3.429738998413086, + "learning_rate": 3.111273792093705e-07, + "loss": 1.1884, + "mean_token_accuracy": 0.6618335247039795, + "num_tokens": 4403664.0, + "step": 171 + }, + { + "epoch": 0.018888644849549746, + "grad_norm": 3.271775960922241, + "learning_rate": 3.1295754026354317e-07, + "loss": 1.1835, + "mean_token_accuracy": 0.6610559225082397, + "num_tokens": 4435684.0, + "step": 172 + }, + { + "epoch": 0.01899846255216341, + "grad_norm": 3.3564045429229736, + "learning_rate": 3.1478770131771597e-07, + "loss": 1.2044, + "mean_token_accuracy": 0.6608788967132568, + "num_tokens": 4462258.0, + "step": 173 + }, + { + "epoch": 0.01910828025477707, + "grad_norm": 2.7060327529907227, + "learning_rate": 3.166178623718887e-07, + "loss": 1.1778, + "mean_token_accuracy": 0.6614301800727844, + "num_tokens": 4495426.0, + "step": 174 + }, + { + "epoch": 0.019218097957390733, + "grad_norm": 3.3993966579437256, + "learning_rate": 3.184480234260615e-07, + "loss": 1.3224, + "mean_token_accuracy": 0.6348024606704712, + "num_tokens": 4521065.0, + "step": 175 + }, + { + "epoch": 0.019327915660004392, + "grad_norm": 3.0366122722625732, + "learning_rate": 3.2027818448023426e-07, + "loss": 1.2469, + "mean_token_accuracy": 0.655142068862915, + "num_tokens": 4549800.0, + "step": 176 + }, + { + "epoch": 0.019437733362618052, + "grad_norm": 2.9666748046875, + "learning_rate": 3.2210834553440706e-07, + "loss": 1.1649, + "mean_token_accuracy": 0.6700648069381714, + "num_tokens": 4579866.0, + "step": 177 + }, + { + "epoch": 0.019547551065231716, + "grad_norm": 3.2044966220855713, + "learning_rate": 3.239385065885798e-07, + "loss": 1.1787, + "mean_token_accuracy": 0.6561389565467834, + "num_tokens": 4605876.0, + "step": 178 + }, + { + "epoch": 0.019657368767845376, + "grad_norm": 2.6089859008789062, + "learning_rate": 3.257686676427526e-07, + "loss": 1.2707, + "mean_token_accuracy": 0.6531457901000977, + "num_tokens": 4638660.0, + "step": 179 + }, + { + "epoch": 0.01976718647045904, + "grad_norm": 3.7837295532226562, + "learning_rate": 3.2759882869692535e-07, + "loss": 1.1668, + "mean_token_accuracy": 0.6620555520057678, + "num_tokens": 4658705.0, + "step": 180 + }, + { + "epoch": 0.0198770041730727, + "grad_norm": 2.9665770530700684, + "learning_rate": 3.2942898975109815e-07, + "loss": 1.2053, + "mean_token_accuracy": 0.6730668544769287, + "num_tokens": 4689036.0, + "step": 181 + }, + { + "epoch": 0.019986821875686362, + "grad_norm": 3.36736798286438, + "learning_rate": 3.312591508052709e-07, + "loss": 1.1507, + "mean_token_accuracy": 0.666716456413269, + "num_tokens": 4710640.0, + "step": 182 + }, + { + "epoch": 0.020096639578300022, + "grad_norm": 3.341442346572876, + "learning_rate": 3.330893118594437e-07, + "loss": 1.1975, + "mean_token_accuracy": 0.6544729471206665, + "num_tokens": 4734910.0, + "step": 183 + }, + { + "epoch": 0.020206457280913682, + "grad_norm": 3.2035021781921387, + "learning_rate": 3.349194729136164e-07, + "loss": 1.2569, + "mean_token_accuracy": 0.6401028633117676, + "num_tokens": 4759850.0, + "step": 184 + }, + { + "epoch": 0.020316274983527345, + "grad_norm": 2.9970030784606934, + "learning_rate": 3.367496339677892e-07, + "loss": 1.1008, + "mean_token_accuracy": 0.6799271106719971, + "num_tokens": 4783485.0, + "step": 185 + }, + { + "epoch": 0.020426092686141005, + "grad_norm": 2.9891834259033203, + "learning_rate": 3.385797950219619e-07, + "loss": 1.249, + "mean_token_accuracy": 0.6404869556427002, + "num_tokens": 4808685.0, + "step": 186 + }, + { + "epoch": 0.02053591038875467, + "grad_norm": 3.393881320953369, + "learning_rate": 3.404099560761347e-07, + "loss": 1.2525, + "mean_token_accuracy": 0.647480845451355, + "num_tokens": 4832421.0, + "step": 187 + }, + { + "epoch": 0.020645728091368328, + "grad_norm": 2.9852662086486816, + "learning_rate": 3.4224011713030747e-07, + "loss": 1.218, + "mean_token_accuracy": 0.6534949541091919, + "num_tokens": 4858534.0, + "step": 188 + }, + { + "epoch": 0.02075554579398199, + "grad_norm": 3.26723575592041, + "learning_rate": 3.4407027818448027e-07, + "loss": 1.1898, + "mean_token_accuracy": 0.6609129905700684, + "num_tokens": 4882260.0, + "step": 189 + }, + { + "epoch": 0.02086536349659565, + "grad_norm": 2.647145986557007, + "learning_rate": 3.45900439238653e-07, + "loss": 1.1607, + "mean_token_accuracy": 0.6656354069709778, + "num_tokens": 4912447.0, + "step": 190 + }, + { + "epoch": 0.02097518119920931, + "grad_norm": 2.910539150238037, + "learning_rate": 3.477306002928258e-07, + "loss": 1.1531, + "mean_token_accuracy": 0.6742715835571289, + "num_tokens": 4936759.0, + "step": 191 + }, + { + "epoch": 0.021084998901822975, + "grad_norm": 2.5706613063812256, + "learning_rate": 3.4956076134699856e-07, + "loss": 1.2343, + "mean_token_accuracy": 0.6463020443916321, + "num_tokens": 4972367.0, + "step": 192 + }, + { + "epoch": 0.021194816604436634, + "grad_norm": 3.6939892768859863, + "learning_rate": 3.5139092240117136e-07, + "loss": 1.1285, + "mean_token_accuracy": 0.6752742528915405, + "num_tokens": 4991846.0, + "step": 193 + }, + { + "epoch": 0.021304634307050298, + "grad_norm": 3.9443795680999756, + "learning_rate": 3.5322108345534405e-07, + "loss": 1.0916, + "mean_token_accuracy": 0.677848756313324, + "num_tokens": 5010256.0, + "step": 194 + }, + { + "epoch": 0.021414452009663958, + "grad_norm": 2.6785078048706055, + "learning_rate": 3.550512445095169e-07, + "loss": 1.1689, + "mean_token_accuracy": 0.6664788722991943, + "num_tokens": 5039708.0, + "step": 195 + }, + { + "epoch": 0.021524269712277617, + "grad_norm": 2.7057628631591797, + "learning_rate": 3.568814055636896e-07, + "loss": 1.2474, + "mean_token_accuracy": 0.6465043425559998, + "num_tokens": 5069744.0, + "step": 196 + }, + { + "epoch": 0.02163408741489128, + "grad_norm": 2.9601364135742188, + "learning_rate": 3.587115666178624e-07, + "loss": 1.1341, + "mean_token_accuracy": 0.6732813119888306, + "num_tokens": 5094835.0, + "step": 197 + }, + { + "epoch": 0.02174390511750494, + "grad_norm": 2.915632486343384, + "learning_rate": 3.6054172767203514e-07, + "loss": 1.1825, + "mean_token_accuracy": 0.656638503074646, + "num_tokens": 5119738.0, + "step": 198 + }, + { + "epoch": 0.021853722820118604, + "grad_norm": 2.5184268951416016, + "learning_rate": 3.6237188872620793e-07, + "loss": 1.235, + "mean_token_accuracy": 0.6472339034080505, + "num_tokens": 5148497.0, + "step": 199 + }, + { + "epoch": 0.021963540522732264, + "grad_norm": 2.935873508453369, + "learning_rate": 3.642020497803807e-07, + "loss": 1.2266, + "mean_token_accuracy": 0.6518561840057373, + "num_tokens": 5173767.0, + "step": 200 + }, + { + "epoch": 0.022073358225345927, + "grad_norm": 3.8963096141815186, + "learning_rate": 3.660322108345535e-07, + "loss": 1.0902, + "mean_token_accuracy": 0.6804190874099731, + "num_tokens": 5193465.0, + "step": 201 + }, + { + "epoch": 0.022183175927959587, + "grad_norm": 3.0631961822509766, + "learning_rate": 3.678623718887262e-07, + "loss": 1.1382, + "mean_token_accuracy": 0.6781226396560669, + "num_tokens": 5217114.0, + "step": 202 + }, + { + "epoch": 0.022292993630573247, + "grad_norm": 2.499818801879883, + "learning_rate": 3.69692532942899e-07, + "loss": 1.1905, + "mean_token_accuracy": 0.6569050550460815, + "num_tokens": 5246528.0, + "step": 203 + }, + { + "epoch": 0.02240281133318691, + "grad_norm": 2.6336796283721924, + "learning_rate": 3.7152269399707177e-07, + "loss": 1.1554, + "mean_token_accuracy": 0.6674355268478394, + "num_tokens": 5276532.0, + "step": 204 + }, + { + "epoch": 0.02251262903580057, + "grad_norm": 2.597310781478882, + "learning_rate": 3.7335285505124457e-07, + "loss": 1.0987, + "mean_token_accuracy": 0.6783553957939148, + "num_tokens": 5304539.0, + "step": 205 + }, + { + "epoch": 0.022622446738414233, + "grad_norm": 2.8688840866088867, + "learning_rate": 3.7518301610541726e-07, + "loss": 1.2041, + "mean_token_accuracy": 0.6571409106254578, + "num_tokens": 5330843.0, + "step": 206 + }, + { + "epoch": 0.022732264441027893, + "grad_norm": 2.4987502098083496, + "learning_rate": 3.770131771595901e-07, + "loss": 1.23, + "mean_token_accuracy": 0.6428174376487732, + "num_tokens": 5361802.0, + "step": 207 + }, + { + "epoch": 0.022842082143641557, + "grad_norm": 3.7543394565582275, + "learning_rate": 3.788433382137628e-07, + "loss": 1.1844, + "mean_token_accuracy": 0.6551060080528259, + "num_tokens": 5381784.0, + "step": 208 + }, + { + "epoch": 0.022951899846255217, + "grad_norm": 2.9658091068267822, + "learning_rate": 3.806734992679356e-07, + "loss": 1.1771, + "mean_token_accuracy": 0.6596943736076355, + "num_tokens": 5409671.0, + "step": 209 + }, + { + "epoch": 0.023061717548868876, + "grad_norm": 2.705029249191284, + "learning_rate": 3.8250366032210835e-07, + "loss": 1.0528, + "mean_token_accuracy": 0.6899898052215576, + "num_tokens": 5438389.0, + "step": 210 + }, + { + "epoch": 0.02317153525148254, + "grad_norm": 3.1171023845672607, + "learning_rate": 3.8433382137628114e-07, + "loss": 1.0908, + "mean_token_accuracy": 0.6818212866783142, + "num_tokens": 5460093.0, + "step": 211 + }, + { + "epoch": 0.0232813529540962, + "grad_norm": 2.7327678203582764, + "learning_rate": 3.861639824304539e-07, + "loss": 1.2382, + "mean_token_accuracy": 0.6514479517936707, + "num_tokens": 5487423.0, + "step": 212 + }, + { + "epoch": 0.023391170656709863, + "grad_norm": 2.8478589057922363, + "learning_rate": 3.879941434846267e-07, + "loss": 1.1061, + "mean_token_accuracy": 0.676876425743103, + "num_tokens": 5513069.0, + "step": 213 + }, + { + "epoch": 0.023500988359323523, + "grad_norm": 3.199213743209839, + "learning_rate": 3.8982430453879943e-07, + "loss": 1.1738, + "mean_token_accuracy": 0.6656695604324341, + "num_tokens": 5534321.0, + "step": 214 + }, + { + "epoch": 0.023610806061937183, + "grad_norm": 2.7326738834381104, + "learning_rate": 3.9165446559297223e-07, + "loss": 1.1883, + "mean_token_accuracy": 0.654737114906311, + "num_tokens": 5561518.0, + "step": 215 + }, + { + "epoch": 0.023720623764550846, + "grad_norm": 2.6012167930603027, + "learning_rate": 3.93484626647145e-07, + "loss": 1.0505, + "mean_token_accuracy": 0.6960687637329102, + "num_tokens": 5588559.0, + "step": 216 + }, + { + "epoch": 0.023830441467164506, + "grad_norm": 2.46223783493042, + "learning_rate": 3.953147877013178e-07, + "loss": 1.2127, + "mean_token_accuracy": 0.6468876600265503, + "num_tokens": 5617676.0, + "step": 217 + }, + { + "epoch": 0.02394025916977817, + "grad_norm": 2.7259037494659424, + "learning_rate": 3.9714494875549047e-07, + "loss": 1.2198, + "mean_token_accuracy": 0.6541846394538879, + "num_tokens": 5645917.0, + "step": 218 + }, + { + "epoch": 0.02405007687239183, + "grad_norm": 3.012094736099243, + "learning_rate": 3.989751098096633e-07, + "loss": 1.0545, + "mean_token_accuracy": 0.6940882205963135, + "num_tokens": 5668206.0, + "step": 219 + }, + { + "epoch": 0.024159894575005492, + "grad_norm": 2.574899673461914, + "learning_rate": 4.00805270863836e-07, + "loss": 1.0791, + "mean_token_accuracy": 0.6860181093215942, + "num_tokens": 5693349.0, + "step": 220 + }, + { + "epoch": 0.024269712277619152, + "grad_norm": 3.717142343521118, + "learning_rate": 4.026354319180088e-07, + "loss": 1.039, + "mean_token_accuracy": 0.6982893943786621, + "num_tokens": 5712014.0, + "step": 221 + }, + { + "epoch": 0.024379529980232812, + "grad_norm": 2.934199810028076, + "learning_rate": 4.0446559297218155e-07, + "loss": 1.1418, + "mean_token_accuracy": 0.6783616542816162, + "num_tokens": 5737231.0, + "step": 222 + }, + { + "epoch": 0.024489347682846475, + "grad_norm": 3.0451607704162598, + "learning_rate": 4.0629575402635435e-07, + "loss": 1.175, + "mean_token_accuracy": 0.662371039390564, + "num_tokens": 5758827.0, + "step": 223 + }, + { + "epoch": 0.024599165385460135, + "grad_norm": 2.9462881088256836, + "learning_rate": 4.081259150805271e-07, + "loss": 1.1933, + "mean_token_accuracy": 0.659816324710846, + "num_tokens": 5783941.0, + "step": 224 + }, + { + "epoch": 0.0247089830880738, + "grad_norm": 3.061868190765381, + "learning_rate": 4.099560761346999e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.6960210204124451, + "num_tokens": 5807662.0, + "step": 225 + }, + { + "epoch": 0.02481880079068746, + "grad_norm": 2.7937474250793457, + "learning_rate": 4.1178623718887264e-07, + "loss": 1.2072, + "mean_token_accuracy": 0.6515583992004395, + "num_tokens": 5834416.0, + "step": 226 + }, + { + "epoch": 0.024928618493301122, + "grad_norm": 3.4098901748657227, + "learning_rate": 4.1361639824304544e-07, + "loss": 1.0264, + "mean_token_accuracy": 0.695879340171814, + "num_tokens": 5852923.0, + "step": 227 + }, + { + "epoch": 0.02503843619591478, + "grad_norm": 2.6620469093322754, + "learning_rate": 4.154465592972182e-07, + "loss": 1.1497, + "mean_token_accuracy": 0.6789113283157349, + "num_tokens": 5878350.0, + "step": 228 + }, + { + "epoch": 0.02514825389852844, + "grad_norm": 2.74064302444458, + "learning_rate": 4.17276720351391e-07, + "loss": 1.2115, + "mean_token_accuracy": 0.655072808265686, + "num_tokens": 5905840.0, + "step": 229 + }, + { + "epoch": 0.025258071601142105, + "grad_norm": 2.8623039722442627, + "learning_rate": 4.191068814055637e-07, + "loss": 1.0999, + "mean_token_accuracy": 0.6877780556678772, + "num_tokens": 5928855.0, + "step": 230 + }, + { + "epoch": 0.025367889303755765, + "grad_norm": 2.8885514736175537, + "learning_rate": 4.2093704245973653e-07, + "loss": 1.1177, + "mean_token_accuracy": 0.6733992099761963, + "num_tokens": 5950402.0, + "step": 231 + }, + { + "epoch": 0.025477707006369428, + "grad_norm": 2.9372811317443848, + "learning_rate": 4.227672035139092e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6934570670127869, + "num_tokens": 5972900.0, + "step": 232 + }, + { + "epoch": 0.025587524708983088, + "grad_norm": 2.7090201377868652, + "learning_rate": 4.24597364568082e-07, + "loss": 1.0885, + "mean_token_accuracy": 0.6785494685173035, + "num_tokens": 5996111.0, + "step": 233 + }, + { + "epoch": 0.025697342411596748, + "grad_norm": 2.659456729888916, + "learning_rate": 4.2642752562225476e-07, + "loss": 1.1682, + "mean_token_accuracy": 0.6663960218429565, + "num_tokens": 6025406.0, + "step": 234 + }, + { + "epoch": 0.02580716011421041, + "grad_norm": 2.5585436820983887, + "learning_rate": 4.2825768667642756e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.691234290599823, + "num_tokens": 6051701.0, + "step": 235 + }, + { + "epoch": 0.02591697781682407, + "grad_norm": 2.7170779705047607, + "learning_rate": 4.300878477306003e-07, + "loss": 1.0972, + "mean_token_accuracy": 0.6849727034568787, + "num_tokens": 6075021.0, + "step": 236 + }, + { + "epoch": 0.026026795519437734, + "grad_norm": 3.170278549194336, + "learning_rate": 4.319180087847731e-07, + "loss": 1.1054, + "mean_token_accuracy": 0.6725242137908936, + "num_tokens": 6095142.0, + "step": 237 + }, + { + "epoch": 0.026136613222051394, + "grad_norm": 2.7470204830169678, + "learning_rate": 4.3374816983894585e-07, + "loss": 1.1109, + "mean_token_accuracy": 0.6783125400543213, + "num_tokens": 6121179.0, + "step": 238 + }, + { + "epoch": 0.026246430924665057, + "grad_norm": 2.8201308250427246, + "learning_rate": 4.3557833089311865e-07, + "loss": 1.192, + "mean_token_accuracy": 0.666208028793335, + "num_tokens": 6145224.0, + "step": 239 + }, + { + "epoch": 0.026356248627278717, + "grad_norm": 2.5290045738220215, + "learning_rate": 4.374084919472914e-07, + "loss": 1.2086, + "mean_token_accuracy": 0.6558765172958374, + "num_tokens": 6173946.0, + "step": 240 + }, + { + "epoch": 0.026466066329892377, + "grad_norm": 2.5830068588256836, + "learning_rate": 4.392386530014642e-07, + "loss": 1.0942, + "mean_token_accuracy": 0.6839110255241394, + "num_tokens": 6200097.0, + "step": 241 + }, + { + "epoch": 0.02657588403250604, + "grad_norm": 2.6306140422821045, + "learning_rate": 4.410688140556369e-07, + "loss": 1.1779, + "mean_token_accuracy": 0.6700119972229004, + "num_tokens": 6225844.0, + "step": 242 + }, + { + "epoch": 0.0266857017351197, + "grad_norm": 2.5194036960601807, + "learning_rate": 4.4289897510980974e-07, + "loss": 1.1793, + "mean_token_accuracy": 0.6566219329833984, + "num_tokens": 6250867.0, + "step": 243 + }, + { + "epoch": 0.026795519437733364, + "grad_norm": 2.2837488651275635, + "learning_rate": 4.4472913616398243e-07, + "loss": 1.2216, + "mean_token_accuracy": 0.6461571455001831, + "num_tokens": 6280676.0, + "step": 244 + }, + { + "epoch": 0.026905337140347024, + "grad_norm": 2.4190144538879395, + "learning_rate": 4.4655929721815523e-07, + "loss": 1.1918, + "mean_token_accuracy": 0.663524866104126, + "num_tokens": 6306590.0, + "step": 245 + }, + { + "epoch": 0.027015154842960687, + "grad_norm": 2.3145461082458496, + "learning_rate": 4.48389458272328e-07, + "loss": 1.1088, + "mean_token_accuracy": 0.6780857443809509, + "num_tokens": 6336134.0, + "step": 246 + }, + { + "epoch": 0.027124972545574347, + "grad_norm": 2.5802388191223145, + "learning_rate": 4.5021961932650077e-07, + "loss": 1.1618, + "mean_token_accuracy": 0.6678459644317627, + "num_tokens": 6362804.0, + "step": 247 + }, + { + "epoch": 0.027234790248188007, + "grad_norm": 2.6193912029266357, + "learning_rate": 4.520497803806735e-07, + "loss": 1.2667, + "mean_token_accuracy": 0.637610137462616, + "num_tokens": 6387269.0, + "step": 248 + }, + { + "epoch": 0.02734460795080167, + "grad_norm": 2.5432932376861572, + "learning_rate": 4.538799414348463e-07, + "loss": 1.1575, + "mean_token_accuracy": 0.6648458242416382, + "num_tokens": 6415918.0, + "step": 249 + }, + { + "epoch": 0.02745442565341533, + "grad_norm": 2.3514370918273926, + "learning_rate": 4.5571010248901906e-07, + "loss": 1.1049, + "mean_token_accuracy": 0.6820707321166992, + "num_tokens": 6445585.0, + "step": 250 + }, + { + "epoch": 0.027564243356028993, + "grad_norm": 2.4509708881378174, + "learning_rate": 4.5754026354319186e-07, + "loss": 1.1019, + "mean_token_accuracy": 0.6816365718841553, + "num_tokens": 6472950.0, + "step": 251 + }, + { + "epoch": 0.027674061058642653, + "grad_norm": 2.7400639057159424, + "learning_rate": 4.593704245973646e-07, + "loss": 1.0156, + "mean_token_accuracy": 0.7017265558242798, + "num_tokens": 6494530.0, + "step": 252 + }, + { + "epoch": 0.027783878761256313, + "grad_norm": 2.3024239540100098, + "learning_rate": 4.612005856515374e-07, + "loss": 1.1461, + "mean_token_accuracy": 0.6616600751876831, + "num_tokens": 6522876.0, + "step": 253 + }, + { + "epoch": 0.027893696463869976, + "grad_norm": 2.77717661857605, + "learning_rate": 4.630307467057101e-07, + "loss": 1.1263, + "mean_token_accuracy": 0.6705452799797058, + "num_tokens": 6545241.0, + "step": 254 + }, + { + "epoch": 0.028003514166483636, + "grad_norm": 2.8273656368255615, + "learning_rate": 4.648609077598829e-07, + "loss": 1.0643, + "mean_token_accuracy": 0.6855105757713318, + "num_tokens": 6566634.0, + "step": 255 + }, + { + "epoch": 0.0281133318690973, + "grad_norm": 2.9507229328155518, + "learning_rate": 4.6669106881405564e-07, + "loss": 1.1007, + "mean_token_accuracy": 0.6788308620452881, + "num_tokens": 6588292.0, + "step": 256 + }, + { + "epoch": 0.02822314957171096, + "grad_norm": 2.3700950145721436, + "learning_rate": 4.6852122986822844e-07, + "loss": 1.0626, + "mean_token_accuracy": 0.6846873760223389, + "num_tokens": 6615944.0, + "step": 257 + }, + { + "epoch": 0.028332967274324623, + "grad_norm": 2.540876626968384, + "learning_rate": 4.703513909224012e-07, + "loss": 1.1277, + "mean_token_accuracy": 0.6692619919776917, + "num_tokens": 6643543.0, + "step": 258 + }, + { + "epoch": 0.028442784976938282, + "grad_norm": 2.37385892868042, + "learning_rate": 4.72181551976574e-07, + "loss": 1.097, + "mean_token_accuracy": 0.6774890422821045, + "num_tokens": 6673533.0, + "step": 259 + }, + { + "epoch": 0.028552602679551942, + "grad_norm": 2.9975175857543945, + "learning_rate": 4.7401171303074673e-07, + "loss": 1.2046, + "mean_token_accuracy": 0.6555564403533936, + "num_tokens": 6695530.0, + "step": 260 + }, + { + "epoch": 0.028662420382165606, + "grad_norm": 2.689340114593506, + "learning_rate": 4.758418740849195e-07, + "loss": 1.1316, + "mean_token_accuracy": 0.6668024659156799, + "num_tokens": 6718104.0, + "step": 261 + }, + { + "epoch": 0.028772238084779266, + "grad_norm": 2.448853015899658, + "learning_rate": 4.776720351390922e-07, + "loss": 1.129, + "mean_token_accuracy": 0.6756620407104492, + "num_tokens": 6746415.0, + "step": 262 + }, + { + "epoch": 0.02888205578739293, + "grad_norm": 2.4084649085998535, + "learning_rate": 4.79502196193265e-07, + "loss": 1.1204, + "mean_token_accuracy": 0.6712230443954468, + "num_tokens": 6775299.0, + "step": 263 + }, + { + "epoch": 0.02899187349000659, + "grad_norm": 2.574849843978882, + "learning_rate": 4.813323572474378e-07, + "loss": 1.0706, + "mean_token_accuracy": 0.6846632361412048, + "num_tokens": 6800718.0, + "step": 264 + }, + { + "epoch": 0.029101691192620252, + "grad_norm": 2.919645309448242, + "learning_rate": 4.831625183016106e-07, + "loss": 1.1357, + "mean_token_accuracy": 0.6689141392707825, + "num_tokens": 6827983.0, + "step": 265 + }, + { + "epoch": 0.029211508895233912, + "grad_norm": 2.6848788261413574, + "learning_rate": 4.849926793557833e-07, + "loss": 1.1416, + "mean_token_accuracy": 0.6702640056610107, + "num_tokens": 6852449.0, + "step": 266 + }, + { + "epoch": 0.029321326597847572, + "grad_norm": 2.6321089267730713, + "learning_rate": 4.868228404099561e-07, + "loss": 1.1319, + "mean_token_accuracy": 0.6725228428840637, + "num_tokens": 6877637.0, + "step": 267 + }, + { + "epoch": 0.029431144300461235, + "grad_norm": 2.7246322631835938, + "learning_rate": 4.886530014641289e-07, + "loss": 1.0099, + "mean_token_accuracy": 0.6947791576385498, + "num_tokens": 6899140.0, + "step": 268 + }, + { + "epoch": 0.029540962003074895, + "grad_norm": 2.4785914421081543, + "learning_rate": 4.904831625183017e-07, + "loss": 1.1837, + "mean_token_accuracy": 0.6714632511138916, + "num_tokens": 6926971.0, + "step": 269 + }, + { + "epoch": 0.02965077970568856, + "grad_norm": 2.371887445449829, + "learning_rate": 4.923133235724744e-07, + "loss": 1.0073, + "mean_token_accuracy": 0.7062453627586365, + "num_tokens": 6953582.0, + "step": 270 + }, + { + "epoch": 0.029760597408302218, + "grad_norm": 2.570542812347412, + "learning_rate": 4.941434846266472e-07, + "loss": 1.0639, + "mean_token_accuracy": 0.6947340369224548, + "num_tokens": 6979856.0, + "step": 271 + }, + { + "epoch": 0.029870415110915878, + "grad_norm": 2.421010732650757, + "learning_rate": 4.959736456808199e-07, + "loss": 1.1305, + "mean_token_accuracy": 0.6773236989974976, + "num_tokens": 7005165.0, + "step": 272 + }, + { + "epoch": 0.02998023281352954, + "grad_norm": 2.593508243560791, + "learning_rate": 4.978038067349927e-07, + "loss": 1.144, + "mean_token_accuracy": 0.6699409484863281, + "num_tokens": 7029736.0, + "step": 273 + }, + { + "epoch": 0.0300900505161432, + "grad_norm": 2.5641274452209473, + "learning_rate": 4.996339677891655e-07, + "loss": 1.1712, + "mean_token_accuracy": 0.663267195224762, + "num_tokens": 7055302.0, + "step": 274 + }, + { + "epoch": 0.030199868218756865, + "grad_norm": 2.6321847438812256, + "learning_rate": 5.014641288433383e-07, + "loss": 1.1083, + "mean_token_accuracy": 0.6740577220916748, + "num_tokens": 7080590.0, + "step": 275 + }, + { + "epoch": 0.030309685921370524, + "grad_norm": 2.6958224773406982, + "learning_rate": 5.03294289897511e-07, + "loss": 1.1596, + "mean_token_accuracy": 0.6631611585617065, + "num_tokens": 7104478.0, + "step": 276 + }, + { + "epoch": 0.030419503623984188, + "grad_norm": 2.4939944744110107, + "learning_rate": 5.051244509516838e-07, + "loss": 1.1597, + "mean_token_accuracy": 0.6631972193717957, + "num_tokens": 7133006.0, + "step": 277 + }, + { + "epoch": 0.030529321326597848, + "grad_norm": 2.537271499633789, + "learning_rate": 5.069546120058566e-07, + "loss": 1.1903, + "mean_token_accuracy": 0.6498050093650818, + "num_tokens": 7161834.0, + "step": 278 + }, + { + "epoch": 0.030639139029211507, + "grad_norm": 2.2745776176452637, + "learning_rate": 5.087847730600294e-07, + "loss": 1.1006, + "mean_token_accuracy": 0.6758937239646912, + "num_tokens": 7191766.0, + "step": 279 + }, + { + "epoch": 0.03074895673182517, + "grad_norm": 2.193084955215454, + "learning_rate": 5.106149341142021e-07, + "loss": 1.2133, + "mean_token_accuracy": 0.6484830975532532, + "num_tokens": 7223705.0, + "step": 280 + }, + { + "epoch": 0.03085877443443883, + "grad_norm": 2.6575138568878174, + "learning_rate": 5.124450951683749e-07, + "loss": 1.081, + "mean_token_accuracy": 0.6826989054679871, + "num_tokens": 7248624.0, + "step": 281 + }, + { + "epoch": 0.030968592137052494, + "grad_norm": 3.0322189331054688, + "learning_rate": 5.142752562225477e-07, + "loss": 0.9396, + "mean_token_accuracy": 0.7130846977233887, + "num_tokens": 7266954.0, + "step": 282 + }, + { + "epoch": 0.031078409839666154, + "grad_norm": 2.202932834625244, + "learning_rate": 5.161054172767205e-07, + "loss": 1.1448, + "mean_token_accuracy": 0.6685200333595276, + "num_tokens": 7297490.0, + "step": 283 + }, + { + "epoch": 0.031188227542279817, + "grad_norm": 2.6323490142822266, + "learning_rate": 5.179355783308931e-07, + "loss": 1.1335, + "mean_token_accuracy": 0.6668334007263184, + "num_tokens": 7321275.0, + "step": 284 + }, + { + "epoch": 0.03129804524489348, + "grad_norm": 2.6313068866729736, + "learning_rate": 5.197657393850659e-07, + "loss": 1.0881, + "mean_token_accuracy": 0.6723230481147766, + "num_tokens": 7345913.0, + "step": 285 + }, + { + "epoch": 0.03140786294750714, + "grad_norm": 2.610790729522705, + "learning_rate": 5.215959004392386e-07, + "loss": 1.1133, + "mean_token_accuracy": 0.6728963851928711, + "num_tokens": 7371287.0, + "step": 286 + }, + { + "epoch": 0.0315176806501208, + "grad_norm": 2.3407909870147705, + "learning_rate": 5.234260614934114e-07, + "loss": 1.1439, + "mean_token_accuracy": 0.672377347946167, + "num_tokens": 7400618.0, + "step": 287 + }, + { + "epoch": 0.031627498352734464, + "grad_norm": 2.652750253677368, + "learning_rate": 5.252562225475842e-07, + "loss": 1.0278, + "mean_token_accuracy": 0.6989157199859619, + "num_tokens": 7422596.0, + "step": 288 + }, + { + "epoch": 0.03173731605534812, + "grad_norm": 2.4602925777435303, + "learning_rate": 5.27086383601757e-07, + "loss": 1.158, + "mean_token_accuracy": 0.659257173538208, + "num_tokens": 7447711.0, + "step": 289 + }, + { + "epoch": 0.03184713375796178, + "grad_norm": 2.8848037719726562, + "learning_rate": 5.289165446559297e-07, + "loss": 0.9604, + "mean_token_accuracy": 0.7118693590164185, + "num_tokens": 7468487.0, + "step": 290 + }, + { + "epoch": 0.03195695146057544, + "grad_norm": 2.7135424613952637, + "learning_rate": 5.307467057101025e-07, + "loss": 1.0458, + "mean_token_accuracy": 0.6877069473266602, + "num_tokens": 7489346.0, + "step": 291 + }, + { + "epoch": 0.0320667691631891, + "grad_norm": 2.7722554206848145, + "learning_rate": 5.325768667642753e-07, + "loss": 1.1274, + "mean_token_accuracy": 0.6710218191146851, + "num_tokens": 7511427.0, + "step": 292 + }, + { + "epoch": 0.03217658686580277, + "grad_norm": 2.5017354488372803, + "learning_rate": 5.344070278184481e-07, + "loss": 1.1264, + "mean_token_accuracy": 0.6718500852584839, + "num_tokens": 7540350.0, + "step": 293 + }, + { + "epoch": 0.03228640456841643, + "grad_norm": 2.2578608989715576, + "learning_rate": 5.362371888726208e-07, + "loss": 1.1138, + "mean_token_accuracy": 0.678106963634491, + "num_tokens": 7570260.0, + "step": 294 + }, + { + "epoch": 0.03239622227103009, + "grad_norm": 2.5766305923461914, + "learning_rate": 5.380673499267936e-07, + "loss": 1.0974, + "mean_token_accuracy": 0.6781374216079712, + "num_tokens": 7595672.0, + "step": 295 + }, + { + "epoch": 0.03250603997364375, + "grad_norm": 2.9550974369049072, + "learning_rate": 5.398975109809663e-07, + "loss": 1.1459, + "mean_token_accuracy": 0.6670591235160828, + "num_tokens": 7616295.0, + "step": 296 + }, + { + "epoch": 0.03261585767625741, + "grad_norm": 2.2992968559265137, + "learning_rate": 5.417276720351391e-07, + "loss": 1.0801, + "mean_token_accuracy": 0.6777212023735046, + "num_tokens": 7644130.0, + "step": 297 + }, + { + "epoch": 0.032725675378871076, + "grad_norm": 2.604607105255127, + "learning_rate": 5.435578330893119e-07, + "loss": 1.0242, + "mean_token_accuracy": 0.6969271898269653, + "num_tokens": 7666795.0, + "step": 298 + }, + { + "epoch": 0.032835493081484736, + "grad_norm": 2.266309976577759, + "learning_rate": 5.453879941434847e-07, + "loss": 1.0886, + "mean_token_accuracy": 0.6855642795562744, + "num_tokens": 7695066.0, + "step": 299 + }, + { + "epoch": 0.032945310784098396, + "grad_norm": 2.3312225341796875, + "learning_rate": 5.472181551976574e-07, + "loss": 1.073, + "mean_token_accuracy": 0.691187858581543, + "num_tokens": 7719074.0, + "step": 300 + }, + { + "epoch": 0.033055128486712056, + "grad_norm": 2.812713384628296, + "learning_rate": 5.490483162518302e-07, + "loss": 1.1274, + "mean_token_accuracy": 0.6828641891479492, + "num_tokens": 7739717.0, + "step": 301 + }, + { + "epoch": 0.03316494618932572, + "grad_norm": 2.5025644302368164, + "learning_rate": 5.50878477306003e-07, + "loss": 1.0806, + "mean_token_accuracy": 0.6832683086395264, + "num_tokens": 7764269.0, + "step": 302 + }, + { + "epoch": 0.03327476389193938, + "grad_norm": 2.474789619445801, + "learning_rate": 5.527086383601758e-07, + "loss": 1.0724, + "mean_token_accuracy": 0.6833196878433228, + "num_tokens": 7787606.0, + "step": 303 + }, + { + "epoch": 0.03338458159455304, + "grad_norm": 2.520263671875, + "learning_rate": 5.545387994143485e-07, + "loss": 1.0532, + "mean_token_accuracy": 0.6914522051811218, + "num_tokens": 7812857.0, + "step": 304 + }, + { + "epoch": 0.0334943992971667, + "grad_norm": 2.510159730911255, + "learning_rate": 5.563689604685213e-07, + "loss": 1.1498, + "mean_token_accuracy": 0.6660964488983154, + "num_tokens": 7837868.0, + "step": 305 + }, + { + "epoch": 0.03360421699978036, + "grad_norm": 2.4392309188842773, + "learning_rate": 5.581991215226941e-07, + "loss": 1.2138, + "mean_token_accuracy": 0.6568025350570679, + "num_tokens": 7864105.0, + "step": 306 + }, + { + "epoch": 0.03371403470239403, + "grad_norm": 2.3945844173431396, + "learning_rate": 5.600292825768669e-07, + "loss": 1.0924, + "mean_token_accuracy": 0.6758068799972534, + "num_tokens": 7888848.0, + "step": 307 + }, + { + "epoch": 0.03382385240500769, + "grad_norm": 2.299315929412842, + "learning_rate": 5.618594436310396e-07, + "loss": 1.1362, + "mean_token_accuracy": 0.6694443225860596, + "num_tokens": 7915538.0, + "step": 308 + }, + { + "epoch": 0.03393367010762135, + "grad_norm": 2.2365386486053467, + "learning_rate": 5.636896046852124e-07, + "loss": 1.1737, + "mean_token_accuracy": 0.6659106016159058, + "num_tokens": 7945386.0, + "step": 309 + }, + { + "epoch": 0.03404348781023501, + "grad_norm": 2.401862859725952, + "learning_rate": 5.655197657393851e-07, + "loss": 1.0886, + "mean_token_accuracy": 0.6787183880805969, + "num_tokens": 7970517.0, + "step": 310 + }, + { + "epoch": 0.03415330551284867, + "grad_norm": 2.063718557357788, + "learning_rate": 5.673499267935579e-07, + "loss": 1.1667, + "mean_token_accuracy": 0.6626712083816528, + "num_tokens": 8005512.0, + "step": 311 + }, + { + "epoch": 0.034263123215462335, + "grad_norm": 2.3038814067840576, + "learning_rate": 5.691800878477307e-07, + "loss": 1.1977, + "mean_token_accuracy": 0.6613548994064331, + "num_tokens": 8033792.0, + "step": 312 + }, + { + "epoch": 0.034372940918075995, + "grad_norm": 2.2983579635620117, + "learning_rate": 5.710102489019035e-07, + "loss": 1.1147, + "mean_token_accuracy": 0.6866854429244995, + "num_tokens": 8063312.0, + "step": 313 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 2.416680335998535, + "learning_rate": 5.728404099560761e-07, + "loss": 1.0484, + "mean_token_accuracy": 0.6900269985198975, + "num_tokens": 8088802.0, + "step": 314 + }, + { + "epoch": 0.034592576323303315, + "grad_norm": 2.5399794578552246, + "learning_rate": 5.746705710102489e-07, + "loss": 1.1059, + "mean_token_accuracy": 0.6782264709472656, + "num_tokens": 8112792.0, + "step": 315 + }, + { + "epoch": 0.034702394025916974, + "grad_norm": 2.596006393432617, + "learning_rate": 5.765007320644217e-07, + "loss": 1.0789, + "mean_token_accuracy": 0.689464807510376, + "num_tokens": 8136677.0, + "step": 316 + }, + { + "epoch": 0.03481221172853064, + "grad_norm": 2.101789951324463, + "learning_rate": 5.783308931185945e-07, + "loss": 1.0451, + "mean_token_accuracy": 0.6951925158500671, + "num_tokens": 8168088.0, + "step": 317 + }, + { + "epoch": 0.0349220294311443, + "grad_norm": 2.552476406097412, + "learning_rate": 5.801610541727672e-07, + "loss": 1.1158, + "mean_token_accuracy": 0.6700053215026855, + "num_tokens": 8192137.0, + "step": 318 + }, + { + "epoch": 0.03503184713375796, + "grad_norm": 2.6506881713867188, + "learning_rate": 5.8199121522694e-07, + "loss": 1.0001, + "mean_token_accuracy": 0.6975290775299072, + "num_tokens": 8213537.0, + "step": 319 + }, + { + "epoch": 0.03514166483637162, + "grad_norm": 2.1701879501342773, + "learning_rate": 5.838213762811127e-07, + "loss": 1.1523, + "mean_token_accuracy": 0.6644915342330933, + "num_tokens": 8244992.0, + "step": 320 + }, + { + "epoch": 0.03525148253898529, + "grad_norm": 2.579134225845337, + "learning_rate": 5.856515373352855e-07, + "loss": 1.0942, + "mean_token_accuracy": 0.6850491166114807, + "num_tokens": 8269300.0, + "step": 321 + }, + { + "epoch": 0.03536130024159895, + "grad_norm": 2.432335376739502, + "learning_rate": 5.874816983894583e-07, + "loss": 1.1027, + "mean_token_accuracy": 0.6717942357063293, + "num_tokens": 8295719.0, + "step": 322 + }, + { + "epoch": 0.03547111794421261, + "grad_norm": 2.484015464782715, + "learning_rate": 5.893118594436311e-07, + "loss": 1.1059, + "mean_token_accuracy": 0.6778780221939087, + "num_tokens": 8319976.0, + "step": 323 + }, + { + "epoch": 0.03558093564682627, + "grad_norm": 2.192431926727295, + "learning_rate": 5.911420204978038e-07, + "loss": 1.1488, + "mean_token_accuracy": 0.6668640375137329, + "num_tokens": 8349269.0, + "step": 324 + }, + { + "epoch": 0.03569075334943993, + "grad_norm": 2.7661592960357666, + "learning_rate": 5.929721815519766e-07, + "loss": 1.0606, + "mean_token_accuracy": 0.6845899224281311, + "num_tokens": 8370467.0, + "step": 325 + }, + { + "epoch": 0.035800571052053594, + "grad_norm": 2.6731934547424316, + "learning_rate": 5.948023426061494e-07, + "loss": 1.137, + "mean_token_accuracy": 0.6695430278778076, + "num_tokens": 8393695.0, + "step": 326 + }, + { + "epoch": 0.035910388754667254, + "grad_norm": 2.2749807834625244, + "learning_rate": 5.966325036603222e-07, + "loss": 1.1631, + "mean_token_accuracy": 0.6670844554901123, + "num_tokens": 8423918.0, + "step": 327 + }, + { + "epoch": 0.036020206457280914, + "grad_norm": 2.3354434967041016, + "learning_rate": 5.984626647144949e-07, + "loss": 1.1857, + "mean_token_accuracy": 0.6586282253265381, + "num_tokens": 8453032.0, + "step": 328 + }, + { + "epoch": 0.03613002415989457, + "grad_norm": 1.9776688814163208, + "learning_rate": 6.002928257686677e-07, + "loss": 1.0335, + "mean_token_accuracy": 0.6954900026321411, + "num_tokens": 8487271.0, + "step": 329 + }, + { + "epoch": 0.03623984186250823, + "grad_norm": 2.3019816875457764, + "learning_rate": 6.021229868228404e-07, + "loss": 1.0707, + "mean_token_accuracy": 0.6860865950584412, + "num_tokens": 8514693.0, + "step": 330 + }, + { + "epoch": 0.0363496595651219, + "grad_norm": 2.506800413131714, + "learning_rate": 6.039531478770133e-07, + "loss": 1.1328, + "mean_token_accuracy": 0.6677663326263428, + "num_tokens": 8540547.0, + "step": 331 + }, + { + "epoch": 0.03645947726773556, + "grad_norm": 2.3942203521728516, + "learning_rate": 6.05783308931186e-07, + "loss": 1.0915, + "mean_token_accuracy": 0.677158772945404, + "num_tokens": 8564986.0, + "step": 332 + }, + { + "epoch": 0.03656929497034922, + "grad_norm": 2.393873929977417, + "learning_rate": 6.076134699853588e-07, + "loss": 1.0672, + "mean_token_accuracy": 0.6907179951667786, + "num_tokens": 8589549.0, + "step": 333 + }, + { + "epoch": 0.03667911267296288, + "grad_norm": 2.692852258682251, + "learning_rate": 6.094436310395315e-07, + "loss": 1.1156, + "mean_token_accuracy": 0.6744906902313232, + "num_tokens": 8612366.0, + "step": 334 + }, + { + "epoch": 0.03678893037557654, + "grad_norm": 2.1967830657958984, + "learning_rate": 6.112737920937043e-07, + "loss": 1.0285, + "mean_token_accuracy": 0.6959508657455444, + "num_tokens": 8641505.0, + "step": 335 + }, + { + "epoch": 0.036898748078190206, + "grad_norm": 2.576068878173828, + "learning_rate": 6.131039531478771e-07, + "loss": 1.0871, + "mean_token_accuracy": 0.6810929775238037, + "num_tokens": 8664134.0, + "step": 336 + }, + { + "epoch": 0.037008565780803866, + "grad_norm": 2.4507532119750977, + "learning_rate": 6.149341142020499e-07, + "loss": 1.0919, + "mean_token_accuracy": 0.6814320087432861, + "num_tokens": 8687941.0, + "step": 337 + }, + { + "epoch": 0.037118383483417526, + "grad_norm": 2.463315486907959, + "learning_rate": 6.167642752562226e-07, + "loss": 1.1463, + "mean_token_accuracy": 0.6766359806060791, + "num_tokens": 8714254.0, + "step": 338 + }, + { + "epoch": 0.037228201186031186, + "grad_norm": 2.9057791233062744, + "learning_rate": 6.185944363103954e-07, + "loss": 1.0557, + "mean_token_accuracy": 0.6902493238449097, + "num_tokens": 8735395.0, + "step": 339 + }, + { + "epoch": 0.03733801888864485, + "grad_norm": 2.4759674072265625, + "learning_rate": 6.204245973645682e-07, + "loss": 1.0742, + "mean_token_accuracy": 0.6770930886268616, + "num_tokens": 8759605.0, + "step": 340 + }, + { + "epoch": 0.03744783659125851, + "grad_norm": 2.4836719036102295, + "learning_rate": 6.22254758418741e-07, + "loss": 1.1337, + "mean_token_accuracy": 0.6719181537628174, + "num_tokens": 8784071.0, + "step": 341 + }, + { + "epoch": 0.03755765429387217, + "grad_norm": 2.11950421333313, + "learning_rate": 6.240849194729137e-07, + "loss": 1.1688, + "mean_token_accuracy": 0.6596482992172241, + "num_tokens": 8816948.0, + "step": 342 + }, + { + "epoch": 0.03766747199648583, + "grad_norm": 2.3567543029785156, + "learning_rate": 6.259150805270863e-07, + "loss": 1.1262, + "mean_token_accuracy": 0.6773577928543091, + "num_tokens": 8845941.0, + "step": 343 + }, + { + "epoch": 0.03777728969909949, + "grad_norm": 2.5270471572875977, + "learning_rate": 6.277452415812591e-07, + "loss": 1.0907, + "mean_token_accuracy": 0.6818406581878662, + "num_tokens": 8870517.0, + "step": 344 + }, + { + "epoch": 0.03788710740171316, + "grad_norm": 2.1377856731414795, + "learning_rate": 6.295754026354319e-07, + "loss": 1.0795, + "mean_token_accuracy": 0.6838027238845825, + "num_tokens": 8898869.0, + "step": 345 + }, + { + "epoch": 0.03799692510432682, + "grad_norm": 2.6280086040496826, + "learning_rate": 6.314055636896047e-07, + "loss": 1.107, + "mean_token_accuracy": 0.6768750548362732, + "num_tokens": 8921864.0, + "step": 346 + }, + { + "epoch": 0.03810674280694048, + "grad_norm": 2.359504461288452, + "learning_rate": 6.332357247437774e-07, + "loss": 1.1115, + "mean_token_accuracy": 0.6697350740432739, + "num_tokens": 8948803.0, + "step": 347 + }, + { + "epoch": 0.03821656050955414, + "grad_norm": 2.3179500102996826, + "learning_rate": 6.350658857979502e-07, + "loss": 1.0972, + "mean_token_accuracy": 0.6789036393165588, + "num_tokens": 8976725.0, + "step": 348 + }, + { + "epoch": 0.0383263782121678, + "grad_norm": 2.3203556537628174, + "learning_rate": 6.36896046852123e-07, + "loss": 1.1396, + "mean_token_accuracy": 0.6797265410423279, + "num_tokens": 9004148.0, + "step": 349 + }, + { + "epoch": 0.038436195914781465, + "grad_norm": 2.318265199661255, + "learning_rate": 6.387262079062958e-07, + "loss": 1.1119, + "mean_token_accuracy": 0.6692647337913513, + "num_tokens": 9034117.0, + "step": 350 + }, + { + "epoch": 0.038546013617395125, + "grad_norm": 2.357966184616089, + "learning_rate": 6.405563689604685e-07, + "loss": 1.1254, + "mean_token_accuracy": 0.6771513223648071, + "num_tokens": 9058914.0, + "step": 351 + }, + { + "epoch": 0.038655831320008785, + "grad_norm": 2.4373457431793213, + "learning_rate": 6.423865300146413e-07, + "loss": 1.0508, + "mean_token_accuracy": 0.683131992816925, + "num_tokens": 9082923.0, + "step": 352 + }, + { + "epoch": 0.038765649022622445, + "grad_norm": 2.6252105236053467, + "learning_rate": 6.442166910688141e-07, + "loss": 0.996, + "mean_token_accuracy": 0.7094027996063232, + "num_tokens": 9103636.0, + "step": 353 + }, + { + "epoch": 0.038875466725236105, + "grad_norm": 3.2774853706359863, + "learning_rate": 6.460468521229869e-07, + "loss": 1.0971, + "mean_token_accuracy": 0.6847522258758545, + "num_tokens": 9120844.0, + "step": 354 + }, + { + "epoch": 0.03898528442784977, + "grad_norm": 2.5244157314300537, + "learning_rate": 6.478770131771596e-07, + "loss": 1.1376, + "mean_token_accuracy": 0.6742476224899292, + "num_tokens": 9145101.0, + "step": 355 + }, + { + "epoch": 0.03909510213046343, + "grad_norm": 2.220973491668701, + "learning_rate": 6.497071742313324e-07, + "loss": 1.1918, + "mean_token_accuracy": 0.6504552960395813, + "num_tokens": 9174760.0, + "step": 356 + }, + { + "epoch": 0.03920491983307709, + "grad_norm": 2.530170440673828, + "learning_rate": 6.515373352855052e-07, + "loss": 1.16, + "mean_token_accuracy": 0.66435706615448, + "num_tokens": 9198501.0, + "step": 357 + }, + { + "epoch": 0.03931473753569075, + "grad_norm": 2.4881086349487305, + "learning_rate": 6.53367496339678e-07, + "loss": 1.1007, + "mean_token_accuracy": 0.6788725852966309, + "num_tokens": 9221687.0, + "step": 358 + }, + { + "epoch": 0.03942455523830442, + "grad_norm": 1.8599920272827148, + "learning_rate": 6.551976573938507e-07, + "loss": 1.1502, + "mean_token_accuracy": 0.6658376455307007, + "num_tokens": 9257292.0, + "step": 359 + }, + { + "epoch": 0.03953437294091808, + "grad_norm": 2.0035829544067383, + "learning_rate": 6.570278184480235e-07, + "loss": 1.0286, + "mean_token_accuracy": 0.6964173913002014, + "num_tokens": 9290128.0, + "step": 360 + }, + { + "epoch": 0.03964419064353174, + "grad_norm": 2.25024151802063, + "learning_rate": 6.588579795021963e-07, + "loss": 1.0985, + "mean_token_accuracy": 0.6752670407295227, + "num_tokens": 9318313.0, + "step": 361 + }, + { + "epoch": 0.0397540083461454, + "grad_norm": 2.0629775524139404, + "learning_rate": 6.606881405563691e-07, + "loss": 1.1917, + "mean_token_accuracy": 0.6534470915794373, + "num_tokens": 9350908.0, + "step": 362 + }, + { + "epoch": 0.03986382604875906, + "grad_norm": 2.4849209785461426, + "learning_rate": 6.625183016105418e-07, + "loss": 1.1007, + "mean_token_accuracy": 0.6837116479873657, + "num_tokens": 9372715.0, + "step": 363 + }, + { + "epoch": 0.039973643751372724, + "grad_norm": 2.248079299926758, + "learning_rate": 6.643484626647146e-07, + "loss": 1.0547, + "mean_token_accuracy": 0.6946510076522827, + "num_tokens": 9398991.0, + "step": 364 + }, + { + "epoch": 0.040083461453986384, + "grad_norm": 2.4313127994537354, + "learning_rate": 6.661786237188874e-07, + "loss": 1.0035, + "mean_token_accuracy": 0.702237606048584, + "num_tokens": 9423072.0, + "step": 365 + }, + { + "epoch": 0.040193279156600044, + "grad_norm": 2.42638897895813, + "learning_rate": 6.680087847730602e-07, + "loss": 1.1277, + "mean_token_accuracy": 0.669771671295166, + "num_tokens": 9447613.0, + "step": 366 + }, + { + "epoch": 0.040303096859213704, + "grad_norm": 2.3884847164154053, + "learning_rate": 6.698389458272328e-07, + "loss": 1.0643, + "mean_token_accuracy": 0.6895078420639038, + "num_tokens": 9473670.0, + "step": 367 + }, + { + "epoch": 0.040412914561827364, + "grad_norm": 2.474308729171753, + "learning_rate": 6.716691068814056e-07, + "loss": 1.1793, + "mean_token_accuracy": 0.6588496565818787, + "num_tokens": 9497934.0, + "step": 368 + }, + { + "epoch": 0.04052273226444103, + "grad_norm": 2.4043760299682617, + "learning_rate": 6.734992679355784e-07, + "loss": 1.2183, + "mean_token_accuracy": 0.6443287134170532, + "num_tokens": 9524251.0, + "step": 369 + }, + { + "epoch": 0.04063254996705469, + "grad_norm": 2.367671489715576, + "learning_rate": 6.753294289897512e-07, + "loss": 1.0689, + "mean_token_accuracy": 0.6825901865959167, + "num_tokens": 9548733.0, + "step": 370 + }, + { + "epoch": 0.04074236766966835, + "grad_norm": 2.3728814125061035, + "learning_rate": 6.771595900439239e-07, + "loss": 1.0393, + "mean_token_accuracy": 0.6941757202148438, + "num_tokens": 9573600.0, + "step": 371 + }, + { + "epoch": 0.04085218537228201, + "grad_norm": 2.5533041954040527, + "learning_rate": 6.789897510980966e-07, + "loss": 1.1274, + "mean_token_accuracy": 0.6652812957763672, + "num_tokens": 9596483.0, + "step": 372 + }, + { + "epoch": 0.04096200307489567, + "grad_norm": 2.2605650424957275, + "learning_rate": 6.808199121522694e-07, + "loss": 1.1302, + "mean_token_accuracy": 0.6722444295883179, + "num_tokens": 9624642.0, + "step": 373 + }, + { + "epoch": 0.04107182077750934, + "grad_norm": 2.0680344104766846, + "learning_rate": 6.826500732064422e-07, + "loss": 1.136, + "mean_token_accuracy": 0.6749445199966431, + "num_tokens": 9657400.0, + "step": 374 + }, + { + "epoch": 0.041181638480122996, + "grad_norm": 2.4306888580322266, + "learning_rate": 6.844802342606149e-07, + "loss": 1.0962, + "mean_token_accuracy": 0.6790273189544678, + "num_tokens": 9682326.0, + "step": 375 + }, + { + "epoch": 0.041291456182736656, + "grad_norm": 2.646214008331299, + "learning_rate": 6.863103953147877e-07, + "loss": 1.1088, + "mean_token_accuracy": 0.6741273403167725, + "num_tokens": 9705310.0, + "step": 376 + }, + { + "epoch": 0.041401273885350316, + "grad_norm": 2.287508487701416, + "learning_rate": 6.881405563689605e-07, + "loss": 1.1817, + "mean_token_accuracy": 0.6552755832672119, + "num_tokens": 9733007.0, + "step": 377 + }, + { + "epoch": 0.04151109158796398, + "grad_norm": 2.197859048843384, + "learning_rate": 6.899707174231333e-07, + "loss": 1.0879, + "mean_token_accuracy": 0.681107759475708, + "num_tokens": 9762345.0, + "step": 378 + }, + { + "epoch": 0.04162090929057764, + "grad_norm": 2.811227560043335, + "learning_rate": 6.91800878477306e-07, + "loss": 1.0607, + "mean_token_accuracy": 0.6898235082626343, + "num_tokens": 9782404.0, + "step": 379 + }, + { + "epoch": 0.0417307269931913, + "grad_norm": 2.324465036392212, + "learning_rate": 6.936310395314788e-07, + "loss": 1.0836, + "mean_token_accuracy": 0.6796847581863403, + "num_tokens": 9809035.0, + "step": 380 + }, + { + "epoch": 0.04184054469580496, + "grad_norm": 2.3854684829711914, + "learning_rate": 6.954612005856516e-07, + "loss": 1.1716, + "mean_token_accuracy": 0.657896101474762, + "num_tokens": 9834926.0, + "step": 381 + }, + { + "epoch": 0.04195036239841862, + "grad_norm": 2.1749727725982666, + "learning_rate": 6.972913616398244e-07, + "loss": 1.0984, + "mean_token_accuracy": 0.6780186295509338, + "num_tokens": 9864984.0, + "step": 382 + }, + { + "epoch": 0.04206018010103229, + "grad_norm": 2.156686782836914, + "learning_rate": 6.991215226939971e-07, + "loss": 1.2215, + "mean_token_accuracy": 0.6638263463973999, + "num_tokens": 9897508.0, + "step": 383 + }, + { + "epoch": 0.04216999780364595, + "grad_norm": 2.399675130844116, + "learning_rate": 7.009516837481699e-07, + "loss": 1.2022, + "mean_token_accuracy": 0.6485671401023865, + "num_tokens": 9923716.0, + "step": 384 + }, + { + "epoch": 0.04227981550625961, + "grad_norm": 2.4234166145324707, + "learning_rate": 7.027818448023427e-07, + "loss": 1.0325, + "mean_token_accuracy": 0.6967921853065491, + "num_tokens": 9946619.0, + "step": 385 + }, + { + "epoch": 0.04238963320887327, + "grad_norm": 2.1480159759521484, + "learning_rate": 7.046120058565155e-07, + "loss": 1.0721, + "mean_token_accuracy": 0.6810526251792908, + "num_tokens": 9976400.0, + "step": 386 + }, + { + "epoch": 0.04249945091148693, + "grad_norm": 2.5611767768859863, + "learning_rate": 7.064421669106881e-07, + "loss": 1.042, + "mean_token_accuracy": 0.6858223676681519, + "num_tokens": 9997854.0, + "step": 387 + }, + { + "epoch": 0.042609268614100596, + "grad_norm": 1.988158106803894, + "learning_rate": 7.08272327964861e-07, + "loss": 1.0713, + "mean_token_accuracy": 0.6809245347976685, + "num_tokens": 10030235.0, + "step": 388 + }, + { + "epoch": 0.042719086316714255, + "grad_norm": 2.3250749111175537, + "learning_rate": 7.101024890190338e-07, + "loss": 1.0096, + "mean_token_accuracy": 0.6973642110824585, + "num_tokens": 10056011.0, + "step": 389 + }, + { + "epoch": 0.042828904019327915, + "grad_norm": 2.2468435764312744, + "learning_rate": 7.119326500732066e-07, + "loss": 1.1006, + "mean_token_accuracy": 0.6809842586517334, + "num_tokens": 10085788.0, + "step": 390 + }, + { + "epoch": 0.042938721721941575, + "grad_norm": 2.067235231399536, + "learning_rate": 7.137628111273792e-07, + "loss": 1.0699, + "mean_token_accuracy": 0.6850781440734863, + "num_tokens": 10118569.0, + "step": 391 + }, + { + "epoch": 0.043048539424555235, + "grad_norm": 2.55416202545166, + "learning_rate": 7.15592972181552e-07, + "loss": 1.1577, + "mean_token_accuracy": 0.6662168502807617, + "num_tokens": 10142142.0, + "step": 392 + }, + { + "epoch": 0.0431583571271689, + "grad_norm": 2.1549947261810303, + "learning_rate": 7.174231332357248e-07, + "loss": 1.0889, + "mean_token_accuracy": 0.6831209063529968, + "num_tokens": 10171706.0, + "step": 393 + }, + { + "epoch": 0.04326817482978256, + "grad_norm": 2.6138100624084473, + "learning_rate": 7.192532942898976e-07, + "loss": 1.0534, + "mean_token_accuracy": 0.6877411007881165, + "num_tokens": 10193907.0, + "step": 394 + }, + { + "epoch": 0.04337799253239622, + "grad_norm": 2.3712735176086426, + "learning_rate": 7.210834553440703e-07, + "loss": 1.1369, + "mean_token_accuracy": 0.6719613075256348, + "num_tokens": 10219556.0, + "step": 395 + }, + { + "epoch": 0.04348781023500988, + "grad_norm": 2.342400074005127, + "learning_rate": 7.229136163982431e-07, + "loss": 1.1976, + "mean_token_accuracy": 0.6585458517074585, + "num_tokens": 10246794.0, + "step": 396 + }, + { + "epoch": 0.04359762793762355, + "grad_norm": 2.0598580837249756, + "learning_rate": 7.247437774524159e-07, + "loss": 1.0797, + "mean_token_accuracy": 0.6864933371543884, + "num_tokens": 10279066.0, + "step": 397 + }, + { + "epoch": 0.04370744564023721, + "grad_norm": 2.41452693939209, + "learning_rate": 7.265739385065887e-07, + "loss": 1.0014, + "mean_token_accuracy": 0.6993433237075806, + "num_tokens": 10302501.0, + "step": 398 + }, + { + "epoch": 0.04381726334285087, + "grad_norm": 2.6056783199310303, + "learning_rate": 7.284040995607614e-07, + "loss": 1.1424, + "mean_token_accuracy": 0.6676542162895203, + "num_tokens": 10328438.0, + "step": 399 + }, + { + "epoch": 0.04392708104546453, + "grad_norm": 2.436455726623535, + "learning_rate": 7.302342606149342e-07, + "loss": 1.0874, + "mean_token_accuracy": 0.6881359815597534, + "num_tokens": 10353257.0, + "step": 400 + }, + { + "epoch": 0.04403689874807819, + "grad_norm": 2.1680002212524414, + "learning_rate": 7.32064421669107e-07, + "loss": 1.118, + "mean_token_accuracy": 0.6730912923812866, + "num_tokens": 10382297.0, + "step": 401 + }, + { + "epoch": 0.044146716450691854, + "grad_norm": 2.5881547927856445, + "learning_rate": 7.338945827232798e-07, + "loss": 1.0578, + "mean_token_accuracy": 0.684780478477478, + "num_tokens": 10404730.0, + "step": 402 + }, + { + "epoch": 0.044256534153305514, + "grad_norm": 2.6346917152404785, + "learning_rate": 7.357247437774524e-07, + "loss": 1.0713, + "mean_token_accuracy": 0.6951695084571838, + "num_tokens": 10426851.0, + "step": 403 + }, + { + "epoch": 0.044366351855919174, + "grad_norm": 2.6623973846435547, + "learning_rate": 7.375549048316252e-07, + "loss": 1.0275, + "mean_token_accuracy": 0.6920211315155029, + "num_tokens": 10449269.0, + "step": 404 + }, + { + "epoch": 0.044476169558532834, + "grad_norm": 2.776845693588257, + "learning_rate": 7.39385065885798e-07, + "loss": 1.0346, + "mean_token_accuracy": 0.7003248929977417, + "num_tokens": 10469689.0, + "step": 405 + }, + { + "epoch": 0.044585987261146494, + "grad_norm": 2.319643497467041, + "learning_rate": 7.412152269399708e-07, + "loss": 1.0676, + "mean_token_accuracy": 0.6852364540100098, + "num_tokens": 10496305.0, + "step": 406 + }, + { + "epoch": 0.04469580496376016, + "grad_norm": 2.444119691848755, + "learning_rate": 7.430453879941435e-07, + "loss": 1.1197, + "mean_token_accuracy": 0.6731023788452148, + "num_tokens": 10520795.0, + "step": 407 + }, + { + "epoch": 0.04480562266637382, + "grad_norm": 2.0257506370544434, + "learning_rate": 7.448755490483163e-07, + "loss": 1.0904, + "mean_token_accuracy": 0.6746888160705566, + "num_tokens": 10554624.0, + "step": 408 + }, + { + "epoch": 0.04491544036898748, + "grad_norm": 2.4924018383026123, + "learning_rate": 7.467057101024891e-07, + "loss": 1.1384, + "mean_token_accuracy": 0.6729732751846313, + "num_tokens": 10585236.0, + "step": 409 + }, + { + "epoch": 0.04502525807160114, + "grad_norm": 2.311272144317627, + "learning_rate": 7.485358711566619e-07, + "loss": 1.1911, + "mean_token_accuracy": 0.6518462300300598, + "num_tokens": 10613824.0, + "step": 410 + }, + { + "epoch": 0.0451350757742148, + "grad_norm": 2.3119423389434814, + "learning_rate": 7.503660322108345e-07, + "loss": 1.1103, + "mean_token_accuracy": 0.6722153425216675, + "num_tokens": 10642189.0, + "step": 411 + }, + { + "epoch": 0.04524489347682847, + "grad_norm": 2.5204498767852783, + "learning_rate": 7.521961932650074e-07, + "loss": 1.0415, + "mean_token_accuracy": 0.6972141861915588, + "num_tokens": 10665418.0, + "step": 412 + }, + { + "epoch": 0.04535471117944213, + "grad_norm": 2.550140857696533, + "learning_rate": 7.540263543191802e-07, + "loss": 1.0155, + "mean_token_accuracy": 0.6946404576301575, + "num_tokens": 10685961.0, + "step": 413 + }, + { + "epoch": 0.04546452888205579, + "grad_norm": 2.17737078666687, + "learning_rate": 7.55856515373353e-07, + "loss": 1.0884, + "mean_token_accuracy": 0.6830883026123047, + "num_tokens": 10714354.0, + "step": 414 + }, + { + "epoch": 0.045574346584669446, + "grad_norm": 2.2130889892578125, + "learning_rate": 7.576866764275256e-07, + "loss": 1.0773, + "mean_token_accuracy": 0.6749380826950073, + "num_tokens": 10743284.0, + "step": 415 + }, + { + "epoch": 0.04568416428728311, + "grad_norm": 2.4113669395446777, + "learning_rate": 7.595168374816984e-07, + "loss": 1.1387, + "mean_token_accuracy": 0.6713839173316956, + "num_tokens": 10767965.0, + "step": 416 + }, + { + "epoch": 0.04579398198989677, + "grad_norm": 2.478952407836914, + "learning_rate": 7.613469985358712e-07, + "loss": 1.1021, + "mean_token_accuracy": 0.6763887405395508, + "num_tokens": 10791640.0, + "step": 417 + }, + { + "epoch": 0.04590379969251043, + "grad_norm": 2.003750801086426, + "learning_rate": 7.63177159590044e-07, + "loss": 1.133, + "mean_token_accuracy": 0.6598677635192871, + "num_tokens": 10824420.0, + "step": 418 + }, + { + "epoch": 0.04601361739512409, + "grad_norm": 2.451258420944214, + "learning_rate": 7.650073206442167e-07, + "loss": 1.1275, + "mean_token_accuracy": 0.6706487536430359, + "num_tokens": 10849147.0, + "step": 419 + }, + { + "epoch": 0.04612343509773775, + "grad_norm": 2.4669854640960693, + "learning_rate": 7.668374816983895e-07, + "loss": 1.1546, + "mean_token_accuracy": 0.6663594245910645, + "num_tokens": 10873539.0, + "step": 420 + }, + { + "epoch": 0.04623325280035142, + "grad_norm": 2.012784719467163, + "learning_rate": 7.686676427525623e-07, + "loss": 1.1025, + "mean_token_accuracy": 0.6738346815109253, + "num_tokens": 10903789.0, + "step": 421 + }, + { + "epoch": 0.04634307050296508, + "grad_norm": 2.2434284687042236, + "learning_rate": 7.704978038067351e-07, + "loss": 1.0104, + "mean_token_accuracy": 0.7024239301681519, + "num_tokens": 10930818.0, + "step": 422 + }, + { + "epoch": 0.04645288820557874, + "grad_norm": 2.656662702560425, + "learning_rate": 7.723279648609078e-07, + "loss": 1.0291, + "mean_token_accuracy": 0.6892585158348083, + "num_tokens": 10951451.0, + "step": 423 + }, + { + "epoch": 0.0465627059081924, + "grad_norm": 2.178145170211792, + "learning_rate": 7.741581259150806e-07, + "loss": 1.1348, + "mean_token_accuracy": 0.6732345819473267, + "num_tokens": 10982517.0, + "step": 424 + }, + { + "epoch": 0.04667252361080606, + "grad_norm": 2.312812566757202, + "learning_rate": 7.759882869692534e-07, + "loss": 1.0612, + "mean_token_accuracy": 0.6968604326248169, + "num_tokens": 11009255.0, + "step": 425 + }, + { + "epoch": 0.046782341313419726, + "grad_norm": 2.187943696975708, + "learning_rate": 7.778184480234262e-07, + "loss": 1.0105, + "mean_token_accuracy": 0.7011281251907349, + "num_tokens": 11035828.0, + "step": 426 + }, + { + "epoch": 0.046892159016033386, + "grad_norm": 2.3590660095214844, + "learning_rate": 7.796486090775989e-07, + "loss": 1.1321, + "mean_token_accuracy": 0.6625726222991943, + "num_tokens": 11061431.0, + "step": 427 + }, + { + "epoch": 0.047001976718647046, + "grad_norm": 2.1804513931274414, + "learning_rate": 7.814787701317717e-07, + "loss": 1.1751, + "mean_token_accuracy": 0.6604818105697632, + "num_tokens": 11093191.0, + "step": 428 + }, + { + "epoch": 0.047111794421260705, + "grad_norm": 2.2622103691101074, + "learning_rate": 7.833089311859445e-07, + "loss": 1.1037, + "mean_token_accuracy": 0.6854861378669739, + "num_tokens": 11121516.0, + "step": 429 + }, + { + "epoch": 0.047221612123874365, + "grad_norm": 2.665419816970825, + "learning_rate": 7.851390922401173e-07, + "loss": 1.1101, + "mean_token_accuracy": 0.68132483959198, + "num_tokens": 11143146.0, + "step": 430 + }, + { + "epoch": 0.04733142982648803, + "grad_norm": 2.423762798309326, + "learning_rate": 7.8696925329429e-07, + "loss": 0.9487, + "mean_token_accuracy": 0.7157238125801086, + "num_tokens": 11165896.0, + "step": 431 + }, + { + "epoch": 0.04744124752910169, + "grad_norm": 2.4921040534973145, + "learning_rate": 7.887994143484628e-07, + "loss": 1.0461, + "mean_token_accuracy": 0.6852471232414246, + "num_tokens": 11188411.0, + "step": 432 + }, + { + "epoch": 0.04755106523171535, + "grad_norm": 2.2887206077575684, + "learning_rate": 7.906295754026355e-07, + "loss": 1.013, + "mean_token_accuracy": 0.6961243152618408, + "num_tokens": 11215361.0, + "step": 433 + }, + { + "epoch": 0.04766088293432901, + "grad_norm": 2.239487409591675, + "learning_rate": 7.924597364568083e-07, + "loss": 0.9332, + "mean_token_accuracy": 0.7228529453277588, + "num_tokens": 11240727.0, + "step": 434 + }, + { + "epoch": 0.04777070063694268, + "grad_norm": 2.227579116821289, + "learning_rate": 7.942898975109809e-07, + "loss": 1.0727, + "mean_token_accuracy": 0.6805713772773743, + "num_tokens": 11268478.0, + "step": 435 + }, + { + "epoch": 0.04788051833955634, + "grad_norm": 2.4078176021575928, + "learning_rate": 7.961200585651538e-07, + "loss": 1.204, + "mean_token_accuracy": 0.6495325565338135, + "num_tokens": 11296410.0, + "step": 436 + }, + { + "epoch": 0.04799033604217, + "grad_norm": 2.219294786453247, + "learning_rate": 7.979502196193266e-07, + "loss": 1.1025, + "mean_token_accuracy": 0.6729497909545898, + "num_tokens": 11322536.0, + "step": 437 + }, + { + "epoch": 0.04810015374478366, + "grad_norm": 2.767568588256836, + "learning_rate": 7.997803806734994e-07, + "loss": 1.0266, + "mean_token_accuracy": 0.6916590929031372, + "num_tokens": 11342740.0, + "step": 438 + }, + { + "epoch": 0.04820997144739732, + "grad_norm": 2.031006336212158, + "learning_rate": 8.01610541727672e-07, + "loss": 1.1815, + "mean_token_accuracy": 0.654944658279419, + "num_tokens": 11377097.0, + "step": 439 + }, + { + "epoch": 0.048319789150010985, + "grad_norm": 2.37977933883667, + "learning_rate": 8.034407027818448e-07, + "loss": 1.0096, + "mean_token_accuracy": 0.6948659420013428, + "num_tokens": 11401260.0, + "step": 440 + }, + { + "epoch": 0.048429606852624645, + "grad_norm": 2.306346893310547, + "learning_rate": 8.052708638360176e-07, + "loss": 1.0515, + "mean_token_accuracy": 0.68597412109375, + "num_tokens": 11427074.0, + "step": 441 + }, + { + "epoch": 0.048539424555238304, + "grad_norm": 2.1941323280334473, + "learning_rate": 8.071010248901904e-07, + "loss": 1.0447, + "mean_token_accuracy": 0.6886955499649048, + "num_tokens": 11453282.0, + "step": 442 + }, + { + "epoch": 0.048649242257851964, + "grad_norm": 2.6786231994628906, + "learning_rate": 8.089311859443631e-07, + "loss": 1.0631, + "mean_token_accuracy": 0.6908342838287354, + "num_tokens": 11474952.0, + "step": 443 + }, + { + "epoch": 0.048759059960465624, + "grad_norm": 2.4417881965637207, + "learning_rate": 8.107613469985359e-07, + "loss": 1.0357, + "mean_token_accuracy": 0.6892253756523132, + "num_tokens": 11496639.0, + "step": 444 + }, + { + "epoch": 0.04886887766307929, + "grad_norm": 2.0501930713653564, + "learning_rate": 8.125915080527087e-07, + "loss": 1.0555, + "mean_token_accuracy": 0.697124719619751, + "num_tokens": 11527445.0, + "step": 445 + }, + { + "epoch": 0.04897869536569295, + "grad_norm": 2.4726667404174805, + "learning_rate": 8.144216691068815e-07, + "loss": 1.0236, + "mean_token_accuracy": 0.6905689835548401, + "num_tokens": 11550146.0, + "step": 446 + }, + { + "epoch": 0.04908851306830661, + "grad_norm": 2.360016107559204, + "learning_rate": 8.162518301610542e-07, + "loss": 1.1226, + "mean_token_accuracy": 0.6759240627288818, + "num_tokens": 11573236.0, + "step": 447 + }, + { + "epoch": 0.04919833077092027, + "grad_norm": 2.4480202198028564, + "learning_rate": 8.18081991215227e-07, + "loss": 1.0338, + "mean_token_accuracy": 0.6912286281585693, + "num_tokens": 11597031.0, + "step": 448 + }, + { + "epoch": 0.04930814847353393, + "grad_norm": 2.452129602432251, + "learning_rate": 8.199121522693998e-07, + "loss": 1.1097, + "mean_token_accuracy": 0.6803706884384155, + "num_tokens": 11620382.0, + "step": 449 + }, + { + "epoch": 0.0494179661761476, + "grad_norm": 2.4546241760253906, + "learning_rate": 8.217423133235726e-07, + "loss": 1.1301, + "mean_token_accuracy": 0.6700754165649414, + "num_tokens": 11644959.0, + "step": 450 + }, + { + "epoch": 0.04952778387876126, + "grad_norm": 2.103449821472168, + "learning_rate": 8.235724743777453e-07, + "loss": 1.0383, + "mean_token_accuracy": 0.6963422298431396, + "num_tokens": 11675430.0, + "step": 451 + }, + { + "epoch": 0.04963760158137492, + "grad_norm": 2.6957144737243652, + "learning_rate": 8.254026354319181e-07, + "loss": 0.9731, + "mean_token_accuracy": 0.707535982131958, + "num_tokens": 11695225.0, + "step": 452 + }, + { + "epoch": 0.04974741928398858, + "grad_norm": 2.2215824127197266, + "learning_rate": 8.272327964860909e-07, + "loss": 1.135, + "mean_token_accuracy": 0.6664400696754456, + "num_tokens": 11724815.0, + "step": 453 + }, + { + "epoch": 0.049857236986602244, + "grad_norm": 2.480435848236084, + "learning_rate": 8.290629575402637e-07, + "loss": 1.0992, + "mean_token_accuracy": 0.6696038246154785, + "num_tokens": 11747766.0, + "step": 454 + }, + { + "epoch": 0.0499670546892159, + "grad_norm": 2.2639882564544678, + "learning_rate": 8.308931185944364e-07, + "loss": 1.0639, + "mean_token_accuracy": 0.6966941356658936, + "num_tokens": 11776703.0, + "step": 455 + }, + { + "epoch": 0.05007687239182956, + "grad_norm": 2.3568944931030273, + "learning_rate": 8.327232796486092e-07, + "loss": 1.0644, + "mean_token_accuracy": 0.6841026544570923, + "num_tokens": 11801609.0, + "step": 456 + }, + { + "epoch": 0.05018669009444322, + "grad_norm": 2.3373489379882812, + "learning_rate": 8.34553440702782e-07, + "loss": 1.0688, + "mean_token_accuracy": 0.6841868162155151, + "num_tokens": 11827547.0, + "step": 457 + }, + { + "epoch": 0.05029650779705688, + "grad_norm": 2.785487651824951, + "learning_rate": 8.363836017569548e-07, + "loss": 1.0994, + "mean_token_accuracy": 0.6683647632598877, + "num_tokens": 11848127.0, + "step": 458 + }, + { + "epoch": 0.05040632549967055, + "grad_norm": 2.426093101501465, + "learning_rate": 8.382137628111274e-07, + "loss": 1.0879, + "mean_token_accuracy": 0.6770870089530945, + "num_tokens": 11873750.0, + "step": 459 + }, + { + "epoch": 0.05051614320228421, + "grad_norm": 2.430462598800659, + "learning_rate": 8.400439238653002e-07, + "loss": 1.1282, + "mean_token_accuracy": 0.679734468460083, + "num_tokens": 11897127.0, + "step": 460 + }, + { + "epoch": 0.05062596090489787, + "grad_norm": 2.472614288330078, + "learning_rate": 8.418740849194731e-07, + "loss": 1.083, + "mean_token_accuracy": 0.688736617565155, + "num_tokens": 11923045.0, + "step": 461 + }, + { + "epoch": 0.05073577860751153, + "grad_norm": 2.4930102825164795, + "learning_rate": 8.437042459736459e-07, + "loss": 1.0359, + "mean_token_accuracy": 0.6916134357452393, + "num_tokens": 11945577.0, + "step": 462 + }, + { + "epoch": 0.05084559631012519, + "grad_norm": 2.457786798477173, + "learning_rate": 8.455344070278184e-07, + "loss": 1.1066, + "mean_token_accuracy": 0.6667162179946899, + "num_tokens": 11968652.0, + "step": 463 + }, + { + "epoch": 0.050955414012738856, + "grad_norm": 2.1794824600219727, + "learning_rate": 8.473645680819912e-07, + "loss": 1.103, + "mean_token_accuracy": 0.675558865070343, + "num_tokens": 11996650.0, + "step": 464 + }, + { + "epoch": 0.051065231715352516, + "grad_norm": 2.106133222579956, + "learning_rate": 8.49194729136164e-07, + "loss": 1.1052, + "mean_token_accuracy": 0.678864598274231, + "num_tokens": 12026390.0, + "step": 465 + }, + { + "epoch": 0.051175049417966176, + "grad_norm": 2.563131093978882, + "learning_rate": 8.510248901903368e-07, + "loss": 1.081, + "mean_token_accuracy": 0.6778016686439514, + "num_tokens": 12047975.0, + "step": 466 + }, + { + "epoch": 0.051284867120579836, + "grad_norm": 2.5808815956115723, + "learning_rate": 8.528550512445095e-07, + "loss": 1.0948, + "mean_token_accuracy": 0.6770312786102295, + "num_tokens": 12071879.0, + "step": 467 + }, + { + "epoch": 0.051394684823193496, + "grad_norm": 2.537433385848999, + "learning_rate": 8.546852122986823e-07, + "loss": 0.9991, + "mean_token_accuracy": 0.699072539806366, + "num_tokens": 12093955.0, + "step": 468 + }, + { + "epoch": 0.05150450252580716, + "grad_norm": 2.3299720287323, + "learning_rate": 8.565153733528551e-07, + "loss": 1.0781, + "mean_token_accuracy": 0.6808720231056213, + "num_tokens": 12120359.0, + "step": 469 + }, + { + "epoch": 0.05161432022842082, + "grad_norm": 2.2750895023345947, + "learning_rate": 8.583455344070279e-07, + "loss": 1.0097, + "mean_token_accuracy": 0.6971243023872375, + "num_tokens": 12146962.0, + "step": 470 + }, + { + "epoch": 0.05172413793103448, + "grad_norm": 2.3760311603546143, + "learning_rate": 8.601756954612006e-07, + "loss": 0.9831, + "mean_token_accuracy": 0.7072480320930481, + "num_tokens": 12171671.0, + "step": 471 + }, + { + "epoch": 0.05183395563364814, + "grad_norm": 2.4409334659576416, + "learning_rate": 8.620058565153734e-07, + "loss": 1.0345, + "mean_token_accuracy": 0.6974554061889648, + "num_tokens": 12195939.0, + "step": 472 + }, + { + "epoch": 0.05194377333626181, + "grad_norm": 2.379241466522217, + "learning_rate": 8.638360175695462e-07, + "loss": 1.0551, + "mean_token_accuracy": 0.695165753364563, + "num_tokens": 12221794.0, + "step": 473 + }, + { + "epoch": 0.05205359103887547, + "grad_norm": 2.4400577545166016, + "learning_rate": 8.65666178623719e-07, + "loss": 1.1089, + "mean_token_accuracy": 0.6821444630622864, + "num_tokens": 12247901.0, + "step": 474 + }, + { + "epoch": 0.05216340874148913, + "grad_norm": 2.4002065658569336, + "learning_rate": 8.674963396778917e-07, + "loss": 1.0746, + "mean_token_accuracy": 0.6822750568389893, + "num_tokens": 12272743.0, + "step": 475 + }, + { + "epoch": 0.05227322644410279, + "grad_norm": 2.367417335510254, + "learning_rate": 8.693265007320645e-07, + "loss": 1.091, + "mean_token_accuracy": 0.6759639978408813, + "num_tokens": 12299420.0, + "step": 476 + }, + { + "epoch": 0.05238304414671645, + "grad_norm": 2.2487051486968994, + "learning_rate": 8.711566617862373e-07, + "loss": 1.1474, + "mean_token_accuracy": 0.6690199375152588, + "num_tokens": 12329906.0, + "step": 477 + }, + { + "epoch": 0.052492861849330115, + "grad_norm": 2.0765514373779297, + "learning_rate": 8.729868228404101e-07, + "loss": 1.1247, + "mean_token_accuracy": 0.6719833612442017, + "num_tokens": 12361140.0, + "step": 478 + }, + { + "epoch": 0.052602679551943775, + "grad_norm": 2.3052175045013428, + "learning_rate": 8.748169838945828e-07, + "loss": 1.1092, + "mean_token_accuracy": 0.6703672409057617, + "num_tokens": 12386267.0, + "step": 479 + }, + { + "epoch": 0.052712497254557435, + "grad_norm": 2.2637696266174316, + "learning_rate": 8.766471449487556e-07, + "loss": 1.079, + "mean_token_accuracy": 0.6766599416732788, + "num_tokens": 12414351.0, + "step": 480 + }, + { + "epoch": 0.052822314957171095, + "grad_norm": 2.0812618732452393, + "learning_rate": 8.784773060029284e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.6912822723388672, + "num_tokens": 12445465.0, + "step": 481 + }, + { + "epoch": 0.052932132659784754, + "grad_norm": 2.302004098892212, + "learning_rate": 8.803074670571012e-07, + "loss": 1.028, + "mean_token_accuracy": 0.6982212066650391, + "num_tokens": 12474213.0, + "step": 482 + }, + { + "epoch": 0.05304195036239842, + "grad_norm": 2.548229694366455, + "learning_rate": 8.821376281112738e-07, + "loss": 1.1101, + "mean_token_accuracy": 0.6755428314208984, + "num_tokens": 12498889.0, + "step": 483 + }, + { + "epoch": 0.05315176806501208, + "grad_norm": 2.381981134414673, + "learning_rate": 8.839677891654466e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.6827269792556763, + "num_tokens": 12524091.0, + "step": 484 + }, + { + "epoch": 0.05326158576762574, + "grad_norm": 2.1898601055145264, + "learning_rate": 8.857979502196195e-07, + "loss": 1.1345, + "mean_token_accuracy": 0.667945921421051, + "num_tokens": 12552546.0, + "step": 485 + }, + { + "epoch": 0.0533714034702394, + "grad_norm": 1.9211965799331665, + "learning_rate": 8.876281112737923e-07, + "loss": 1.1148, + "mean_token_accuracy": 0.6730089783668518, + "num_tokens": 12589818.0, + "step": 486 + }, + { + "epoch": 0.05348122117285306, + "grad_norm": 2.0800833702087402, + "learning_rate": 8.894582723279649e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.6701524257659912, + "num_tokens": 12621421.0, + "step": 487 + }, + { + "epoch": 0.05359103887546673, + "grad_norm": 2.480797052383423, + "learning_rate": 8.912884333821377e-07, + "loss": 1.0988, + "mean_token_accuracy": 0.6779568195343018, + "num_tokens": 12644677.0, + "step": 488 + }, + { + "epoch": 0.05370085657808039, + "grad_norm": 2.054351806640625, + "learning_rate": 8.931185944363105e-07, + "loss": 0.9907, + "mean_token_accuracy": 0.7089100480079651, + "num_tokens": 12676152.0, + "step": 489 + }, + { + "epoch": 0.05381067428069405, + "grad_norm": 2.9477877616882324, + "learning_rate": 8.949487554904833e-07, + "loss": 0.9496, + "mean_token_accuracy": 0.7123329043388367, + "num_tokens": 12692842.0, + "step": 490 + }, + { + "epoch": 0.05392049198330771, + "grad_norm": 2.335984945297241, + "learning_rate": 8.96778916544656e-07, + "loss": 1.1606, + "mean_token_accuracy": 0.6610181331634521, + "num_tokens": 12720372.0, + "step": 491 + }, + { + "epoch": 0.054030309685921374, + "grad_norm": 2.502277374267578, + "learning_rate": 8.986090775988287e-07, + "loss": 1.0518, + "mean_token_accuracy": 0.6927993297576904, + "num_tokens": 12742895.0, + "step": 492 + }, + { + "epoch": 0.054140127388535034, + "grad_norm": 2.4229743480682373, + "learning_rate": 9.004392386530015e-07, + "loss": 1.0536, + "mean_token_accuracy": 0.6882519721984863, + "num_tokens": 12767902.0, + "step": 493 + }, + { + "epoch": 0.054249945091148694, + "grad_norm": 2.2712790966033936, + "learning_rate": 9.022693997071743e-07, + "loss": 1.0249, + "mean_token_accuracy": 0.700810432434082, + "num_tokens": 12794214.0, + "step": 494 + }, + { + "epoch": 0.05435976279376235, + "grad_norm": 2.1891653537750244, + "learning_rate": 9.04099560761347e-07, + "loss": 1.1439, + "mean_token_accuracy": 0.6744287014007568, + "num_tokens": 12821971.0, + "step": 495 + }, + { + "epoch": 0.05446958049637601, + "grad_norm": 2.1522819995880127, + "learning_rate": 9.059297218155198e-07, + "loss": 1.1669, + "mean_token_accuracy": 0.6607288122177124, + "num_tokens": 12853897.0, + "step": 496 + }, + { + "epoch": 0.05457939819898968, + "grad_norm": 2.5067620277404785, + "learning_rate": 9.077598828696926e-07, + "loss": 1.0217, + "mean_token_accuracy": 0.6906266212463379, + "num_tokens": 12876006.0, + "step": 497 + }, + { + "epoch": 0.05468921590160334, + "grad_norm": 2.4163544178009033, + "learning_rate": 9.095900439238654e-07, + "loss": 1.09, + "mean_token_accuracy": 0.688804566860199, + "num_tokens": 12900094.0, + "step": 498 + }, + { + "epoch": 0.054799033604217, + "grad_norm": 2.271209239959717, + "learning_rate": 9.114202049780381e-07, + "loss": 0.9576, + "mean_token_accuracy": 0.7138667106628418, + "num_tokens": 12924420.0, + "step": 499 + }, + { + "epoch": 0.05490885130683066, + "grad_norm": 2.353187084197998, + "learning_rate": 9.132503660322109e-07, + "loss": 1.0466, + "mean_token_accuracy": 0.6890736818313599, + "num_tokens": 12950995.0, + "step": 500 + }, + { + "epoch": 0.05501866900944432, + "grad_norm": 2.1136796474456787, + "learning_rate": 9.150805270863837e-07, + "loss": 1.0748, + "mean_token_accuracy": 0.6824820041656494, + "num_tokens": 12979397.0, + "step": 501 + }, + { + "epoch": 0.055128486712057986, + "grad_norm": 2.5233702659606934, + "learning_rate": 9.169106881405565e-07, + "loss": 0.9821, + "mean_token_accuracy": 0.7040151357650757, + "num_tokens": 13000877.0, + "step": 502 + }, + { + "epoch": 0.055238304414671646, + "grad_norm": 2.3484818935394287, + "learning_rate": 9.187408491947292e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.690688967704773, + "num_tokens": 13026131.0, + "step": 503 + }, + { + "epoch": 0.055348122117285306, + "grad_norm": 2.6002278327941895, + "learning_rate": 9.20571010248902e-07, + "loss": 1.0193, + "mean_token_accuracy": 0.6959149837493896, + "num_tokens": 13047908.0, + "step": 504 + }, + { + "epoch": 0.055457939819898966, + "grad_norm": 2.107231855392456, + "learning_rate": 9.224011713030748e-07, + "loss": 1.0935, + "mean_token_accuracy": 0.6716436743736267, + "num_tokens": 13079339.0, + "step": 505 + }, + { + "epoch": 0.055567757522512626, + "grad_norm": 2.4584500789642334, + "learning_rate": 9.242313323572476e-07, + "loss": 1.0703, + "mean_token_accuracy": 0.6846218109130859, + "num_tokens": 13102358.0, + "step": 506 + }, + { + "epoch": 0.05567757522512629, + "grad_norm": 2.426217794418335, + "learning_rate": 9.260614934114202e-07, + "loss": 1.0483, + "mean_token_accuracy": 0.6950480937957764, + "num_tokens": 13127164.0, + "step": 507 + }, + { + "epoch": 0.05578739292773995, + "grad_norm": 2.536419630050659, + "learning_rate": 9.27891654465593e-07, + "loss": 1.0071, + "mean_token_accuracy": 0.6962265372276306, + "num_tokens": 13150079.0, + "step": 508 + }, + { + "epoch": 0.05589721063035361, + "grad_norm": 2.281519889831543, + "learning_rate": 9.297218155197658e-07, + "loss": 1.1234, + "mean_token_accuracy": 0.6698265075683594, + "num_tokens": 13176151.0, + "step": 509 + }, + { + "epoch": 0.05600702833296727, + "grad_norm": 2.663844585418701, + "learning_rate": 9.315519765739387e-07, + "loss": 0.9676, + "mean_token_accuracy": 0.7082340717315674, + "num_tokens": 13196222.0, + "step": 510 + }, + { + "epoch": 0.05611684603558094, + "grad_norm": 2.5742669105529785, + "learning_rate": 9.333821376281113e-07, + "loss": 1.1581, + "mean_token_accuracy": 0.6644153594970703, + "num_tokens": 13219683.0, + "step": 511 + }, + { + "epoch": 0.0562266637381946, + "grad_norm": 2.289337635040283, + "learning_rate": 9.352122986822841e-07, + "loss": 1.146, + "mean_token_accuracy": 0.6675143241882324, + "num_tokens": 13246819.0, + "step": 512 + }, + { + "epoch": 0.05633648144080826, + "grad_norm": 2.3044795989990234, + "learning_rate": 9.370424597364569e-07, + "loss": 1.1677, + "mean_token_accuracy": 0.6627034544944763, + "num_tokens": 13274299.0, + "step": 513 + }, + { + "epoch": 0.05644629914342192, + "grad_norm": 2.5720787048339844, + "learning_rate": 9.388726207906297e-07, + "loss": 0.9635, + "mean_token_accuracy": 0.71407151222229, + "num_tokens": 13295274.0, + "step": 514 + }, + { + "epoch": 0.05655611684603558, + "grad_norm": 2.057445526123047, + "learning_rate": 9.407027818448024e-07, + "loss": 1.1376, + "mean_token_accuracy": 0.6665011644363403, + "num_tokens": 13333062.0, + "step": 515 + }, + { + "epoch": 0.056665934548649245, + "grad_norm": 2.558530330657959, + "learning_rate": 9.425329428989752e-07, + "loss": 1.0669, + "mean_token_accuracy": 0.6820436716079712, + "num_tokens": 13354709.0, + "step": 516 + }, + { + "epoch": 0.056775752251262905, + "grad_norm": 2.343456983566284, + "learning_rate": 9.44363103953148e-07, + "loss": 1.058, + "mean_token_accuracy": 0.6874727606773376, + "num_tokens": 13379171.0, + "step": 517 + }, + { + "epoch": 0.056885569953876565, + "grad_norm": 2.5755414962768555, + "learning_rate": 9.461932650073208e-07, + "loss": 1.1413, + "mean_token_accuracy": 0.6640324592590332, + "num_tokens": 13401491.0, + "step": 518 + }, + { + "epoch": 0.056995387656490225, + "grad_norm": 2.240786075592041, + "learning_rate": 9.480234260614935e-07, + "loss": 1.1596, + "mean_token_accuracy": 0.6674482226371765, + "num_tokens": 13429585.0, + "step": 519 + }, + { + "epoch": 0.057105205359103885, + "grad_norm": 2.3950209617614746, + "learning_rate": 9.498535871156663e-07, + "loss": 1.1002, + "mean_token_accuracy": 0.6733884215354919, + "num_tokens": 13452157.0, + "step": 520 + }, + { + "epoch": 0.05721502306171755, + "grad_norm": 2.315957546234131, + "learning_rate": 9.51683748169839e-07, + "loss": 1.0223, + "mean_token_accuracy": 0.6889358758926392, + "num_tokens": 13477964.0, + "step": 521 + }, + { + "epoch": 0.05732484076433121, + "grad_norm": 2.36757493019104, + "learning_rate": 9.535139092240119e-07, + "loss": 1.0685, + "mean_token_accuracy": 0.6789036393165588, + "num_tokens": 13503627.0, + "step": 522 + }, + { + "epoch": 0.05743465846694487, + "grad_norm": 2.2670795917510986, + "learning_rate": 9.553440702781844e-07, + "loss": 1.1338, + "mean_token_accuracy": 0.6657978296279907, + "num_tokens": 13533113.0, + "step": 523 + }, + { + "epoch": 0.05754447616955853, + "grad_norm": 1.979170322418213, + "learning_rate": 9.571742313323572e-07, + "loss": 1.055, + "mean_token_accuracy": 0.6864005923271179, + "num_tokens": 13568620.0, + "step": 524 + }, + { + "epoch": 0.05765429387217219, + "grad_norm": 2.2586748600006104, + "learning_rate": 9.5900439238653e-07, + "loss": 1.0011, + "mean_token_accuracy": 0.7040588855743408, + "num_tokens": 13594140.0, + "step": 525 + }, + { + "epoch": 0.05776411157478586, + "grad_norm": 2.4263508319854736, + "learning_rate": 9.608345534407028e-07, + "loss": 1.1415, + "mean_token_accuracy": 0.6726782917976379, + "num_tokens": 13619469.0, + "step": 526 + }, + { + "epoch": 0.05787392927739952, + "grad_norm": 2.392857551574707, + "learning_rate": 9.626647144948756e-07, + "loss": 1.0349, + "mean_token_accuracy": 0.6886462569236755, + "num_tokens": 13641658.0, + "step": 527 + }, + { + "epoch": 0.05798374698001318, + "grad_norm": 2.3156659603118896, + "learning_rate": 9.644948755490484e-07, + "loss": 1.0251, + "mean_token_accuracy": 0.6866095662117004, + "num_tokens": 13668134.0, + "step": 528 + }, + { + "epoch": 0.05809356468262684, + "grad_norm": 2.3586266040802, + "learning_rate": 9.663250366032212e-07, + "loss": 1.0364, + "mean_token_accuracy": 0.6946056485176086, + "num_tokens": 13694346.0, + "step": 529 + }, + { + "epoch": 0.058203382385240504, + "grad_norm": 2.371295213699341, + "learning_rate": 9.68155197657394e-07, + "loss": 1.0349, + "mean_token_accuracy": 0.6894857883453369, + "num_tokens": 13719031.0, + "step": 530 + }, + { + "epoch": 0.058313200087854164, + "grad_norm": 2.1729960441589355, + "learning_rate": 9.699853587115666e-07, + "loss": 1.0262, + "mean_token_accuracy": 0.6937713027000427, + "num_tokens": 13747255.0, + "step": 531 + }, + { + "epoch": 0.058423017790467824, + "grad_norm": 2.7413504123687744, + "learning_rate": 9.718155197657394e-07, + "loss": 1.1392, + "mean_token_accuracy": 0.675663411617279, + "num_tokens": 13767474.0, + "step": 532 + }, + { + "epoch": 0.058532835493081484, + "grad_norm": 2.166494369506836, + "learning_rate": 9.736456808199122e-07, + "loss": 1.0293, + "mean_token_accuracy": 0.6915645003318787, + "num_tokens": 13794671.0, + "step": 533 + }, + { + "epoch": 0.058642653195695144, + "grad_norm": 2.242465019226074, + "learning_rate": 9.75475841874085e-07, + "loss": 1.0536, + "mean_token_accuracy": 0.6885303854942322, + "num_tokens": 13820702.0, + "step": 534 + }, + { + "epoch": 0.05875247089830881, + "grad_norm": 2.356691360473633, + "learning_rate": 9.773060029282578e-07, + "loss": 1.0345, + "mean_token_accuracy": 0.6956143379211426, + "num_tokens": 13842443.0, + "step": 535 + }, + { + "epoch": 0.05886228860092247, + "grad_norm": 2.1159632205963135, + "learning_rate": 9.791361639824306e-07, + "loss": 1.044, + "mean_token_accuracy": 0.6880069971084595, + "num_tokens": 13870245.0, + "step": 536 + }, + { + "epoch": 0.05897210630353613, + "grad_norm": 2.3812553882598877, + "learning_rate": 9.809663250366034e-07, + "loss": 1.0773, + "mean_token_accuracy": 0.6810154914855957, + "num_tokens": 13893142.0, + "step": 537 + }, + { + "epoch": 0.05908192400614979, + "grad_norm": 2.0252134799957275, + "learning_rate": 9.827964860907762e-07, + "loss": 1.0097, + "mean_token_accuracy": 0.6940951347351074, + "num_tokens": 13923802.0, + "step": 538 + }, + { + "epoch": 0.05919174170876345, + "grad_norm": 2.413344383239746, + "learning_rate": 9.846266471449488e-07, + "loss": 0.9927, + "mean_token_accuracy": 0.6974400877952576, + "num_tokens": 13947518.0, + "step": 539 + }, + { + "epoch": 0.05930155941137712, + "grad_norm": 2.301760673522949, + "learning_rate": 9.864568081991216e-07, + "loss": 1.1463, + "mean_token_accuracy": 0.665134072303772, + "num_tokens": 13972457.0, + "step": 540 + }, + { + "epoch": 0.059411377113990776, + "grad_norm": 2.2416555881500244, + "learning_rate": 9.882869692532944e-07, + "loss": 1.0736, + "mean_token_accuracy": 0.6884483695030212, + "num_tokens": 13999527.0, + "step": 541 + }, + { + "epoch": 0.059521194816604436, + "grad_norm": 2.4488906860351562, + "learning_rate": 9.901171303074672e-07, + "loss": 1.0826, + "mean_token_accuracy": 0.6837432980537415, + "num_tokens": 14024171.0, + "step": 542 + }, + { + "epoch": 0.059631012519218096, + "grad_norm": 2.317229747772217, + "learning_rate": 9.919472913616398e-07, + "loss": 1.0638, + "mean_token_accuracy": 0.693930447101593, + "num_tokens": 14050476.0, + "step": 543 + }, + { + "epoch": 0.059740830221831756, + "grad_norm": 2.156261444091797, + "learning_rate": 9.937774524158126e-07, + "loss": 1.0779, + "mean_token_accuracy": 0.6833745837211609, + "num_tokens": 14077677.0, + "step": 544 + }, + { + "epoch": 0.05985064792444542, + "grad_norm": 2.092066526412964, + "learning_rate": 9.956076134699854e-07, + "loss": 1.2147, + "mean_token_accuracy": 0.657728910446167, + "num_tokens": 14108758.0, + "step": 545 + }, + { + "epoch": 0.05996046562705908, + "grad_norm": 2.118772029876709, + "learning_rate": 9.974377745241582e-07, + "loss": 1.1562, + "mean_token_accuracy": 0.6621192097663879, + "num_tokens": 14143028.0, + "step": 546 + }, + { + "epoch": 0.06007028332967274, + "grad_norm": 2.1120800971984863, + "learning_rate": 9.99267935578331e-07, + "loss": 1.0756, + "mean_token_accuracy": 0.6861885190010071, + "num_tokens": 14172920.0, + "step": 547 + }, + { + "epoch": 0.0601801010322864, + "grad_norm": 2.3426530361175537, + "learning_rate": 1.0010980966325038e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.6831667423248291, + "num_tokens": 14197450.0, + "step": 548 + }, + { + "epoch": 0.06028991873490007, + "grad_norm": 2.0836312770843506, + "learning_rate": 1.0029282576866766e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6889939308166504, + "num_tokens": 14230957.0, + "step": 549 + }, + { + "epoch": 0.06039973643751373, + "grad_norm": 2.2850441932678223, + "learning_rate": 1.0047584187408494e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7041582465171814, + "num_tokens": 14254647.0, + "step": 550 + }, + { + "epoch": 0.06050955414012739, + "grad_norm": 2.341621160507202, + "learning_rate": 1.006588579795022e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6851842403411865, + "num_tokens": 14280259.0, + "step": 551 + }, + { + "epoch": 0.06061937184274105, + "grad_norm": 2.892702102661133, + "learning_rate": 1.0084187408491947e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6896889805793762, + "num_tokens": 14298773.0, + "step": 552 + }, + { + "epoch": 0.06072918954535471, + "grad_norm": 2.2887022495269775, + "learning_rate": 1.0102489019033675e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6991175413131714, + "num_tokens": 14325248.0, + "step": 553 + }, + { + "epoch": 0.060839007247968376, + "grad_norm": 2.3086650371551514, + "learning_rate": 1.0120790629575403e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6787078976631165, + "num_tokens": 14349600.0, + "step": 554 + }, + { + "epoch": 0.060948824950582035, + "grad_norm": 2.236985921859741, + "learning_rate": 1.0139092240117131e-06, + "loss": 1.097, + "mean_token_accuracy": 0.674027681350708, + "num_tokens": 14377694.0, + "step": 555 + }, + { + "epoch": 0.061058642653195695, + "grad_norm": 2.065560817718506, + "learning_rate": 1.015739385065886e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6934404373168945, + "num_tokens": 14407413.0, + "step": 556 + }, + { + "epoch": 0.061168460355809355, + "grad_norm": 2.3350422382354736, + "learning_rate": 1.0175695461200587e-06, + "loss": 1.0986, + "mean_token_accuracy": 0.6708605289459229, + "num_tokens": 14434811.0, + "step": 557 + }, + { + "epoch": 0.061278278058423015, + "grad_norm": 2.5769755840301514, + "learning_rate": 1.0193997071742315e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6854250431060791, + "num_tokens": 14456596.0, + "step": 558 + }, + { + "epoch": 0.06138809576103668, + "grad_norm": 2.2464494705200195, + "learning_rate": 1.0212298682284041e-06, + "loss": 1.1063, + "mean_token_accuracy": 0.6701536774635315, + "num_tokens": 14482330.0, + "step": 559 + }, + { + "epoch": 0.06149791346365034, + "grad_norm": 2.5889952182769775, + "learning_rate": 1.023060029282577e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.708965539932251, + "num_tokens": 14502863.0, + "step": 560 + }, + { + "epoch": 0.061607731166264, + "grad_norm": 2.0341341495513916, + "learning_rate": 1.0248901903367497e-06, + "loss": 1.0896, + "mean_token_accuracy": 0.6862695813179016, + "num_tokens": 14535166.0, + "step": 561 + }, + { + "epoch": 0.06171754886887766, + "grad_norm": 2.2777678966522217, + "learning_rate": 1.0267203513909225e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6854784488677979, + "num_tokens": 14560208.0, + "step": 562 + }, + { + "epoch": 0.06182736657149132, + "grad_norm": 2.314115047454834, + "learning_rate": 1.0285505124450953e-06, + "loss": 1.1151, + "mean_token_accuracy": 0.6701977252960205, + "num_tokens": 14586692.0, + "step": 563 + }, + { + "epoch": 0.06193718427410499, + "grad_norm": 2.375837564468384, + "learning_rate": 1.0303806734992681e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6964209675788879, + "num_tokens": 14610681.0, + "step": 564 + }, + { + "epoch": 0.06204700197671865, + "grad_norm": 2.312073230743408, + "learning_rate": 1.032210834553441e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6901412010192871, + "num_tokens": 14639026.0, + "step": 565 + }, + { + "epoch": 0.06215681967933231, + "grad_norm": 2.328702449798584, + "learning_rate": 1.0340409956076137e-06, + "loss": 1.1098, + "mean_token_accuracy": 0.677627682685852, + "num_tokens": 14665655.0, + "step": 566 + }, + { + "epoch": 0.06226663738194597, + "grad_norm": 2.2589097023010254, + "learning_rate": 1.0358711566617863e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.6828530430793762, + "num_tokens": 14694721.0, + "step": 567 + }, + { + "epoch": 0.062376455084559634, + "grad_norm": 2.423825740814209, + "learning_rate": 1.037701317715959e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6922272443771362, + "num_tokens": 14718122.0, + "step": 568 + }, + { + "epoch": 0.062486272787173294, + "grad_norm": 2.2363476753234863, + "learning_rate": 1.0395314787701319e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6978490948677063, + "num_tokens": 14744766.0, + "step": 569 + }, + { + "epoch": 0.06259609048978695, + "grad_norm": 2.258350372314453, + "learning_rate": 1.0413616398243047e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6928151249885559, + "num_tokens": 14772843.0, + "step": 570 + }, + { + "epoch": 0.06270590819240061, + "grad_norm": 2.001682758331299, + "learning_rate": 1.0431918008784773e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.6894739866256714, + "num_tokens": 14808076.0, + "step": 571 + }, + { + "epoch": 0.06281572589501427, + "grad_norm": 2.3005666732788086, + "learning_rate": 1.04502196193265e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6949687600135803, + "num_tokens": 14834078.0, + "step": 572 + }, + { + "epoch": 0.06292554359762793, + "grad_norm": 2.01583194732666, + "learning_rate": 1.0468521229868229e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6857874393463135, + "num_tokens": 14864532.0, + "step": 573 + }, + { + "epoch": 0.0630353613002416, + "grad_norm": 2.3891799449920654, + "learning_rate": 1.0486822840409957e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6971402168273926, + "num_tokens": 14888493.0, + "step": 574 + }, + { + "epoch": 0.06314517900285525, + "grad_norm": 2.388533353805542, + "learning_rate": 1.0505124450951685e-06, + "loss": 1.1142, + "mean_token_accuracy": 0.6660879850387573, + "num_tokens": 14913696.0, + "step": 575 + }, + { + "epoch": 0.06325499670546893, + "grad_norm": 2.2506515979766846, + "learning_rate": 1.0523426061493413e-06, + "loss": 1.1258, + "mean_token_accuracy": 0.6705377101898193, + "num_tokens": 14940331.0, + "step": 576 + }, + { + "epoch": 0.06336481440808259, + "grad_norm": 2.1790757179260254, + "learning_rate": 1.054172767203514e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6966467499732971, + "num_tokens": 14966020.0, + "step": 577 + }, + { + "epoch": 0.06347463211069625, + "grad_norm": 2.277449369430542, + "learning_rate": 1.0560029282576869e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6922321319580078, + "num_tokens": 14993168.0, + "step": 578 + }, + { + "epoch": 0.0635844498133099, + "grad_norm": 2.647369146347046, + "learning_rate": 1.0578330893118595e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6966391801834106, + "num_tokens": 15014374.0, + "step": 579 + }, + { + "epoch": 0.06369426751592357, + "grad_norm": 2.1673600673675537, + "learning_rate": 1.0596632503660322e-06, + "loss": 1.1452, + "mean_token_accuracy": 0.658057451248169, + "num_tokens": 15044556.0, + "step": 580 + }, + { + "epoch": 0.06380408521853723, + "grad_norm": 2.446493148803711, + "learning_rate": 1.061493411420205e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6984797120094299, + "num_tokens": 15067565.0, + "step": 581 + }, + { + "epoch": 0.06391390292115089, + "grad_norm": 2.4915952682495117, + "learning_rate": 1.0633235724743778e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6996276378631592, + "num_tokens": 15089589.0, + "step": 582 + }, + { + "epoch": 0.06402372062376455, + "grad_norm": 2.140639305114746, + "learning_rate": 1.0651537335285506e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6918267607688904, + "num_tokens": 15118881.0, + "step": 583 + }, + { + "epoch": 0.0641335383263782, + "grad_norm": 2.208601236343384, + "learning_rate": 1.0669838945827234e-06, + "loss": 1.1099, + "mean_token_accuracy": 0.6730544567108154, + "num_tokens": 15145322.0, + "step": 584 + }, + { + "epoch": 0.06424335602899188, + "grad_norm": 2.182400941848755, + "learning_rate": 1.0688140556368962e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6961876153945923, + "num_tokens": 15172368.0, + "step": 585 + }, + { + "epoch": 0.06435317373160554, + "grad_norm": 2.3574378490448, + "learning_rate": 1.070644216691069e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6868278980255127, + "num_tokens": 15196808.0, + "step": 586 + }, + { + "epoch": 0.0644629914342192, + "grad_norm": 2.664295196533203, + "learning_rate": 1.0724743777452416e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6926919221878052, + "num_tokens": 15216853.0, + "step": 587 + }, + { + "epoch": 0.06457280913683286, + "grad_norm": 2.2587382793426514, + "learning_rate": 1.0743045387994144e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7062407732009888, + "num_tokens": 15244073.0, + "step": 588 + }, + { + "epoch": 0.06468262683944652, + "grad_norm": 2.42038631439209, + "learning_rate": 1.0761346998535872e-06, + "loss": 1.0629, + "mean_token_accuracy": 0.6917333602905273, + "num_tokens": 15267879.0, + "step": 589 + }, + { + "epoch": 0.06479244454206018, + "grad_norm": 2.402540683746338, + "learning_rate": 1.07796486090776e-06, + "loss": 1.0777, + "mean_token_accuracy": 0.6819613575935364, + "num_tokens": 15293382.0, + "step": 590 + }, + { + "epoch": 0.06490226224467384, + "grad_norm": 2.2761776447296143, + "learning_rate": 1.0797950219619326e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.680334210395813, + "num_tokens": 15317469.0, + "step": 591 + }, + { + "epoch": 0.0650120799472875, + "grad_norm": 2.385906457901001, + "learning_rate": 1.0816251830161054e-06, + "loss": 1.1, + "mean_token_accuracy": 0.6826051473617554, + "num_tokens": 15343369.0, + "step": 592 + }, + { + "epoch": 0.06512189764990116, + "grad_norm": 2.544018030166626, + "learning_rate": 1.0834553440702782e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7033692598342896, + "num_tokens": 15364895.0, + "step": 593 + }, + { + "epoch": 0.06523171535251482, + "grad_norm": 2.3652398586273193, + "learning_rate": 1.085285505124451e-06, + "loss": 1.1368, + "mean_token_accuracy": 0.6694431304931641, + "num_tokens": 15391170.0, + "step": 594 + }, + { + "epoch": 0.06534153305512849, + "grad_norm": 2.118293046951294, + "learning_rate": 1.0871156661786238e-06, + "loss": 1.1039, + "mean_token_accuracy": 0.6808845400810242, + "num_tokens": 15420153.0, + "step": 595 + }, + { + "epoch": 0.06545135075774215, + "grad_norm": 2.3162343502044678, + "learning_rate": 1.0889458272327966e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6843353509902954, + "num_tokens": 15446061.0, + "step": 596 + }, + { + "epoch": 0.06556116846035581, + "grad_norm": 2.4395856857299805, + "learning_rate": 1.0907759882869694e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6845937967300415, + "num_tokens": 15469218.0, + "step": 597 + }, + { + "epoch": 0.06567098616296947, + "grad_norm": 2.1428451538085938, + "learning_rate": 1.0926061493411422e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6917203068733215, + "num_tokens": 15496350.0, + "step": 598 + }, + { + "epoch": 0.06578080386558313, + "grad_norm": 2.574434280395508, + "learning_rate": 1.0944363103953148e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6828156113624573, + "num_tokens": 15523726.0, + "step": 599 + }, + { + "epoch": 0.06589062156819679, + "grad_norm": 2.541602611541748, + "learning_rate": 1.0962664714494876e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7156336307525635, + "num_tokens": 15543932.0, + "step": 600 + }, + { + "epoch": 0.06600043927081045, + "grad_norm": 2.3624181747436523, + "learning_rate": 1.0980966325036604e-06, + "loss": 1.1245, + "mean_token_accuracy": 0.6705842018127441, + "num_tokens": 15569476.0, + "step": 601 + }, + { + "epoch": 0.06611025697342411, + "grad_norm": 2.2249417304992676, + "learning_rate": 1.0999267935578332e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6862510442733765, + "num_tokens": 15596692.0, + "step": 602 + }, + { + "epoch": 0.06622007467603777, + "grad_norm": 2.374751329421997, + "learning_rate": 1.101756954612006e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6778714060783386, + "num_tokens": 15621310.0, + "step": 603 + }, + { + "epoch": 0.06632989237865144, + "grad_norm": 2.275635242462158, + "learning_rate": 1.1035871156661788e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6834549903869629, + "num_tokens": 15648436.0, + "step": 604 + }, + { + "epoch": 0.0664397100812651, + "grad_norm": 2.253401041030884, + "learning_rate": 1.1054172767203516e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6868107318878174, + "num_tokens": 15675770.0, + "step": 605 + }, + { + "epoch": 0.06654952778387876, + "grad_norm": 2.4118332862854004, + "learning_rate": 1.1072474377745244e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6865822672843933, + "num_tokens": 15700386.0, + "step": 606 + }, + { + "epoch": 0.06665934548649242, + "grad_norm": 2.359020233154297, + "learning_rate": 1.109077598828697e-06, + "loss": 1.1063, + "mean_token_accuracy": 0.6775742173194885, + "num_tokens": 15725738.0, + "step": 607 + }, + { + "epoch": 0.06676916318910608, + "grad_norm": 2.2371954917907715, + "learning_rate": 1.1109077598828698e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6934623718261719, + "num_tokens": 15752951.0, + "step": 608 + }, + { + "epoch": 0.06687898089171974, + "grad_norm": 2.4569625854492188, + "learning_rate": 1.1127379209370426e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.6749953031539917, + "num_tokens": 15777650.0, + "step": 609 + }, + { + "epoch": 0.0669887985943334, + "grad_norm": 2.270289897918701, + "learning_rate": 1.1145680819912154e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7014426589012146, + "num_tokens": 15803446.0, + "step": 610 + }, + { + "epoch": 0.06709861629694706, + "grad_norm": 2.345302104949951, + "learning_rate": 1.1163982430453882e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7104076743125916, + "num_tokens": 15830829.0, + "step": 611 + }, + { + "epoch": 0.06720843399956072, + "grad_norm": 2.488420248031616, + "learning_rate": 1.118228404099561e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7009477019309998, + "num_tokens": 15852013.0, + "step": 612 + }, + { + "epoch": 0.06731825170217438, + "grad_norm": 2.1728878021240234, + "learning_rate": 1.1200585651537337e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7315019369125366, + "num_tokens": 15877714.0, + "step": 613 + }, + { + "epoch": 0.06742806940478806, + "grad_norm": 2.71893572807312, + "learning_rate": 1.1218887262079065e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.6966979503631592, + "num_tokens": 15896481.0, + "step": 614 + }, + { + "epoch": 0.06753788710740172, + "grad_norm": 2.7263576984405518, + "learning_rate": 1.1237188872620791e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7156102657318115, + "num_tokens": 15915060.0, + "step": 615 + }, + { + "epoch": 0.06764770481001538, + "grad_norm": 2.262775421142578, + "learning_rate": 1.125549048316252e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6921675205230713, + "num_tokens": 15940889.0, + "step": 616 + }, + { + "epoch": 0.06775752251262904, + "grad_norm": 2.3924970626831055, + "learning_rate": 1.1273792093704247e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6954002380371094, + "num_tokens": 15963357.0, + "step": 617 + }, + { + "epoch": 0.0678673402152427, + "grad_norm": 2.3848752975463867, + "learning_rate": 1.1292093704245975e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6826539039611816, + "num_tokens": 15987676.0, + "step": 618 + }, + { + "epoch": 0.06797715791785636, + "grad_norm": 2.2310943603515625, + "learning_rate": 1.1310395314787701e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6926614046096802, + "num_tokens": 16013312.0, + "step": 619 + }, + { + "epoch": 0.06808697562047002, + "grad_norm": 2.317502975463867, + "learning_rate": 1.132869692532943e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6912598609924316, + "num_tokens": 16039300.0, + "step": 620 + }, + { + "epoch": 0.06819679332308368, + "grad_norm": 1.8872519731521606, + "learning_rate": 1.1346998535871157e-06, + "loss": 1.0791, + "mean_token_accuracy": 0.6830843091011047, + "num_tokens": 16079124.0, + "step": 621 + }, + { + "epoch": 0.06830661102569734, + "grad_norm": 2.6005752086639404, + "learning_rate": 1.1365300146412885e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6988505125045776, + "num_tokens": 16099723.0, + "step": 622 + }, + { + "epoch": 0.06841642872831101, + "grad_norm": 2.328848123550415, + "learning_rate": 1.1383601756954613e-06, + "loss": 1.0949, + "mean_token_accuracy": 0.6779663562774658, + "num_tokens": 16125973.0, + "step": 623 + }, + { + "epoch": 0.06852624643092467, + "grad_norm": 2.576923370361328, + "learning_rate": 1.1401903367496341e-06, + "loss": 1.0792, + "mean_token_accuracy": 0.678848922252655, + "num_tokens": 16147599.0, + "step": 624 + }, + { + "epoch": 0.06863606413353833, + "grad_norm": 2.0702171325683594, + "learning_rate": 1.142020497803807e-06, + "loss": 1.0818, + "mean_token_accuracy": 0.6751532554626465, + "num_tokens": 16180581.0, + "step": 625 + }, + { + "epoch": 0.06874588183615199, + "grad_norm": 2.5012943744659424, + "learning_rate": 1.1438506588579797e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6943838596343994, + "num_tokens": 16204195.0, + "step": 626 + }, + { + "epoch": 0.06885569953876565, + "grad_norm": 2.1859688758850098, + "learning_rate": 1.1456808199121523e-06, + "loss": 1.2068, + "mean_token_accuracy": 0.6558350920677185, + "num_tokens": 16235119.0, + "step": 627 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 2.3686435222625732, + "learning_rate": 1.147510980966325e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6794748306274414, + "num_tokens": 16260196.0, + "step": 628 + }, + { + "epoch": 0.06907533494399297, + "grad_norm": 2.480360984802246, + "learning_rate": 1.1493411420204979e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6950423121452332, + "num_tokens": 16284331.0, + "step": 629 + }, + { + "epoch": 0.06918515264660663, + "grad_norm": 2.3895957469940186, + "learning_rate": 1.1511713030746707e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7035046815872192, + "num_tokens": 16306704.0, + "step": 630 + }, + { + "epoch": 0.06929497034922029, + "grad_norm": 2.2850608825683594, + "learning_rate": 1.1530014641288435e-06, + "loss": 1.1274, + "mean_token_accuracy": 0.6659748554229736, + "num_tokens": 16336004.0, + "step": 631 + }, + { + "epoch": 0.06940478805183395, + "grad_norm": 2.3410184383392334, + "learning_rate": 1.1548316251830163e-06, + "loss": 1.1202, + "mean_token_accuracy": 0.6686033010482788, + "num_tokens": 16363725.0, + "step": 632 + }, + { + "epoch": 0.06951460575444762, + "grad_norm": 2.1268932819366455, + "learning_rate": 1.156661786237189e-06, + "loss": 1.0917, + "mean_token_accuracy": 0.6843673586845398, + "num_tokens": 16393282.0, + "step": 633 + }, + { + "epoch": 0.06962442345706128, + "grad_norm": 2.1481924057006836, + "learning_rate": 1.1584919472913619e-06, + "loss": 1.1363, + "mean_token_accuracy": 0.669183611869812, + "num_tokens": 16423955.0, + "step": 634 + }, + { + "epoch": 0.06973424115967494, + "grad_norm": 2.3508522510528564, + "learning_rate": 1.1603221083455345e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7064652442932129, + "num_tokens": 16448078.0, + "step": 635 + }, + { + "epoch": 0.0698440588622886, + "grad_norm": 2.5211925506591797, + "learning_rate": 1.1621522693997073e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6905688047409058, + "num_tokens": 16470671.0, + "step": 636 + }, + { + "epoch": 0.06995387656490226, + "grad_norm": 2.5739641189575195, + "learning_rate": 1.16398243045388e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7091958522796631, + "num_tokens": 16491187.0, + "step": 637 + }, + { + "epoch": 0.07006369426751592, + "grad_norm": 2.4030303955078125, + "learning_rate": 1.1658125915080529e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6990543603897095, + "num_tokens": 16512833.0, + "step": 638 + }, + { + "epoch": 0.07017351197012958, + "grad_norm": 2.3952786922454834, + "learning_rate": 1.1676427525622254e-06, + "loss": 1.107, + "mean_token_accuracy": 0.6754163503646851, + "num_tokens": 16538174.0, + "step": 639 + }, + { + "epoch": 0.07028332967274324, + "grad_norm": 2.680105209350586, + "learning_rate": 1.1694729136163982e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7142612934112549, + "num_tokens": 16557268.0, + "step": 640 + }, + { + "epoch": 0.0703931473753569, + "grad_norm": 2.1696524620056152, + "learning_rate": 1.171303074670571e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.694198727607727, + "num_tokens": 16585239.0, + "step": 641 + }, + { + "epoch": 0.07050296507797058, + "grad_norm": 2.4416239261627197, + "learning_rate": 1.1731332357247438e-06, + "loss": 1.1213, + "mean_token_accuracy": 0.6831202507019043, + "num_tokens": 16611970.0, + "step": 642 + }, + { + "epoch": 0.07061278278058424, + "grad_norm": 2.353233575820923, + "learning_rate": 1.1749633967789166e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6881475448608398, + "num_tokens": 16636178.0, + "step": 643 + }, + { + "epoch": 0.0707226004831979, + "grad_norm": 2.744215726852417, + "learning_rate": 1.1767935578330894e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6795729398727417, + "num_tokens": 16655803.0, + "step": 644 + }, + { + "epoch": 0.07083241818581155, + "grad_norm": 2.7883269786834717, + "learning_rate": 1.1786237188872622e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7157618403434753, + "num_tokens": 16673092.0, + "step": 645 + }, + { + "epoch": 0.07094223588842521, + "grad_norm": 2.1540679931640625, + "learning_rate": 1.180453879941435e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7025617361068726, + "num_tokens": 16701841.0, + "step": 646 + }, + { + "epoch": 0.07105205359103887, + "grad_norm": 2.0835533142089844, + "learning_rate": 1.1822840409956076e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.6807781457901001, + "num_tokens": 16730323.0, + "step": 647 + }, + { + "epoch": 0.07116187129365253, + "grad_norm": 2.254840850830078, + "learning_rate": 1.1841142020497804e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7089033722877502, + "num_tokens": 16758387.0, + "step": 648 + }, + { + "epoch": 0.0712716889962662, + "grad_norm": 2.709393262863159, + "learning_rate": 1.1859443631039532e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7181327939033508, + "num_tokens": 16777948.0, + "step": 649 + }, + { + "epoch": 0.07138150669887985, + "grad_norm": 2.458078622817993, + "learning_rate": 1.187774524158126e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7069375514984131, + "num_tokens": 16800218.0, + "step": 650 + }, + { + "epoch": 0.07149132440149351, + "grad_norm": 2.419766902923584, + "learning_rate": 1.1896046852122988e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6990324258804321, + "num_tokens": 16824747.0, + "step": 651 + }, + { + "epoch": 0.07160114210410719, + "grad_norm": 2.497084140777588, + "learning_rate": 1.1914348462664716e-06, + "loss": 1.1661, + "mean_token_accuracy": 0.6709829568862915, + "num_tokens": 16848945.0, + "step": 652 + }, + { + "epoch": 0.07171095980672085, + "grad_norm": 2.3774805068969727, + "learning_rate": 1.1932650073206444e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.692409873008728, + "num_tokens": 16874584.0, + "step": 653 + }, + { + "epoch": 0.07182077750933451, + "grad_norm": 2.201204299926758, + "learning_rate": 1.1950951683748172e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6882723569869995, + "num_tokens": 16903108.0, + "step": 654 + }, + { + "epoch": 0.07193059521194817, + "grad_norm": 2.3744699954986572, + "learning_rate": 1.1969253294289898e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6992850303649902, + "num_tokens": 16929678.0, + "step": 655 + }, + { + "epoch": 0.07204041291456183, + "grad_norm": 2.7570385932922363, + "learning_rate": 1.1987554904831626e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6894065141677856, + "num_tokens": 16950170.0, + "step": 656 + }, + { + "epoch": 0.07215023061717549, + "grad_norm": 2.365858554840088, + "learning_rate": 1.2005856515373354e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7108068466186523, + "num_tokens": 16971679.0, + "step": 657 + }, + { + "epoch": 0.07226004831978915, + "grad_norm": 2.183664083480835, + "learning_rate": 1.2024158125915082e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6952924132347107, + "num_tokens": 17000328.0, + "step": 658 + }, + { + "epoch": 0.0723698660224028, + "grad_norm": 2.284895896911621, + "learning_rate": 1.2042459736456808e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6911829710006714, + "num_tokens": 17027310.0, + "step": 659 + }, + { + "epoch": 0.07247968372501647, + "grad_norm": 2.244859457015991, + "learning_rate": 1.2060761346998538e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6933692097663879, + "num_tokens": 17054332.0, + "step": 660 + }, + { + "epoch": 0.07258950142763014, + "grad_norm": 2.3271291255950928, + "learning_rate": 1.2079062957540266e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6832638382911682, + "num_tokens": 17080378.0, + "step": 661 + }, + { + "epoch": 0.0726993191302438, + "grad_norm": 2.3240644931793213, + "learning_rate": 1.2097364568081994e-06, + "loss": 1.1527, + "mean_token_accuracy": 0.6612659692764282, + "num_tokens": 17109821.0, + "step": 662 + }, + { + "epoch": 0.07280913683285746, + "grad_norm": 2.0699479579925537, + "learning_rate": 1.211566617862372e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7047457695007324, + "num_tokens": 17138965.0, + "step": 663 + }, + { + "epoch": 0.07291895453547112, + "grad_norm": 2.315486431121826, + "learning_rate": 1.2133967789165448e-06, + "loss": 1.1544, + "mean_token_accuracy": 0.661436915397644, + "num_tokens": 17166681.0, + "step": 664 + }, + { + "epoch": 0.07302877223808478, + "grad_norm": 2.299215316772461, + "learning_rate": 1.2152269399707176e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6852428913116455, + "num_tokens": 17193340.0, + "step": 665 + }, + { + "epoch": 0.07313858994069844, + "grad_norm": 2.6044962406158447, + "learning_rate": 1.2170571010248904e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6947742104530334, + "num_tokens": 17215847.0, + "step": 666 + }, + { + "epoch": 0.0732484076433121, + "grad_norm": 2.275789976119995, + "learning_rate": 1.218887262079063e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7028314471244812, + "num_tokens": 17240907.0, + "step": 667 + }, + { + "epoch": 0.07335822534592576, + "grad_norm": 2.1289219856262207, + "learning_rate": 1.2207174231332358e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7047756910324097, + "num_tokens": 17271481.0, + "step": 668 + }, + { + "epoch": 0.07346804304853942, + "grad_norm": 2.283541679382324, + "learning_rate": 1.2225475841874086e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7147896885871887, + "num_tokens": 17295539.0, + "step": 669 + }, + { + "epoch": 0.07357786075115308, + "grad_norm": 2.306562900543213, + "learning_rate": 1.2243777452415813e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6835716962814331, + "num_tokens": 17323186.0, + "step": 670 + }, + { + "epoch": 0.07368767845376675, + "grad_norm": 2.826174259185791, + "learning_rate": 1.2262079062957541e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7062252759933472, + "num_tokens": 17342993.0, + "step": 671 + }, + { + "epoch": 0.07379749615638041, + "grad_norm": 2.3371243476867676, + "learning_rate": 1.228038067349927e-06, + "loss": 1.0988, + "mean_token_accuracy": 0.6734662055969238, + "num_tokens": 17369691.0, + "step": 672 + }, + { + "epoch": 0.07390731385899407, + "grad_norm": 2.8101320266723633, + "learning_rate": 1.2298682284040997e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7073061466217041, + "num_tokens": 17387254.0, + "step": 673 + }, + { + "epoch": 0.07401713156160773, + "grad_norm": 2.3684768676757812, + "learning_rate": 1.2316983894582725e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.6735561490058899, + "num_tokens": 17411789.0, + "step": 674 + }, + { + "epoch": 0.07412694926422139, + "grad_norm": 2.4854776859283447, + "learning_rate": 1.2335285505124451e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6971673369407654, + "num_tokens": 17435407.0, + "step": 675 + }, + { + "epoch": 0.07423676696683505, + "grad_norm": 2.572396755218506, + "learning_rate": 1.235358711566618e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.68961101770401, + "num_tokens": 17456317.0, + "step": 676 + }, + { + "epoch": 0.07434658466944871, + "grad_norm": 2.790412187576294, + "learning_rate": 1.2371888726207907e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6908167600631714, + "num_tokens": 17475270.0, + "step": 677 + }, + { + "epoch": 0.07445640237206237, + "grad_norm": 2.2163069248199463, + "learning_rate": 1.2390190336749635e-06, + "loss": 1.212, + "mean_token_accuracy": 0.6576230525970459, + "num_tokens": 17505763.0, + "step": 678 + }, + { + "epoch": 0.07456622007467603, + "grad_norm": 2.2123239040374756, + "learning_rate": 1.2408491947291363e-06, + "loss": 1.2115, + "mean_token_accuracy": 0.6453306674957275, + "num_tokens": 17536837.0, + "step": 679 + }, + { + "epoch": 0.0746760377772897, + "grad_norm": 2.539823532104492, + "learning_rate": 1.2426793557833091e-06, + "loss": 1.0849, + "mean_token_accuracy": 0.6792007684707642, + "num_tokens": 17559754.0, + "step": 680 + }, + { + "epoch": 0.07478585547990337, + "grad_norm": 2.222013235092163, + "learning_rate": 1.244509516837482e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6822829842567444, + "num_tokens": 17587933.0, + "step": 681 + }, + { + "epoch": 0.07489567318251703, + "grad_norm": 2.2431485652923584, + "learning_rate": 1.2463396778916547e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6869246959686279, + "num_tokens": 17617716.0, + "step": 682 + }, + { + "epoch": 0.07500549088513069, + "grad_norm": 2.5116655826568604, + "learning_rate": 1.2481698389458273e-06, + "loss": 1.0953, + "mean_token_accuracy": 0.6839919686317444, + "num_tokens": 17641126.0, + "step": 683 + }, + { + "epoch": 0.07511530858774434, + "grad_norm": 2.3246102333068848, + "learning_rate": 1.25e-06, + "loss": 1.1573, + "mean_token_accuracy": 0.6610027551651001, + "num_tokens": 17670388.0, + "step": 684 + }, + { + "epoch": 0.075225126290358, + "grad_norm": 2.2945990562438965, + "learning_rate": 1.2518301610541727e-06, + "loss": 1.0998, + "mean_token_accuracy": 0.6720162630081177, + "num_tokens": 17697836.0, + "step": 685 + }, + { + "epoch": 0.07533494399297166, + "grad_norm": 2.298156261444092, + "learning_rate": 1.2536603221083457e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6800563931465149, + "num_tokens": 17726228.0, + "step": 686 + }, + { + "epoch": 0.07544476169558532, + "grad_norm": 2.4575860500335693, + "learning_rate": 1.2554904831625183e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6878335475921631, + "num_tokens": 17751156.0, + "step": 687 + }, + { + "epoch": 0.07555457939819898, + "grad_norm": 2.1869122982025146, + "learning_rate": 1.257320644216691e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6927695274353027, + "num_tokens": 17777844.0, + "step": 688 + }, + { + "epoch": 0.07566439710081264, + "grad_norm": 2.6275217533111572, + "learning_rate": 1.2591508052708639e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7111775875091553, + "num_tokens": 17797491.0, + "step": 689 + }, + { + "epoch": 0.07577421480342632, + "grad_norm": 2.3919570446014404, + "learning_rate": 1.2609809663250367e-06, + "loss": 1.0966, + "mean_token_accuracy": 0.6837007999420166, + "num_tokens": 17832775.0, + "step": 690 + }, + { + "epoch": 0.07588403250603998, + "grad_norm": 2.2297213077545166, + "learning_rate": 1.2628111273792095e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6958776712417603, + "num_tokens": 17858088.0, + "step": 691 + }, + { + "epoch": 0.07599385020865364, + "grad_norm": 2.1058881282806396, + "learning_rate": 1.2646412884333823e-06, + "loss": 1.116, + "mean_token_accuracy": 0.6719158887863159, + "num_tokens": 17889058.0, + "step": 692 + }, + { + "epoch": 0.0761036679112673, + "grad_norm": 2.384384870529175, + "learning_rate": 1.2664714494875549e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6937069892883301, + "num_tokens": 17917872.0, + "step": 693 + }, + { + "epoch": 0.07621348561388096, + "grad_norm": 2.08693528175354, + "learning_rate": 1.2683016105417279e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6896955370903015, + "num_tokens": 17947269.0, + "step": 694 + }, + { + "epoch": 0.07632330331649462, + "grad_norm": 2.208536386489868, + "learning_rate": 1.2701317715959005e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6869146823883057, + "num_tokens": 17974411.0, + "step": 695 + }, + { + "epoch": 0.07643312101910828, + "grad_norm": 2.5773706436157227, + "learning_rate": 1.2719619326500733e-06, + "loss": 1.1084, + "mean_token_accuracy": 0.675640344619751, + "num_tokens": 17998280.0, + "step": 696 + }, + { + "epoch": 0.07654293872172194, + "grad_norm": 2.506455898284912, + "learning_rate": 1.273792093704246e-06, + "loss": 1.0951, + "mean_token_accuracy": 0.6759030818939209, + "num_tokens": 18021082.0, + "step": 697 + }, + { + "epoch": 0.0766527564243356, + "grad_norm": 2.1436359882354736, + "learning_rate": 1.2756222547584189e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7098819017410278, + "num_tokens": 18049122.0, + "step": 698 + }, + { + "epoch": 0.07676257412694927, + "grad_norm": 2.3721930980682373, + "learning_rate": 1.2774524158125917e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7043600082397461, + "num_tokens": 18071669.0, + "step": 699 + }, + { + "epoch": 0.07687239182956293, + "grad_norm": 2.5397374629974365, + "learning_rate": 1.2792825768667645e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7070221900939941, + "num_tokens": 18094951.0, + "step": 700 + }, + { + "epoch": 0.07698220953217659, + "grad_norm": 2.4496796131134033, + "learning_rate": 1.281112737920937e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6873705387115479, + "num_tokens": 18118571.0, + "step": 701 + }, + { + "epoch": 0.07709202723479025, + "grad_norm": 2.664055347442627, + "learning_rate": 1.28294289897511e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6854043006896973, + "num_tokens": 18138524.0, + "step": 702 + }, + { + "epoch": 0.07720184493740391, + "grad_norm": 2.3788533210754395, + "learning_rate": 1.2847730600292826e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7137336730957031, + "num_tokens": 18161629.0, + "step": 703 + }, + { + "epoch": 0.07731166264001757, + "grad_norm": 2.084852457046509, + "learning_rate": 1.2866032210834552e-06, + "loss": 1.1447, + "mean_token_accuracy": 0.6654503345489502, + "num_tokens": 18191802.0, + "step": 704 + }, + { + "epoch": 0.07742148034263123, + "grad_norm": 2.259676694869995, + "learning_rate": 1.2884333821376282e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.690543532371521, + "num_tokens": 18215672.0, + "step": 705 + }, + { + "epoch": 0.07753129804524489, + "grad_norm": 2.515875816345215, + "learning_rate": 1.2902635431918008e-06, + "loss": 1.085, + "mean_token_accuracy": 0.6853643655776978, + "num_tokens": 18237047.0, + "step": 706 + }, + { + "epoch": 0.07764111574785855, + "grad_norm": 2.1531386375427246, + "learning_rate": 1.2920937042459738e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.682395339012146, + "num_tokens": 18265993.0, + "step": 707 + }, + { + "epoch": 0.07775093345047221, + "grad_norm": 2.110705852508545, + "learning_rate": 1.2939238653001464e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6916935443878174, + "num_tokens": 18296797.0, + "step": 708 + }, + { + "epoch": 0.07786075115308588, + "grad_norm": 2.5885276794433594, + "learning_rate": 1.2957540263543192e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6942789554595947, + "num_tokens": 18316512.0, + "step": 709 + }, + { + "epoch": 0.07797056885569954, + "grad_norm": 2.081955909729004, + "learning_rate": 1.2975841874084922e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7062791585922241, + "num_tokens": 18346058.0, + "step": 710 + }, + { + "epoch": 0.0780803865583132, + "grad_norm": 2.2713427543640137, + "learning_rate": 1.2994143484626648e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6860449910163879, + "num_tokens": 18371204.0, + "step": 711 + }, + { + "epoch": 0.07819020426092686, + "grad_norm": 1.9653834104537964, + "learning_rate": 1.3012445095168374e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7016893029212952, + "num_tokens": 18404642.0, + "step": 712 + }, + { + "epoch": 0.07830002196354052, + "grad_norm": 2.353724479675293, + "learning_rate": 1.3030746705710104e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6913173198699951, + "num_tokens": 18428701.0, + "step": 713 + }, + { + "epoch": 0.07840983966615418, + "grad_norm": 2.0201544761657715, + "learning_rate": 1.304904831625183e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6777544021606445, + "num_tokens": 18458904.0, + "step": 714 + }, + { + "epoch": 0.07851965736876784, + "grad_norm": 2.4842700958251953, + "learning_rate": 1.306734992679356e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7142635583877563, + "num_tokens": 18479855.0, + "step": 715 + }, + { + "epoch": 0.0786294750713815, + "grad_norm": 2.3354268074035645, + "learning_rate": 1.3085651537335286e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6894763112068176, + "num_tokens": 18504939.0, + "step": 716 + }, + { + "epoch": 0.07873929277399516, + "grad_norm": 2.246230125427246, + "learning_rate": 1.3103953147877014e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6743897199630737, + "num_tokens": 18531167.0, + "step": 717 + }, + { + "epoch": 0.07884911047660884, + "grad_norm": 2.094808578491211, + "learning_rate": 1.3122254758418742e-06, + "loss": 1.0925, + "mean_token_accuracy": 0.677797794342041, + "num_tokens": 18562145.0, + "step": 718 + }, + { + "epoch": 0.0789589281792225, + "grad_norm": 2.4393310546875, + "learning_rate": 1.314055636896047e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7229115962982178, + "num_tokens": 18586101.0, + "step": 719 + }, + { + "epoch": 0.07906874588183616, + "grad_norm": 2.425447940826416, + "learning_rate": 1.3158857979502196e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6972521543502808, + "num_tokens": 18608551.0, + "step": 720 + }, + { + "epoch": 0.07917856358444982, + "grad_norm": 2.247429132461548, + "learning_rate": 1.3177159590043926e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6992690563201904, + "num_tokens": 18633891.0, + "step": 721 + }, + { + "epoch": 0.07928838128706348, + "grad_norm": 2.257441997528076, + "learning_rate": 1.3195461200585652e-06, + "loss": 1.1114, + "mean_token_accuracy": 0.668522298336029, + "num_tokens": 18660340.0, + "step": 722 + }, + { + "epoch": 0.07939819898967714, + "grad_norm": 2.3045055866241455, + "learning_rate": 1.3213762811127382e-06, + "loss": 1.0916, + "mean_token_accuracy": 0.6724821329116821, + "num_tokens": 18686163.0, + "step": 723 + }, + { + "epoch": 0.0795080166922908, + "grad_norm": 2.277878999710083, + "learning_rate": 1.3232064421669108e-06, + "loss": 1.1783, + "mean_token_accuracy": 0.6541460752487183, + "num_tokens": 18715154.0, + "step": 724 + }, + { + "epoch": 0.07961783439490445, + "grad_norm": 2.525681734085083, + "learning_rate": 1.3250366032210836e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6848687529563904, + "num_tokens": 18737845.0, + "step": 725 + }, + { + "epoch": 0.07972765209751811, + "grad_norm": 2.5540950298309326, + "learning_rate": 1.3268667642752564e-06, + "loss": 1.0918, + "mean_token_accuracy": 0.6761848330497742, + "num_tokens": 18760808.0, + "step": 726 + }, + { + "epoch": 0.07983746980013177, + "grad_norm": 2.2363319396972656, + "learning_rate": 1.3286969253294292e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6883573532104492, + "num_tokens": 18789267.0, + "step": 727 + }, + { + "epoch": 0.07994728750274545, + "grad_norm": 2.3786137104034424, + "learning_rate": 1.3305270863836017e-06, + "loss": 1.1034, + "mean_token_accuracy": 0.6808124780654907, + "num_tokens": 18814119.0, + "step": 728 + }, + { + "epoch": 0.08005710520535911, + "grad_norm": 2.28578782081604, + "learning_rate": 1.3323572474377748e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7045937776565552, + "num_tokens": 18838148.0, + "step": 729 + }, + { + "epoch": 0.08016692290797277, + "grad_norm": 2.232503890991211, + "learning_rate": 1.3341874084919473e-06, + "loss": 1.0946, + "mean_token_accuracy": 0.6800082921981812, + "num_tokens": 18864990.0, + "step": 730 + }, + { + "epoch": 0.08027674061058643, + "grad_norm": 2.4936370849609375, + "learning_rate": 1.3360175695461204e-06, + "loss": 1.134, + "mean_token_accuracy": 0.664015531539917, + "num_tokens": 18891464.0, + "step": 731 + }, + { + "epoch": 0.08038655831320009, + "grad_norm": 2.674668550491333, + "learning_rate": 1.337847730600293e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.692380428314209, + "num_tokens": 18912615.0, + "step": 732 + }, + { + "epoch": 0.08049637601581375, + "grad_norm": 2.059605121612549, + "learning_rate": 1.3396778916544655e-06, + "loss": 1.0855, + "mean_token_accuracy": 0.683771014213562, + "num_tokens": 18941167.0, + "step": 733 + }, + { + "epoch": 0.08060619371842741, + "grad_norm": 2.286419153213501, + "learning_rate": 1.3415080527086385e-06, + "loss": 1.0922, + "mean_token_accuracy": 0.6794401407241821, + "num_tokens": 18967645.0, + "step": 734 + }, + { + "epoch": 0.08071601142104107, + "grad_norm": 2.3266477584838867, + "learning_rate": 1.3433382137628111e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6800175905227661, + "num_tokens": 18991395.0, + "step": 735 + }, + { + "epoch": 0.08082582912365473, + "grad_norm": 2.2671310901641846, + "learning_rate": 1.345168374816984e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.680463969707489, + "num_tokens": 19020275.0, + "step": 736 + }, + { + "epoch": 0.0809356468262684, + "grad_norm": 3.69795560836792, + "learning_rate": 1.3469985358711567e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.69437575340271, + "num_tokens": 19042024.0, + "step": 737 + }, + { + "epoch": 0.08104546452888206, + "grad_norm": 2.398355722427368, + "learning_rate": 1.3488286969253295e-06, + "loss": 1.0912, + "mean_token_accuracy": 0.6866046190261841, + "num_tokens": 19067398.0, + "step": 738 + }, + { + "epoch": 0.08115528223149572, + "grad_norm": 2.371962547302246, + "learning_rate": 1.3506588579795023e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6840715408325195, + "num_tokens": 19091311.0, + "step": 739 + }, + { + "epoch": 0.08126509993410938, + "grad_norm": 2.360893487930298, + "learning_rate": 1.3524890190336751e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6895284652709961, + "num_tokens": 19115457.0, + "step": 740 + }, + { + "epoch": 0.08137491763672304, + "grad_norm": 2.0903398990631104, + "learning_rate": 1.3543191800878477e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6883247494697571, + "num_tokens": 19147644.0, + "step": 741 + }, + { + "epoch": 0.0814847353393367, + "grad_norm": 2.6006884574890137, + "learning_rate": 1.3561493411420207e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6981140971183777, + "num_tokens": 19166991.0, + "step": 742 + }, + { + "epoch": 0.08159455304195036, + "grad_norm": 1.9888633489608765, + "learning_rate": 1.3579795021961933e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6833779811859131, + "num_tokens": 19200723.0, + "step": 743 + }, + { + "epoch": 0.08170437074456402, + "grad_norm": 2.1963655948638916, + "learning_rate": 1.359809663250366e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.6831595301628113, + "num_tokens": 19227899.0, + "step": 744 + }, + { + "epoch": 0.08181418844717768, + "grad_norm": 2.2547731399536133, + "learning_rate": 1.361639824304539e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.698525607585907, + "num_tokens": 19253606.0, + "step": 745 + }, + { + "epoch": 0.08192400614979134, + "grad_norm": 2.381336212158203, + "learning_rate": 1.3634699853587117e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7005480527877808, + "num_tokens": 19277665.0, + "step": 746 + }, + { + "epoch": 0.08203382385240501, + "grad_norm": 2.2827303409576416, + "learning_rate": 1.3653001464128845e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7006678581237793, + "num_tokens": 19303947.0, + "step": 747 + }, + { + "epoch": 0.08214364155501867, + "grad_norm": 2.467456340789795, + "learning_rate": 1.3671303074670573e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7061303853988647, + "num_tokens": 19325748.0, + "step": 748 + }, + { + "epoch": 0.08225345925763233, + "grad_norm": 2.5861411094665527, + "learning_rate": 1.3689604685212299e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.697941780090332, + "num_tokens": 19350005.0, + "step": 749 + }, + { + "epoch": 0.08236327696024599, + "grad_norm": 2.43172025680542, + "learning_rate": 1.3707906295754029e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7145662307739258, + "num_tokens": 19371933.0, + "step": 750 + }, + { + "epoch": 0.08247309466285965, + "grad_norm": 2.5751378536224365, + "learning_rate": 1.3726207906295755e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7232630252838135, + "num_tokens": 19391386.0, + "step": 751 + }, + { + "epoch": 0.08258291236547331, + "grad_norm": 2.3809475898742676, + "learning_rate": 1.374450951683748e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6932128667831421, + "num_tokens": 19416792.0, + "step": 752 + }, + { + "epoch": 0.08269273006808697, + "grad_norm": 1.9830600023269653, + "learning_rate": 1.376281112737921e-06, + "loss": 1.0781, + "mean_token_accuracy": 0.6788616180419922, + "num_tokens": 19450091.0, + "step": 753 + }, + { + "epoch": 0.08280254777070063, + "grad_norm": 2.342216730117798, + "learning_rate": 1.3781112737920937e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.6691786050796509, + "num_tokens": 19475785.0, + "step": 754 + }, + { + "epoch": 0.08291236547331429, + "grad_norm": 2.364586353302002, + "learning_rate": 1.3799414348462667e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.6772383451461792, + "num_tokens": 19499563.0, + "step": 755 + }, + { + "epoch": 0.08302218317592797, + "grad_norm": 2.0468087196350098, + "learning_rate": 1.3817715959004393e-06, + "loss": 1.0872, + "mean_token_accuracy": 0.6773324012756348, + "num_tokens": 19531020.0, + "step": 756 + }, + { + "epoch": 0.08313200087854163, + "grad_norm": 2.2822089195251465, + "learning_rate": 1.383601756954612e-06, + "loss": 1.0931, + "mean_token_accuracy": 0.6866973638534546, + "num_tokens": 19556933.0, + "step": 757 + }, + { + "epoch": 0.08324181858115529, + "grad_norm": 2.4229342937469482, + "learning_rate": 1.385431918008785e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7066225409507751, + "num_tokens": 19579735.0, + "step": 758 + }, + { + "epoch": 0.08335163628376895, + "grad_norm": 2.0005719661712646, + "learning_rate": 1.3872620790629576e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6953040361404419, + "num_tokens": 19608543.0, + "step": 759 + }, + { + "epoch": 0.0834614539863826, + "grad_norm": 2.469393491744995, + "learning_rate": 1.3890922401171302e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6898808479309082, + "num_tokens": 19631465.0, + "step": 760 + }, + { + "epoch": 0.08357127168899627, + "grad_norm": 2.3113911151885986, + "learning_rate": 1.3909224011713032e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7220426797866821, + "num_tokens": 19658525.0, + "step": 761 + }, + { + "epoch": 0.08368108939160993, + "grad_norm": 2.2096080780029297, + "learning_rate": 1.3927525622254758e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6891541481018066, + "num_tokens": 19686575.0, + "step": 762 + }, + { + "epoch": 0.08379090709422359, + "grad_norm": 2.254387378692627, + "learning_rate": 1.3945827232796488e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6987746953964233, + "num_tokens": 19714709.0, + "step": 763 + }, + { + "epoch": 0.08390072479683724, + "grad_norm": 2.177772045135498, + "learning_rate": 1.3964128843338214e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6883844137191772, + "num_tokens": 19742350.0, + "step": 764 + }, + { + "epoch": 0.0840105424994509, + "grad_norm": 2.4453377723693848, + "learning_rate": 1.3982430453879942e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.7005026936531067, + "num_tokens": 19763421.0, + "step": 765 + }, + { + "epoch": 0.08412036020206458, + "grad_norm": 2.1651482582092285, + "learning_rate": 1.400073206442167e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6843952536582947, + "num_tokens": 19790436.0, + "step": 766 + }, + { + "epoch": 0.08423017790467824, + "grad_norm": 2.561889410018921, + "learning_rate": 1.4019033674963398e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6836188435554504, + "num_tokens": 19812813.0, + "step": 767 + }, + { + "epoch": 0.0843399956072919, + "grad_norm": 2.1898202896118164, + "learning_rate": 1.4037335285505124e-06, + "loss": 1.116, + "mean_token_accuracy": 0.6724109053611755, + "num_tokens": 19842380.0, + "step": 768 + }, + { + "epoch": 0.08444981330990556, + "grad_norm": 2.7006845474243164, + "learning_rate": 1.4055636896046854e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7255488038063049, + "num_tokens": 19859713.0, + "step": 769 + }, + { + "epoch": 0.08455963101251922, + "grad_norm": 2.6743905544281006, + "learning_rate": 1.407393850658858e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.7025921940803528, + "num_tokens": 19877484.0, + "step": 770 + }, + { + "epoch": 0.08466944871513288, + "grad_norm": 2.1688997745513916, + "learning_rate": 1.409224011713031e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6947850584983826, + "num_tokens": 19906257.0, + "step": 771 + }, + { + "epoch": 0.08477926641774654, + "grad_norm": 2.093921661376953, + "learning_rate": 1.4110541727672036e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6948038339614868, + "num_tokens": 19936542.0, + "step": 772 + }, + { + "epoch": 0.0848890841203602, + "grad_norm": 2.2189459800720215, + "learning_rate": 1.4128843338213762e-06, + "loss": 1.1022, + "mean_token_accuracy": 0.6766310930252075, + "num_tokens": 19963364.0, + "step": 773 + }, + { + "epoch": 0.08499890182297386, + "grad_norm": 2.4252469539642334, + "learning_rate": 1.4147144948755492e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7003713846206665, + "num_tokens": 19986632.0, + "step": 774 + }, + { + "epoch": 0.08510871952558753, + "grad_norm": 2.509523868560791, + "learning_rate": 1.416544655929722e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6828901767730713, + "num_tokens": 20008985.0, + "step": 775 + }, + { + "epoch": 0.08521853722820119, + "grad_norm": 2.083956480026245, + "learning_rate": 1.4183748169838946e-06, + "loss": 1.1072, + "mean_token_accuracy": 0.6688941121101379, + "num_tokens": 20039950.0, + "step": 776 + }, + { + "epoch": 0.08532835493081485, + "grad_norm": 1.9028677940368652, + "learning_rate": 1.4202049780380676e-06, + "loss": 1.0944, + "mean_token_accuracy": 0.6746198534965515, + "num_tokens": 20076611.0, + "step": 777 + }, + { + "epoch": 0.08543817263342851, + "grad_norm": 2.3139712810516357, + "learning_rate": 1.4220351390922402e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6900299191474915, + "num_tokens": 20103360.0, + "step": 778 + }, + { + "epoch": 0.08554799033604217, + "grad_norm": 2.3118834495544434, + "learning_rate": 1.4238653001464132e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6892503499984741, + "num_tokens": 20127652.0, + "step": 779 + }, + { + "epoch": 0.08565780803865583, + "grad_norm": 2.4091219902038574, + "learning_rate": 1.4256954612005858e-06, + "loss": 1.1343, + "mean_token_accuracy": 0.673589289188385, + "num_tokens": 20152607.0, + "step": 780 + }, + { + "epoch": 0.08576762574126949, + "grad_norm": 1.946776032447815, + "learning_rate": 1.4275256222547584e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6782004833221436, + "num_tokens": 20185230.0, + "step": 781 + }, + { + "epoch": 0.08587744344388315, + "grad_norm": 2.3151872158050537, + "learning_rate": 1.4293557833089314e-06, + "loss": 1.1031, + "mean_token_accuracy": 0.6830103993415833, + "num_tokens": 20210246.0, + "step": 782 + }, + { + "epoch": 0.08598726114649681, + "grad_norm": 2.360535144805908, + "learning_rate": 1.431185944363104e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7035452127456665, + "num_tokens": 20233564.0, + "step": 783 + }, + { + "epoch": 0.08609707884911047, + "grad_norm": 2.343594551086426, + "learning_rate": 1.4330161054172768e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.6793848276138306, + "num_tokens": 20257340.0, + "step": 784 + }, + { + "epoch": 0.08620689655172414, + "grad_norm": 2.2035577297210693, + "learning_rate": 1.4348462664714496e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6997925043106079, + "num_tokens": 20286125.0, + "step": 785 + }, + { + "epoch": 0.0863167142543378, + "grad_norm": 1.9382853507995605, + "learning_rate": 1.4366764275256224e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6882298588752747, + "num_tokens": 20321609.0, + "step": 786 + }, + { + "epoch": 0.08642653195695146, + "grad_norm": 2.1427531242370605, + "learning_rate": 1.4385065885797952e-06, + "loss": 1.0885, + "mean_token_accuracy": 0.6829932928085327, + "num_tokens": 20353026.0, + "step": 787 + }, + { + "epoch": 0.08653634965956512, + "grad_norm": 2.438516139984131, + "learning_rate": 1.440336749633968e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.703875720500946, + "num_tokens": 20375177.0, + "step": 788 + }, + { + "epoch": 0.08664616736217878, + "grad_norm": 2.2339067459106445, + "learning_rate": 1.4421669106881405e-06, + "loss": 1.044, + "mean_token_accuracy": 0.6893581748008728, + "num_tokens": 20402120.0, + "step": 789 + }, + { + "epoch": 0.08675598506479244, + "grad_norm": 2.246311902999878, + "learning_rate": 1.4439970717423136e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6829434633255005, + "num_tokens": 20428475.0, + "step": 790 + }, + { + "epoch": 0.0868658027674061, + "grad_norm": 2.4762184619903564, + "learning_rate": 1.4458272327964861e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6953016519546509, + "num_tokens": 20451499.0, + "step": 791 + }, + { + "epoch": 0.08697562047001976, + "grad_norm": 2.2944514751434326, + "learning_rate": 1.447657393850659e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6926838159561157, + "num_tokens": 20476224.0, + "step": 792 + }, + { + "epoch": 0.08708543817263342, + "grad_norm": 1.9594039916992188, + "learning_rate": 1.4494875549048317e-06, + "loss": 1.133, + "mean_token_accuracy": 0.664836049079895, + "num_tokens": 20511890.0, + "step": 793 + }, + { + "epoch": 0.0871952558752471, + "grad_norm": 2.2018649578094482, + "learning_rate": 1.4513177159590045e-06, + "loss": 1.0866, + "mean_token_accuracy": 0.680583119392395, + "num_tokens": 20543650.0, + "step": 794 + }, + { + "epoch": 0.08730507357786076, + "grad_norm": 2.469050645828247, + "learning_rate": 1.4531478770131773e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6857365369796753, + "num_tokens": 20566571.0, + "step": 795 + }, + { + "epoch": 0.08741489128047442, + "grad_norm": 3.0044403076171875, + "learning_rate": 1.4549780380673501e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6967573761940002, + "num_tokens": 20585499.0, + "step": 796 + }, + { + "epoch": 0.08752470898308808, + "grad_norm": 2.152517318725586, + "learning_rate": 1.4568081991215227e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.6817067265510559, + "num_tokens": 20614943.0, + "step": 797 + }, + { + "epoch": 0.08763452668570174, + "grad_norm": 2.142561912536621, + "learning_rate": 1.4586383601756957e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6877919435501099, + "num_tokens": 20640516.0, + "step": 798 + }, + { + "epoch": 0.0877443443883154, + "grad_norm": 2.0707173347473145, + "learning_rate": 1.4604685212298683e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7090843319892883, + "num_tokens": 20669590.0, + "step": 799 + }, + { + "epoch": 0.08785416209092906, + "grad_norm": 2.275674343109131, + "learning_rate": 1.462298682284041e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.685224175453186, + "num_tokens": 20693853.0, + "step": 800 + }, + { + "epoch": 0.08796397979354272, + "grad_norm": 2.130009412765503, + "learning_rate": 1.464128843338214e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7013375759124756, + "num_tokens": 20722026.0, + "step": 801 + }, + { + "epoch": 0.08807379749615638, + "grad_norm": 2.3955423831939697, + "learning_rate": 1.4659590043923865e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6840609312057495, + "num_tokens": 20747946.0, + "step": 802 + }, + { + "epoch": 0.08818361519877004, + "grad_norm": 2.084047555923462, + "learning_rate": 1.4677891654465595e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7147344350814819, + "num_tokens": 20776383.0, + "step": 803 + }, + { + "epoch": 0.08829343290138371, + "grad_norm": 2.079068422317505, + "learning_rate": 1.469619326500732e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.685759425163269, + "num_tokens": 20806040.0, + "step": 804 + }, + { + "epoch": 0.08840325060399737, + "grad_norm": 2.154160737991333, + "learning_rate": 1.4714494875549049e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7006321549415588, + "num_tokens": 20835247.0, + "step": 805 + }, + { + "epoch": 0.08851306830661103, + "grad_norm": 2.184633493423462, + "learning_rate": 1.4732796486090777e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.6730568408966064, + "num_tokens": 20865188.0, + "step": 806 + }, + { + "epoch": 0.08862288600922469, + "grad_norm": 2.0934853553771973, + "learning_rate": 1.4751098096632505e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6910343170166016, + "num_tokens": 20894011.0, + "step": 807 + }, + { + "epoch": 0.08873270371183835, + "grad_norm": 2.1403887271881104, + "learning_rate": 1.476939970717423e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6873581409454346, + "num_tokens": 20923917.0, + "step": 808 + }, + { + "epoch": 0.08884252141445201, + "grad_norm": 2.1710386276245117, + "learning_rate": 1.478770131771596e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6876229643821716, + "num_tokens": 20953055.0, + "step": 809 + }, + { + "epoch": 0.08895233911706567, + "grad_norm": 1.894240140914917, + "learning_rate": 1.4806002928257687e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6896669864654541, + "num_tokens": 20990878.0, + "step": 810 + }, + { + "epoch": 0.08906215681967933, + "grad_norm": 2.238567352294922, + "learning_rate": 1.4824304538799417e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7033143639564514, + "num_tokens": 21019333.0, + "step": 811 + }, + { + "epoch": 0.08917197452229299, + "grad_norm": 2.1342766284942627, + "learning_rate": 1.4842606149341143e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.69620680809021, + "num_tokens": 21049094.0, + "step": 812 + }, + { + "epoch": 0.08928179222490666, + "grad_norm": 2.4774603843688965, + "learning_rate": 1.486090775988287e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.702077329158783, + "num_tokens": 21070764.0, + "step": 813 + }, + { + "epoch": 0.08939160992752032, + "grad_norm": 2.296412706375122, + "learning_rate": 1.4879209370424599e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6761495471000671, + "num_tokens": 21095420.0, + "step": 814 + }, + { + "epoch": 0.08950142763013398, + "grad_norm": 2.4670588970184326, + "learning_rate": 1.4897510980966327e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7072901725769043, + "num_tokens": 21117018.0, + "step": 815 + }, + { + "epoch": 0.08961124533274764, + "grad_norm": 2.22832989692688, + "learning_rate": 1.4915812591508053e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6730377078056335, + "num_tokens": 21145069.0, + "step": 816 + }, + { + "epoch": 0.0897210630353613, + "grad_norm": 2.0595438480377197, + "learning_rate": 1.4934114202049783e-06, + "loss": 1.0706, + "mean_token_accuracy": 0.6856311559677124, + "num_tokens": 21175637.0, + "step": 817 + }, + { + "epoch": 0.08983088073797496, + "grad_norm": 1.998032569885254, + "learning_rate": 1.4952415812591508e-06, + "loss": 1.1826, + "mean_token_accuracy": 0.6576592922210693, + "num_tokens": 21211465.0, + "step": 818 + }, + { + "epoch": 0.08994069844058862, + "grad_norm": 2.39705753326416, + "learning_rate": 1.4970717423133239e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7119095921516418, + "num_tokens": 21233489.0, + "step": 819 + }, + { + "epoch": 0.09005051614320228, + "grad_norm": 2.4712703227996826, + "learning_rate": 1.4989019033674964e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7043080925941467, + "num_tokens": 21254797.0, + "step": 820 + }, + { + "epoch": 0.09016033384581594, + "grad_norm": 2.5018224716186523, + "learning_rate": 1.500732064421669e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6910544633865356, + "num_tokens": 21275344.0, + "step": 821 + }, + { + "epoch": 0.0902701515484296, + "grad_norm": 2.675144672393799, + "learning_rate": 1.502562225475842e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6820976138114929, + "num_tokens": 21296384.0, + "step": 822 + }, + { + "epoch": 0.09037996925104327, + "grad_norm": 2.280566453933716, + "learning_rate": 1.5043923865300148e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6923546195030212, + "num_tokens": 21321685.0, + "step": 823 + }, + { + "epoch": 0.09048978695365693, + "grad_norm": 2.3556222915649414, + "learning_rate": 1.5062225475841874e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6872192025184631, + "num_tokens": 21345698.0, + "step": 824 + }, + { + "epoch": 0.0905996046562706, + "grad_norm": 2.5065760612487793, + "learning_rate": 1.5080527086383604e-06, + "loss": 1.039, + "mean_token_accuracy": 0.688526451587677, + "num_tokens": 21368581.0, + "step": 825 + }, + { + "epoch": 0.09070942235888425, + "grad_norm": 2.2585134506225586, + "learning_rate": 1.509882869692533e-06, + "loss": 1.0715, + "mean_token_accuracy": 0.6801366806030273, + "num_tokens": 21397908.0, + "step": 826 + }, + { + "epoch": 0.09081924006149791, + "grad_norm": 2.2681233882904053, + "learning_rate": 1.511713030746706e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6932809352874756, + "num_tokens": 21422969.0, + "step": 827 + }, + { + "epoch": 0.09092905776411157, + "grad_norm": 2.4711315631866455, + "learning_rate": 1.5135431918008786e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6861393451690674, + "num_tokens": 21445845.0, + "step": 828 + }, + { + "epoch": 0.09103887546672523, + "grad_norm": 2.378488540649414, + "learning_rate": 1.5153733528550512e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.6848686337471008, + "num_tokens": 21472135.0, + "step": 829 + }, + { + "epoch": 0.09114869316933889, + "grad_norm": 2.2933261394500732, + "learning_rate": 1.5172035139092242e-06, + "loss": 1.0981, + "mean_token_accuracy": 0.6789476871490479, + "num_tokens": 21501013.0, + "step": 830 + }, + { + "epoch": 0.09125851087195255, + "grad_norm": 2.20474910736084, + "learning_rate": 1.5190336749633968e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.700356125831604, + "num_tokens": 21525766.0, + "step": 831 + }, + { + "epoch": 0.09136832857456623, + "grad_norm": 2.405982732772827, + "learning_rate": 1.5208638360175696e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7226903438568115, + "num_tokens": 21548907.0, + "step": 832 + }, + { + "epoch": 0.09147814627717989, + "grad_norm": 2.401808738708496, + "learning_rate": 1.5226939970717424e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6831666231155396, + "num_tokens": 21573434.0, + "step": 833 + }, + { + "epoch": 0.09158796397979355, + "grad_norm": 2.4408135414123535, + "learning_rate": 1.5245241581259152e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6889650821685791, + "num_tokens": 21596791.0, + "step": 834 + }, + { + "epoch": 0.0916977816824072, + "grad_norm": 2.264362335205078, + "learning_rate": 1.526354319180088e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.680194616317749, + "num_tokens": 21625612.0, + "step": 835 + }, + { + "epoch": 0.09180759938502087, + "grad_norm": 2.6703107357025146, + "learning_rate": 1.5281844802342608e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6927430033683777, + "num_tokens": 21643992.0, + "step": 836 + }, + { + "epoch": 0.09191741708763453, + "grad_norm": 2.4004502296447754, + "learning_rate": 1.5300146412884334e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6881699562072754, + "num_tokens": 21669192.0, + "step": 837 + }, + { + "epoch": 0.09202723479024819, + "grad_norm": 2.1647486686706543, + "learning_rate": 1.5318448023426064e-06, + "loss": 1.1067, + "mean_token_accuracy": 0.6773272752761841, + "num_tokens": 21696513.0, + "step": 838 + }, + { + "epoch": 0.09213705249286185, + "grad_norm": 2.271617889404297, + "learning_rate": 1.533674963396779e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.700514554977417, + "num_tokens": 21721117.0, + "step": 839 + }, + { + "epoch": 0.0922468701954755, + "grad_norm": 2.3646647930145264, + "learning_rate": 1.5355051244509518e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.6859309673309326, + "num_tokens": 21746585.0, + "step": 840 + }, + { + "epoch": 0.09235668789808917, + "grad_norm": 2.1488161087036133, + "learning_rate": 1.5373352855051246e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6830739974975586, + "num_tokens": 21776087.0, + "step": 841 + }, + { + "epoch": 0.09246650560070284, + "grad_norm": 2.355388879776001, + "learning_rate": 1.5391654465592974e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7165321111679077, + "num_tokens": 21798845.0, + "step": 842 + }, + { + "epoch": 0.0925763233033165, + "grad_norm": 2.2793359756469727, + "learning_rate": 1.5409956076134702e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6987842917442322, + "num_tokens": 21823567.0, + "step": 843 + }, + { + "epoch": 0.09268614100593016, + "grad_norm": 2.5200629234313965, + "learning_rate": 1.542825768667643e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6966334581375122, + "num_tokens": 21846618.0, + "step": 844 + }, + { + "epoch": 0.09279595870854382, + "grad_norm": 2.596444845199585, + "learning_rate": 1.5446559297218156e-06, + "loss": 1.0781, + "mean_token_accuracy": 0.6833420991897583, + "num_tokens": 21869149.0, + "step": 845 + }, + { + "epoch": 0.09290577641115748, + "grad_norm": 2.3210766315460205, + "learning_rate": 1.5464860907759886e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6961880922317505, + "num_tokens": 21892917.0, + "step": 846 + }, + { + "epoch": 0.09301559411377114, + "grad_norm": 2.130445718765259, + "learning_rate": 1.5483162518301612e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.6858341693878174, + "num_tokens": 21920457.0, + "step": 847 + }, + { + "epoch": 0.0931254118163848, + "grad_norm": 2.3502049446105957, + "learning_rate": 1.5501464128843337e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6897379159927368, + "num_tokens": 21947011.0, + "step": 848 + }, + { + "epoch": 0.09323522951899846, + "grad_norm": 2.105394124984741, + "learning_rate": 1.5519765739385067e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6883758306503296, + "num_tokens": 21975974.0, + "step": 849 + }, + { + "epoch": 0.09334504722161212, + "grad_norm": 2.0188634395599365, + "learning_rate": 1.5538067349926793e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6790294647216797, + "num_tokens": 22006803.0, + "step": 850 + }, + { + "epoch": 0.09345486492422579, + "grad_norm": 2.5463244915008545, + "learning_rate": 1.5556368960468523e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7147258520126343, + "num_tokens": 22025860.0, + "step": 851 + }, + { + "epoch": 0.09356468262683945, + "grad_norm": 1.929311990737915, + "learning_rate": 1.557467057101025e-06, + "loss": 1.1495, + "mean_token_accuracy": 0.6606210470199585, + "num_tokens": 22059372.0, + "step": 852 + }, + { + "epoch": 0.09367450032945311, + "grad_norm": 2.2981812953948975, + "learning_rate": 1.5592972181551977e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.6775506138801575, + "num_tokens": 22089197.0, + "step": 853 + }, + { + "epoch": 0.09378431803206677, + "grad_norm": 2.4307332038879395, + "learning_rate": 1.5611273792093705e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6863329410552979, + "num_tokens": 22113436.0, + "step": 854 + }, + { + "epoch": 0.09389413573468043, + "grad_norm": 2.31144118309021, + "learning_rate": 1.5629575402635433e-06, + "loss": 1.1044, + "mean_token_accuracy": 0.6692426204681396, + "num_tokens": 22140959.0, + "step": 855 + }, + { + "epoch": 0.09400395343729409, + "grad_norm": 2.477440595626831, + "learning_rate": 1.564787701317716e-06, + "loss": 1.094, + "mean_token_accuracy": 0.6766085624694824, + "num_tokens": 22164906.0, + "step": 856 + }, + { + "epoch": 0.09411377113990775, + "grad_norm": 2.139523506164551, + "learning_rate": 1.566617862371889e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.7027624845504761, + "num_tokens": 22192383.0, + "step": 857 + }, + { + "epoch": 0.09422358884252141, + "grad_norm": 2.211611747741699, + "learning_rate": 1.5684480234260615e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6893569827079773, + "num_tokens": 22219791.0, + "step": 858 + }, + { + "epoch": 0.09433340654513507, + "grad_norm": 2.012927293777466, + "learning_rate": 1.5702781844802345e-06, + "loss": 1.1176, + "mean_token_accuracy": 0.6721435189247131, + "num_tokens": 22251197.0, + "step": 859 + }, + { + "epoch": 0.09444322424774873, + "grad_norm": 2.169332504272461, + "learning_rate": 1.5721083455344071e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6920056343078613, + "num_tokens": 22278629.0, + "step": 860 + }, + { + "epoch": 0.0945530419503624, + "grad_norm": 2.041694402694702, + "learning_rate": 1.57393850658858e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6866579055786133, + "num_tokens": 22308079.0, + "step": 861 + }, + { + "epoch": 0.09466285965297606, + "grad_norm": 2.4201951026916504, + "learning_rate": 1.5757686676427527e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6863811016082764, + "num_tokens": 22330728.0, + "step": 862 + }, + { + "epoch": 0.09477267735558972, + "grad_norm": 2.162388563156128, + "learning_rate": 1.5775988286969255e-06, + "loss": 1.1234, + "mean_token_accuracy": 0.6685315370559692, + "num_tokens": 22359471.0, + "step": 863 + }, + { + "epoch": 0.09488249505820338, + "grad_norm": 2.2282800674438477, + "learning_rate": 1.579428989751098e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6895480751991272, + "num_tokens": 22384995.0, + "step": 864 + }, + { + "epoch": 0.09499231276081704, + "grad_norm": 2.669743537902832, + "learning_rate": 1.581259150805271e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6847001910209656, + "num_tokens": 22406712.0, + "step": 865 + }, + { + "epoch": 0.0951021304634307, + "grad_norm": 2.09126877784729, + "learning_rate": 1.5830893118594437e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6969508528709412, + "num_tokens": 22435345.0, + "step": 866 + }, + { + "epoch": 0.09521194816604436, + "grad_norm": 2.2980916500091553, + "learning_rate": 1.5849194729136167e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6917616128921509, + "num_tokens": 22461160.0, + "step": 867 + }, + { + "epoch": 0.09532176586865802, + "grad_norm": 2.195108413696289, + "learning_rate": 1.5867496339677893e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6843464374542236, + "num_tokens": 22487928.0, + "step": 868 + }, + { + "epoch": 0.09543158357127168, + "grad_norm": 2.5245890617370605, + "learning_rate": 1.5885797950219619e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6920220851898193, + "num_tokens": 22509315.0, + "step": 869 + }, + { + "epoch": 0.09554140127388536, + "grad_norm": 2.553873062133789, + "learning_rate": 1.5904099560761349e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.6975198984146118, + "num_tokens": 22529006.0, + "step": 870 + }, + { + "epoch": 0.09565121897649902, + "grad_norm": 2.306706190109253, + "learning_rate": 1.5922401171303077e-06, + "loss": 1.1157, + "mean_token_accuracy": 0.6779115200042725, + "num_tokens": 22557356.0, + "step": 871 + }, + { + "epoch": 0.09576103667911268, + "grad_norm": 2.200503349304199, + "learning_rate": 1.5940702781844803e-06, + "loss": 1.1022, + "mean_token_accuracy": 0.6729334592819214, + "num_tokens": 22586580.0, + "step": 872 + }, + { + "epoch": 0.09587085438172634, + "grad_norm": 2.226548910140991, + "learning_rate": 1.5959004392386533e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7166810035705566, + "num_tokens": 22612390.0, + "step": 873 + }, + { + "epoch": 0.09598067208434, + "grad_norm": 2.015007734298706, + "learning_rate": 1.5977306002928259e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6891074776649475, + "num_tokens": 22644702.0, + "step": 874 + }, + { + "epoch": 0.09609048978695366, + "grad_norm": 2.5928127765655518, + "learning_rate": 1.5995607613469989e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7058278322219849, + "num_tokens": 22665790.0, + "step": 875 + }, + { + "epoch": 0.09620030748956732, + "grad_norm": 2.223111152648926, + "learning_rate": 1.6013909224011715e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7228342890739441, + "num_tokens": 22687687.0, + "step": 876 + }, + { + "epoch": 0.09631012519218098, + "grad_norm": 2.3600106239318848, + "learning_rate": 1.603221083455344e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6943204402923584, + "num_tokens": 22713824.0, + "step": 877 + }, + { + "epoch": 0.09641994289479464, + "grad_norm": 2.6737728118896484, + "learning_rate": 1.605051244509517e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.7093530893325806, + "num_tokens": 22734585.0, + "step": 878 + }, + { + "epoch": 0.0965297605974083, + "grad_norm": 2.432178258895874, + "learning_rate": 1.6068814055636896e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7119638919830322, + "num_tokens": 22755740.0, + "step": 879 + }, + { + "epoch": 0.09663957830002197, + "grad_norm": 2.621281623840332, + "learning_rate": 1.6087115666178624e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.709349513053894, + "num_tokens": 22774976.0, + "step": 880 + }, + { + "epoch": 0.09674939600263563, + "grad_norm": 2.069790840148926, + "learning_rate": 1.6105417276720352e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6943182945251465, + "num_tokens": 22805573.0, + "step": 881 + }, + { + "epoch": 0.09685921370524929, + "grad_norm": 2.9274840354919434, + "learning_rate": 1.612371888726208e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6946780681610107, + "num_tokens": 22823349.0, + "step": 882 + }, + { + "epoch": 0.09696903140786295, + "grad_norm": 1.8467448949813843, + "learning_rate": 1.6142020497803808e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.708309531211853, + "num_tokens": 22856808.0, + "step": 883 + }, + { + "epoch": 0.09707884911047661, + "grad_norm": 2.346951484680176, + "learning_rate": 1.6160322108345536e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6931358575820923, + "num_tokens": 22878171.0, + "step": 884 + }, + { + "epoch": 0.09718866681309027, + "grad_norm": 2.176290512084961, + "learning_rate": 1.6178623718887262e-06, + "loss": 1.0795, + "mean_token_accuracy": 0.6807096600532532, + "num_tokens": 22906186.0, + "step": 885 + }, + { + "epoch": 0.09729848451570393, + "grad_norm": 2.2472684383392334, + "learning_rate": 1.6196925329428992e-06, + "loss": 1.1229, + "mean_token_accuracy": 0.6672447323799133, + "num_tokens": 22936226.0, + "step": 886 + }, + { + "epoch": 0.09740830221831759, + "grad_norm": 2.185537815093994, + "learning_rate": 1.6215226939970718e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6873799562454224, + "num_tokens": 22964074.0, + "step": 887 + }, + { + "epoch": 0.09751811992093125, + "grad_norm": 2.569840431213379, + "learning_rate": 1.6233528550512446e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7018556594848633, + "num_tokens": 22982153.0, + "step": 888 + }, + { + "epoch": 0.09762793762354492, + "grad_norm": 2.33624529838562, + "learning_rate": 1.6251830161054174e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7018195986747742, + "num_tokens": 23006506.0, + "step": 889 + }, + { + "epoch": 0.09773775532615858, + "grad_norm": 2.2782106399536133, + "learning_rate": 1.6270131771595902e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6831895112991333, + "num_tokens": 23032488.0, + "step": 890 + }, + { + "epoch": 0.09784757302877224, + "grad_norm": 2.2148449420928955, + "learning_rate": 1.628843338213763e-06, + "loss": 1.1216, + "mean_token_accuracy": 0.6694834232330322, + "num_tokens": 23060198.0, + "step": 891 + }, + { + "epoch": 0.0979573907313859, + "grad_norm": 2.3066318035125732, + "learning_rate": 1.6306734992679358e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6818501949310303, + "num_tokens": 23086773.0, + "step": 892 + }, + { + "epoch": 0.09806720843399956, + "grad_norm": 2.170266628265381, + "learning_rate": 1.6325036603221084e-06, + "loss": 1.0936, + "mean_token_accuracy": 0.6844502687454224, + "num_tokens": 23115599.0, + "step": 893 + }, + { + "epoch": 0.09817702613661322, + "grad_norm": 2.185351848602295, + "learning_rate": 1.6343338213762814e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6754145622253418, + "num_tokens": 23143360.0, + "step": 894 + }, + { + "epoch": 0.09828684383922688, + "grad_norm": 2.228416919708252, + "learning_rate": 1.636163982430454e-06, + "loss": 1.1144, + "mean_token_accuracy": 0.6766352653503418, + "num_tokens": 23171807.0, + "step": 895 + }, + { + "epoch": 0.09839666154184054, + "grad_norm": 2.4552395343780518, + "learning_rate": 1.6379941434846266e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.714665949344635, + "num_tokens": 23192796.0, + "step": 896 + }, + { + "epoch": 0.0985064792444542, + "grad_norm": 2.343296766281128, + "learning_rate": 1.6398243045387996e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7413418889045715, + "num_tokens": 23216285.0, + "step": 897 + }, + { + "epoch": 0.09861629694706786, + "grad_norm": 2.3964405059814453, + "learning_rate": 1.6416544655929722e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6864128708839417, + "num_tokens": 23240520.0, + "step": 898 + }, + { + "epoch": 0.09872611464968153, + "grad_norm": 2.275223731994629, + "learning_rate": 1.6434846266471452e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7037304639816284, + "num_tokens": 23267703.0, + "step": 899 + }, + { + "epoch": 0.0988359323522952, + "grad_norm": 2.4480626583099365, + "learning_rate": 1.6453147877013178e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.7031506299972534, + "num_tokens": 23291581.0, + "step": 900 + }, + { + "epoch": 0.09894575005490885, + "grad_norm": 2.2589290142059326, + "learning_rate": 1.6471449487554906e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6962993144989014, + "num_tokens": 23316744.0, + "step": 901 + }, + { + "epoch": 0.09905556775752251, + "grad_norm": 2.2177417278289795, + "learning_rate": 1.6489751098096634e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6936910152435303, + "num_tokens": 23341595.0, + "step": 902 + }, + { + "epoch": 0.09916538546013617, + "grad_norm": 2.3635058403015137, + "learning_rate": 1.6508052708638362e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.6841267347335815, + "num_tokens": 23368264.0, + "step": 903 + }, + { + "epoch": 0.09927520316274983, + "grad_norm": 2.1652448177337646, + "learning_rate": 1.6526354319180088e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7032550573348999, + "num_tokens": 23393965.0, + "step": 904 + }, + { + "epoch": 0.0993850208653635, + "grad_norm": 2.205472946166992, + "learning_rate": 1.6544655929721818e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6981278657913208, + "num_tokens": 23420238.0, + "step": 905 + }, + { + "epoch": 0.09949483856797715, + "grad_norm": 2.206221342086792, + "learning_rate": 1.6562957540263543e-06, + "loss": 1.1538, + "mean_token_accuracy": 0.6686501502990723, + "num_tokens": 23449489.0, + "step": 906 + }, + { + "epoch": 0.09960465627059081, + "grad_norm": 2.2884953022003174, + "learning_rate": 1.6581259150805274e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6899277567863464, + "num_tokens": 23476124.0, + "step": 907 + }, + { + "epoch": 0.09971447397320449, + "grad_norm": 2.138639211654663, + "learning_rate": 1.6599560761347e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7061219215393066, + "num_tokens": 23503041.0, + "step": 908 + }, + { + "epoch": 0.09982429167581815, + "grad_norm": 2.0865554809570312, + "learning_rate": 1.6617862371888727e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6829385757446289, + "num_tokens": 23533660.0, + "step": 909 + }, + { + "epoch": 0.0999341093784318, + "grad_norm": 2.27158522605896, + "learning_rate": 1.6636163982430455e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.6823568344116211, + "num_tokens": 23561457.0, + "step": 910 + }, + { + "epoch": 0.10004392708104547, + "grad_norm": 2.1433804035186768, + "learning_rate": 1.6654465592972183e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6843112111091614, + "num_tokens": 23588630.0, + "step": 911 + }, + { + "epoch": 0.10015374478365913, + "grad_norm": 2.665236473083496, + "learning_rate": 1.667276720351391e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.698837161064148, + "num_tokens": 23610316.0, + "step": 912 + }, + { + "epoch": 0.10026356248627279, + "grad_norm": 2.4645802974700928, + "learning_rate": 1.669106881405564e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.6656038761138916, + "num_tokens": 23634911.0, + "step": 913 + }, + { + "epoch": 0.10037338018888645, + "grad_norm": 2.4303202629089355, + "learning_rate": 1.6709370424597365e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.7019643783569336, + "num_tokens": 23660093.0, + "step": 914 + }, + { + "epoch": 0.1004831978915001, + "grad_norm": 2.240656614303589, + "learning_rate": 1.6727672035139095e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6956279277801514, + "num_tokens": 23688764.0, + "step": 915 + }, + { + "epoch": 0.10059301559411377, + "grad_norm": 2.2530629634857178, + "learning_rate": 1.6745973645680821e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6868491172790527, + "num_tokens": 23713807.0, + "step": 916 + }, + { + "epoch": 0.10070283329672743, + "grad_norm": 2.12148118019104, + "learning_rate": 1.6764275256222547e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6974269151687622, + "num_tokens": 23741777.0, + "step": 917 + }, + { + "epoch": 0.1008126509993411, + "grad_norm": 2.132396936416626, + "learning_rate": 1.6782576866764277e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6910474896430969, + "num_tokens": 23770522.0, + "step": 918 + }, + { + "epoch": 0.10092246870195476, + "grad_norm": 1.9629111289978027, + "learning_rate": 1.6800878477306003e-06, + "loss": 1.1776, + "mean_token_accuracy": 0.6509077548980713, + "num_tokens": 23806905.0, + "step": 919 + }, + { + "epoch": 0.10103228640456842, + "grad_norm": 2.2161459922790527, + "learning_rate": 1.681918008784773e-06, + "loss": 1.0895, + "mean_token_accuracy": 0.6730324029922485, + "num_tokens": 23835245.0, + "step": 920 + }, + { + "epoch": 0.10114210410718208, + "grad_norm": 1.9162266254425049, + "learning_rate": 1.6837481698389461e-06, + "loss": 1.1274, + "mean_token_accuracy": 0.6712036728858948, + "num_tokens": 23872637.0, + "step": 921 + }, + { + "epoch": 0.10125192180979574, + "grad_norm": 2.3308310508728027, + "learning_rate": 1.6855783308931187e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6845465302467346, + "num_tokens": 23896074.0, + "step": 922 + }, + { + "epoch": 0.1013617395124094, + "grad_norm": 2.4518916606903076, + "learning_rate": 1.6874084919472917e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6895356178283691, + "num_tokens": 23920186.0, + "step": 923 + }, + { + "epoch": 0.10147155721502306, + "grad_norm": 2.008784532546997, + "learning_rate": 1.6892386530014643e-06, + "loss": 1.1075, + "mean_token_accuracy": 0.6705164313316345, + "num_tokens": 23950460.0, + "step": 924 + }, + { + "epoch": 0.10158137491763672, + "grad_norm": 2.410231113433838, + "learning_rate": 1.6910688140556369e-06, + "loss": 1.0949, + "mean_token_accuracy": 0.6758481860160828, + "num_tokens": 23974631.0, + "step": 925 + }, + { + "epoch": 0.10169119262025038, + "grad_norm": 2.357999801635742, + "learning_rate": 1.6928989751098099e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7039762735366821, + "num_tokens": 23996293.0, + "step": 926 + }, + { + "epoch": 0.10180101032286405, + "grad_norm": 2.230713367462158, + "learning_rate": 1.6947291361639825e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6877454519271851, + "num_tokens": 24023024.0, + "step": 927 + }, + { + "epoch": 0.10191082802547771, + "grad_norm": 2.6065988540649414, + "learning_rate": 1.6965592972181553e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6915557384490967, + "num_tokens": 24043550.0, + "step": 928 + }, + { + "epoch": 0.10202064572809137, + "grad_norm": 1.998787760734558, + "learning_rate": 1.698389458272328e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6840627193450928, + "num_tokens": 24076680.0, + "step": 929 + }, + { + "epoch": 0.10213046343070503, + "grad_norm": 2.1134231090545654, + "learning_rate": 1.7002196193265009e-06, + "loss": 1.0878, + "mean_token_accuracy": 0.6804203391075134, + "num_tokens": 24105417.0, + "step": 930 + }, + { + "epoch": 0.10224028113331869, + "grad_norm": 2.502505302429199, + "learning_rate": 1.7020497803806737e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7149083614349365, + "num_tokens": 24126038.0, + "step": 931 + }, + { + "epoch": 0.10235009883593235, + "grad_norm": 1.942708134651184, + "learning_rate": 1.7038799414348465e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6928018927574158, + "num_tokens": 24158706.0, + "step": 932 + }, + { + "epoch": 0.10245991653854601, + "grad_norm": 2.1554107666015625, + "learning_rate": 1.705710102489019e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6768396496772766, + "num_tokens": 24187899.0, + "step": 933 + }, + { + "epoch": 0.10256973424115967, + "grad_norm": 2.2665812969207764, + "learning_rate": 1.707540263543192e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6896227598190308, + "num_tokens": 24213383.0, + "step": 934 + }, + { + "epoch": 0.10267955194377333, + "grad_norm": 2.370673418045044, + "learning_rate": 1.7093704245973647e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6915130615234375, + "num_tokens": 24238308.0, + "step": 935 + }, + { + "epoch": 0.10278936964638699, + "grad_norm": 1.8873796463012695, + "learning_rate": 1.7112005856515375e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6979165077209473, + "num_tokens": 24272634.0, + "step": 936 + }, + { + "epoch": 0.10289918734900066, + "grad_norm": 2.2213528156280518, + "learning_rate": 1.7130307467057103e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6931500434875488, + "num_tokens": 24298630.0, + "step": 937 + }, + { + "epoch": 0.10300900505161432, + "grad_norm": 2.1758460998535156, + "learning_rate": 1.714860907759883e-06, + "loss": 1.093, + "mean_token_accuracy": 0.6749221086502075, + "num_tokens": 24328976.0, + "step": 938 + }, + { + "epoch": 0.10311882275422798, + "grad_norm": 2.10555362701416, + "learning_rate": 1.7166910688140558e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7122087478637695, + "num_tokens": 24356450.0, + "step": 939 + }, + { + "epoch": 0.10322864045684164, + "grad_norm": 2.5331332683563232, + "learning_rate": 1.7185212298682286e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.7029217481613159, + "num_tokens": 24379121.0, + "step": 940 + }, + { + "epoch": 0.1033384581594553, + "grad_norm": 2.2858047485351562, + "learning_rate": 1.7203513909224012e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6946136951446533, + "num_tokens": 24403892.0, + "step": 941 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 2.2695250511169434, + "learning_rate": 1.7221815519765742e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6918835639953613, + "num_tokens": 24428395.0, + "step": 942 + }, + { + "epoch": 0.10355809356468262, + "grad_norm": 2.205864429473877, + "learning_rate": 1.7240117130307468e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6868416666984558, + "num_tokens": 24456113.0, + "step": 943 + }, + { + "epoch": 0.10366791126729628, + "grad_norm": 2.2875256538391113, + "learning_rate": 1.7258418740849194e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6838675141334534, + "num_tokens": 24482051.0, + "step": 944 + }, + { + "epoch": 0.10377772896990994, + "grad_norm": 2.343754768371582, + "learning_rate": 1.7276720351390924e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.691798985004425, + "num_tokens": 24506021.0, + "step": 945 + }, + { + "epoch": 0.10388754667252362, + "grad_norm": 2.1635942459106445, + "learning_rate": 1.729502196193265e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7118044495582581, + "num_tokens": 24532440.0, + "step": 946 + }, + { + "epoch": 0.10399736437513728, + "grad_norm": 2.4030330181121826, + "learning_rate": 1.731332357247438e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6876935958862305, + "num_tokens": 24554667.0, + "step": 947 + }, + { + "epoch": 0.10410718207775094, + "grad_norm": 2.1779274940490723, + "learning_rate": 1.7331625183016106e-06, + "loss": 1.1168, + "mean_token_accuracy": 0.6685634851455688, + "num_tokens": 24584828.0, + "step": 948 + }, + { + "epoch": 0.1042169997803646, + "grad_norm": 2.4060091972351074, + "learning_rate": 1.7349926793557834e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7073323726654053, + "num_tokens": 24607482.0, + "step": 949 + }, + { + "epoch": 0.10432681748297826, + "grad_norm": 2.3824503421783447, + "learning_rate": 1.7368228404099562e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.704155445098877, + "num_tokens": 24630783.0, + "step": 950 + }, + { + "epoch": 0.10443663518559192, + "grad_norm": 2.1985955238342285, + "learning_rate": 1.738653001464129e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6825018525123596, + "num_tokens": 24658380.0, + "step": 951 + }, + { + "epoch": 0.10454645288820558, + "grad_norm": 2.3213961124420166, + "learning_rate": 1.7404831625183016e-06, + "loss": 1.1343, + "mean_token_accuracy": 0.6742069721221924, + "num_tokens": 24683759.0, + "step": 952 + }, + { + "epoch": 0.10465627059081924, + "grad_norm": 2.2280421257019043, + "learning_rate": 1.7423133235724746e-06, + "loss": 1.0989, + "mean_token_accuracy": 0.6729410886764526, + "num_tokens": 24713021.0, + "step": 953 + }, + { + "epoch": 0.1047660882934329, + "grad_norm": 2.4513325691223145, + "learning_rate": 1.7441434846266472e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6956996917724609, + "num_tokens": 24735990.0, + "step": 954 + }, + { + "epoch": 0.10487590599604656, + "grad_norm": 2.250469923019409, + "learning_rate": 1.7459736456808202e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7041444182395935, + "num_tokens": 24762414.0, + "step": 955 + }, + { + "epoch": 0.10498572369866023, + "grad_norm": 2.077455520629883, + "learning_rate": 1.7478038067349928e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7065999507904053, + "num_tokens": 24790695.0, + "step": 956 + }, + { + "epoch": 0.10509554140127389, + "grad_norm": 2.341085195541382, + "learning_rate": 1.7496339677891656e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.6877012252807617, + "num_tokens": 24814783.0, + "step": 957 + }, + { + "epoch": 0.10520535910388755, + "grad_norm": 2.419090747833252, + "learning_rate": 1.7514641288433384e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6899106502532959, + "num_tokens": 24837586.0, + "step": 958 + }, + { + "epoch": 0.10531517680650121, + "grad_norm": 2.1141717433929443, + "learning_rate": 1.7532942898975112e-06, + "loss": 1.1015, + "mean_token_accuracy": 0.6816953420639038, + "num_tokens": 24868721.0, + "step": 959 + }, + { + "epoch": 0.10542499450911487, + "grad_norm": 2.5764694213867188, + "learning_rate": 1.7551244509516838e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.7008141279220581, + "num_tokens": 24889567.0, + "step": 960 + }, + { + "epoch": 0.10553481221172853, + "grad_norm": 1.9975709915161133, + "learning_rate": 1.7569546120058568e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6856292486190796, + "num_tokens": 24921608.0, + "step": 961 + }, + { + "epoch": 0.10564462991434219, + "grad_norm": 2.325071334838867, + "learning_rate": 1.7587847730600294e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7067358493804932, + "num_tokens": 24947974.0, + "step": 962 + }, + { + "epoch": 0.10575444761695585, + "grad_norm": 2.4417030811309814, + "learning_rate": 1.7606149341142024e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7057479619979858, + "num_tokens": 24968040.0, + "step": 963 + }, + { + "epoch": 0.10586426531956951, + "grad_norm": 2.397829294204712, + "learning_rate": 1.762445095168375e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.697720468044281, + "num_tokens": 24993080.0, + "step": 964 + }, + { + "epoch": 0.10597408302218318, + "grad_norm": 2.4111452102661133, + "learning_rate": 1.7642752562225475e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6935697793960571, + "num_tokens": 25021014.0, + "step": 965 + }, + { + "epoch": 0.10608390072479684, + "grad_norm": 2.106088161468506, + "learning_rate": 1.7661054172767206e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6849572658538818, + "num_tokens": 25049149.0, + "step": 966 + }, + { + "epoch": 0.1061937184274105, + "grad_norm": 2.1979713439941406, + "learning_rate": 1.7679355783308931e-06, + "loss": 1.1799, + "mean_token_accuracy": 0.653618335723877, + "num_tokens": 25078929.0, + "step": 967 + }, + { + "epoch": 0.10630353613002416, + "grad_norm": 2.1970062255859375, + "learning_rate": 1.769765739385066e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6843055486679077, + "num_tokens": 25104629.0, + "step": 968 + }, + { + "epoch": 0.10641335383263782, + "grad_norm": 2.0151302814483643, + "learning_rate": 1.771595900439239e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6996591687202454, + "num_tokens": 25136869.0, + "step": 969 + }, + { + "epoch": 0.10652317153525148, + "grad_norm": 2.477684736251831, + "learning_rate": 1.7734260614934115e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6861941814422607, + "num_tokens": 25160141.0, + "step": 970 + }, + { + "epoch": 0.10663298923786514, + "grad_norm": 2.437593460083008, + "learning_rate": 1.7752562225475845e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7064831256866455, + "num_tokens": 25181800.0, + "step": 971 + }, + { + "epoch": 0.1067428069404788, + "grad_norm": 1.9867595434188843, + "learning_rate": 1.7770863836017571e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6931901574134827, + "num_tokens": 25211977.0, + "step": 972 + }, + { + "epoch": 0.10685262464309246, + "grad_norm": 2.6140661239624023, + "learning_rate": 1.7789165446559297e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6942983865737915, + "num_tokens": 25230967.0, + "step": 973 + }, + { + "epoch": 0.10696244234570612, + "grad_norm": 2.3246688842773438, + "learning_rate": 1.7807467057101027e-06, + "loss": 1.0887, + "mean_token_accuracy": 0.6801620721817017, + "num_tokens": 25256694.0, + "step": 974 + }, + { + "epoch": 0.1070722600483198, + "grad_norm": 2.5868005752563477, + "learning_rate": 1.7825768667642753e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7094493508338928, + "num_tokens": 25276933.0, + "step": 975 + }, + { + "epoch": 0.10718207775093345, + "grad_norm": 2.8547942638397217, + "learning_rate": 1.7844070278184481e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.707186222076416, + "num_tokens": 25296636.0, + "step": 976 + }, + { + "epoch": 0.10729189545354711, + "grad_norm": 2.3319177627563477, + "learning_rate": 1.786237188872621e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7088027000427246, + "num_tokens": 25319852.0, + "step": 977 + }, + { + "epoch": 0.10740171315616077, + "grad_norm": 2.2971696853637695, + "learning_rate": 1.7880673499267937e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6878591179847717, + "num_tokens": 25346226.0, + "step": 978 + }, + { + "epoch": 0.10751153085877443, + "grad_norm": 2.3355772495269775, + "learning_rate": 1.7898975109809665e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.694022536277771, + "num_tokens": 25371463.0, + "step": 979 + }, + { + "epoch": 0.1076213485613881, + "grad_norm": 2.4004456996917725, + "learning_rate": 1.7917276720351393e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6963130235671997, + "num_tokens": 25398042.0, + "step": 980 + }, + { + "epoch": 0.10773116626400175, + "grad_norm": 2.694127321243286, + "learning_rate": 1.793557833089312e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7190319299697876, + "num_tokens": 25417655.0, + "step": 981 + }, + { + "epoch": 0.10784098396661541, + "grad_norm": 2.4773879051208496, + "learning_rate": 1.795387994143485e-06, + "loss": 1.128, + "mean_token_accuracy": 0.6691261529922485, + "num_tokens": 25440229.0, + "step": 982 + }, + { + "epoch": 0.10795080166922907, + "grad_norm": 2.223289728164673, + "learning_rate": 1.7972181551976575e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7009642124176025, + "num_tokens": 25464785.0, + "step": 983 + }, + { + "epoch": 0.10806061937184275, + "grad_norm": 2.2653427124023438, + "learning_rate": 1.79904831625183e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7104071378707886, + "num_tokens": 25490337.0, + "step": 984 + }, + { + "epoch": 0.10817043707445641, + "grad_norm": 2.283911943435669, + "learning_rate": 1.800878477306003e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6840249300003052, + "num_tokens": 25518675.0, + "step": 985 + }, + { + "epoch": 0.10828025477707007, + "grad_norm": 2.242759943008423, + "learning_rate": 1.8027086383601759e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.696385383605957, + "num_tokens": 25543790.0, + "step": 986 + }, + { + "epoch": 0.10839007247968373, + "grad_norm": 2.177098274230957, + "learning_rate": 1.8045387994143487e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7046478390693665, + "num_tokens": 25571684.0, + "step": 987 + }, + { + "epoch": 0.10849989018229739, + "grad_norm": 2.4028053283691406, + "learning_rate": 1.8063689604685215e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6987757682800293, + "num_tokens": 25595520.0, + "step": 988 + }, + { + "epoch": 0.10860970788491105, + "grad_norm": 2.27712082862854, + "learning_rate": 1.808199121522694e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7038557529449463, + "num_tokens": 25620669.0, + "step": 989 + }, + { + "epoch": 0.1087195255875247, + "grad_norm": 2.105752944946289, + "learning_rate": 1.810029282576867e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7024929523468018, + "num_tokens": 25649831.0, + "step": 990 + }, + { + "epoch": 0.10882934329013837, + "grad_norm": 2.474118947982788, + "learning_rate": 1.8118594436310397e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7130299806594849, + "num_tokens": 25673297.0, + "step": 991 + }, + { + "epoch": 0.10893916099275203, + "grad_norm": 2.559695243835449, + "learning_rate": 1.8136896046852123e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7057952284812927, + "num_tokens": 25694614.0, + "step": 992 + }, + { + "epoch": 0.10904897869536569, + "grad_norm": 2.3917465209960938, + "learning_rate": 1.8155197657393853e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7218557000160217, + "num_tokens": 25715016.0, + "step": 993 + }, + { + "epoch": 0.10915879639797936, + "grad_norm": 2.2790989875793457, + "learning_rate": 1.8173499267935579e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.685847818851471, + "num_tokens": 25741186.0, + "step": 994 + }, + { + "epoch": 0.10926861410059302, + "grad_norm": 2.212286949157715, + "learning_rate": 1.8191800878477309e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6826033592224121, + "num_tokens": 25769915.0, + "step": 995 + }, + { + "epoch": 0.10937843180320668, + "grad_norm": 2.3993098735809326, + "learning_rate": 1.8210102489019034e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.6787094473838806, + "num_tokens": 25794265.0, + "step": 996 + }, + { + "epoch": 0.10948824950582034, + "grad_norm": 2.476836919784546, + "learning_rate": 1.8228404099560762e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7070509791374207, + "num_tokens": 25814876.0, + "step": 997 + }, + { + "epoch": 0.109598067208434, + "grad_norm": 2.0749893188476562, + "learning_rate": 1.824670571010249e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7096327543258667, + "num_tokens": 25843844.0, + "step": 998 + }, + { + "epoch": 0.10970788491104766, + "grad_norm": 2.1774613857269287, + "learning_rate": 1.8265007320644218e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6859779357910156, + "num_tokens": 25870380.0, + "step": 999 + }, + { + "epoch": 0.10981770261366132, + "grad_norm": 2.2997500896453857, + "learning_rate": 1.8283308931185944e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6755933165550232, + "num_tokens": 25896923.0, + "step": 1000 + }, + { + "epoch": 0.10992752031627498, + "grad_norm": 2.51408052444458, + "learning_rate": 1.8301610541727674e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7125355005264282, + "num_tokens": 25918175.0, + "step": 1001 + }, + { + "epoch": 0.11003733801888864, + "grad_norm": 2.2232167720794678, + "learning_rate": 1.83199121522694e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.7043442726135254, + "num_tokens": 25946418.0, + "step": 1002 + }, + { + "epoch": 0.11014715572150231, + "grad_norm": 1.9260404109954834, + "learning_rate": 1.833821376281113e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6781231164932251, + "num_tokens": 25980129.0, + "step": 1003 + }, + { + "epoch": 0.11025697342411597, + "grad_norm": 2.260749578475952, + "learning_rate": 1.8356515373352856e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7042335271835327, + "num_tokens": 26006422.0, + "step": 1004 + }, + { + "epoch": 0.11036679112672963, + "grad_norm": 2.3011813163757324, + "learning_rate": 1.8374816983894584e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6947512626647949, + "num_tokens": 26031934.0, + "step": 1005 + }, + { + "epoch": 0.11047660882934329, + "grad_norm": 2.1728949546813965, + "learning_rate": 1.8393118594436312e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6923262476921082, + "num_tokens": 26061638.0, + "step": 1006 + }, + { + "epoch": 0.11058642653195695, + "grad_norm": 2.323777914047241, + "learning_rate": 1.841142020497804e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.69184410572052, + "num_tokens": 26085980.0, + "step": 1007 + }, + { + "epoch": 0.11069624423457061, + "grad_norm": 2.391585111618042, + "learning_rate": 1.8429721815519766e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.6989377737045288, + "num_tokens": 26107689.0, + "step": 1008 + }, + { + "epoch": 0.11080606193718427, + "grad_norm": 2.089768886566162, + "learning_rate": 1.8448023426061496e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7047703266143799, + "num_tokens": 26134153.0, + "step": 1009 + }, + { + "epoch": 0.11091587963979793, + "grad_norm": 2.1988472938537598, + "learning_rate": 1.8466325036603222e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.707060694694519, + "num_tokens": 26161698.0, + "step": 1010 + }, + { + "epoch": 0.11102569734241159, + "grad_norm": 2.3280887603759766, + "learning_rate": 1.8484626647144952e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6960054636001587, + "num_tokens": 26185559.0, + "step": 1011 + }, + { + "epoch": 0.11113551504502525, + "grad_norm": 2.1971004009246826, + "learning_rate": 1.8502928257686678e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6879299879074097, + "num_tokens": 26214132.0, + "step": 1012 + }, + { + "epoch": 0.11124533274763893, + "grad_norm": 2.1254818439483643, + "learning_rate": 1.8521229868228404e-06, + "loss": 1.0987, + "mean_token_accuracy": 0.6724522113800049, + "num_tokens": 26244062.0, + "step": 1013 + }, + { + "epoch": 0.11135515045025259, + "grad_norm": 2.382174253463745, + "learning_rate": 1.8539531478770134e-06, + "loss": 1.0777, + "mean_token_accuracy": 0.6861237287521362, + "num_tokens": 26267542.0, + "step": 1014 + }, + { + "epoch": 0.11146496815286625, + "grad_norm": 2.171907663345337, + "learning_rate": 1.855783308931186e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6885989308357239, + "num_tokens": 26294549.0, + "step": 1015 + }, + { + "epoch": 0.1115747858554799, + "grad_norm": 2.3194220066070557, + "learning_rate": 1.8576134699853588e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7041941285133362, + "num_tokens": 26316171.0, + "step": 1016 + }, + { + "epoch": 0.11168460355809356, + "grad_norm": 2.2940821647644043, + "learning_rate": 1.8594436310395316e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6913393139839172, + "num_tokens": 26340899.0, + "step": 1017 + }, + { + "epoch": 0.11179442126070722, + "grad_norm": 2.218890428543091, + "learning_rate": 1.8612737920937044e-06, + "loss": 1.0907, + "mean_token_accuracy": 0.6767852306365967, + "num_tokens": 26367914.0, + "step": 1018 + }, + { + "epoch": 0.11190423896332088, + "grad_norm": 2.1174304485321045, + "learning_rate": 1.8631039531478774e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7047536969184875, + "num_tokens": 26394660.0, + "step": 1019 + }, + { + "epoch": 0.11201405666593454, + "grad_norm": 2.3186447620391846, + "learning_rate": 1.86493411420205e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6909184455871582, + "num_tokens": 26418397.0, + "step": 1020 + }, + { + "epoch": 0.1121238743685482, + "grad_norm": 2.093996286392212, + "learning_rate": 1.8667642752562226e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6916601657867432, + "num_tokens": 26447217.0, + "step": 1021 + }, + { + "epoch": 0.11223369207116188, + "grad_norm": 2.483214855194092, + "learning_rate": 1.8685944363103956e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6938408017158508, + "num_tokens": 26468995.0, + "step": 1022 + }, + { + "epoch": 0.11234350977377554, + "grad_norm": 2.118811845779419, + "learning_rate": 1.8704245973645682e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.709631085395813, + "num_tokens": 26496512.0, + "step": 1023 + }, + { + "epoch": 0.1124533274763892, + "grad_norm": 2.1446425914764404, + "learning_rate": 1.872254758418741e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7199783325195312, + "num_tokens": 26522427.0, + "step": 1024 + }, + { + "epoch": 0.11256314517900286, + "grad_norm": 2.322779417037964, + "learning_rate": 1.8740849194729138e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6956804990768433, + "num_tokens": 26546416.0, + "step": 1025 + }, + { + "epoch": 0.11267296288161652, + "grad_norm": 2.210207462310791, + "learning_rate": 1.8759150805270866e-06, + "loss": 1.0825, + "mean_token_accuracy": 0.6825430393218994, + "num_tokens": 26575413.0, + "step": 1026 + }, + { + "epoch": 0.11278278058423018, + "grad_norm": 2.3951900005340576, + "learning_rate": 1.8777452415812594e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7012231349945068, + "num_tokens": 26597463.0, + "step": 1027 + }, + { + "epoch": 0.11289259828684384, + "grad_norm": 2.301001787185669, + "learning_rate": 1.8795754026354321e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.689923882484436, + "num_tokens": 26622641.0, + "step": 1028 + }, + { + "epoch": 0.1130024159894575, + "grad_norm": 2.221994400024414, + "learning_rate": 1.8814055636896047e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6976871490478516, + "num_tokens": 26648539.0, + "step": 1029 + }, + { + "epoch": 0.11311223369207116, + "grad_norm": 2.143458366394043, + "learning_rate": 1.8832357247437777e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6934418678283691, + "num_tokens": 26677825.0, + "step": 1030 + }, + { + "epoch": 0.11322205139468482, + "grad_norm": 2.400158405303955, + "learning_rate": 1.8850658857979503e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6805144548416138, + "num_tokens": 26701513.0, + "step": 1031 + }, + { + "epoch": 0.11333186909729849, + "grad_norm": 2.1708831787109375, + "learning_rate": 1.886896046852123e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6971462368965149, + "num_tokens": 26731403.0, + "step": 1032 + }, + { + "epoch": 0.11344168679991215, + "grad_norm": 2.4700961112976074, + "learning_rate": 1.888726207906296e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7008602619171143, + "num_tokens": 26754092.0, + "step": 1033 + }, + { + "epoch": 0.11355150450252581, + "grad_norm": 2.1632513999938965, + "learning_rate": 1.8905563689604687e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7137168645858765, + "num_tokens": 26779009.0, + "step": 1034 + }, + { + "epoch": 0.11366132220513947, + "grad_norm": 2.168138027191162, + "learning_rate": 1.8923865300146415e-06, + "loss": 1.1488, + "mean_token_accuracy": 0.659452497959137, + "num_tokens": 26808069.0, + "step": 1035 + }, + { + "epoch": 0.11377113990775313, + "grad_norm": 2.340667247772217, + "learning_rate": 1.8942166910688143e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.692183256149292, + "num_tokens": 26833122.0, + "step": 1036 + }, + { + "epoch": 0.11388095761036679, + "grad_norm": 2.4492595195770264, + "learning_rate": 1.896046852122987e-06, + "loss": 1.1304, + "mean_token_accuracy": 0.6721964478492737, + "num_tokens": 26858942.0, + "step": 1037 + }, + { + "epoch": 0.11399077531298045, + "grad_norm": 2.338383197784424, + "learning_rate": 1.89787701317716e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7082839012145996, + "num_tokens": 26882608.0, + "step": 1038 + }, + { + "epoch": 0.11410059301559411, + "grad_norm": 2.230579376220703, + "learning_rate": 1.8997071742313325e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6974510550498962, + "num_tokens": 26906830.0, + "step": 1039 + }, + { + "epoch": 0.11421041071820777, + "grad_norm": 2.2141480445861816, + "learning_rate": 1.901537335285505e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6978576183319092, + "num_tokens": 26934123.0, + "step": 1040 + }, + { + "epoch": 0.11432022842082144, + "grad_norm": 2.331413745880127, + "learning_rate": 1.903367496339678e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6887853145599365, + "num_tokens": 26957794.0, + "step": 1041 + }, + { + "epoch": 0.1144300461234351, + "grad_norm": 2.1882715225219727, + "learning_rate": 1.9051976573938507e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6986681818962097, + "num_tokens": 26983761.0, + "step": 1042 + }, + { + "epoch": 0.11453986382604876, + "grad_norm": 2.382802963256836, + "learning_rate": 1.9070278184480237e-06, + "loss": 1.0874, + "mean_token_accuracy": 0.6788634061813354, + "num_tokens": 27008966.0, + "step": 1043 + }, + { + "epoch": 0.11464968152866242, + "grad_norm": 2.450862407684326, + "learning_rate": 1.9088579795021963e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6857289671897888, + "num_tokens": 27032798.0, + "step": 1044 + }, + { + "epoch": 0.11475949923127608, + "grad_norm": 2.2659714221954346, + "learning_rate": 1.910688140556369e-06, + "loss": 1.1009, + "mean_token_accuracy": 0.680162250995636, + "num_tokens": 27060624.0, + "step": 1045 + }, + { + "epoch": 0.11486931693388974, + "grad_norm": 1.9618635177612305, + "learning_rate": 1.912518301610542e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7035270929336548, + "num_tokens": 27093437.0, + "step": 1046 + }, + { + "epoch": 0.1149791346365034, + "grad_norm": 2.060229539871216, + "learning_rate": 1.9143484626647145e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6796585321426392, + "num_tokens": 27122871.0, + "step": 1047 + }, + { + "epoch": 0.11508895233911706, + "grad_norm": 2.173996686935425, + "learning_rate": 1.9161786237188875e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7002465724945068, + "num_tokens": 27148087.0, + "step": 1048 + }, + { + "epoch": 0.11519877004173072, + "grad_norm": 2.53109073638916, + "learning_rate": 1.91800878477306e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6934319734573364, + "num_tokens": 27173516.0, + "step": 1049 + }, + { + "epoch": 0.11530858774434438, + "grad_norm": 2.1651604175567627, + "learning_rate": 1.919838945827233e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6898962259292603, + "num_tokens": 27201861.0, + "step": 1050 + }, + { + "epoch": 0.11541840544695806, + "grad_norm": 2.008450746536255, + "learning_rate": 1.9216691068814057e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7132295370101929, + "num_tokens": 27232068.0, + "step": 1051 + }, + { + "epoch": 0.11552822314957172, + "grad_norm": 2.3441357612609863, + "learning_rate": 1.9234992679355787e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.707133948802948, + "num_tokens": 27256455.0, + "step": 1052 + }, + { + "epoch": 0.11563804085218538, + "grad_norm": 1.9277862310409546, + "learning_rate": 1.9253294289897513e-06, + "loss": 1.11, + "mean_token_accuracy": 0.6793738603591919, + "num_tokens": 27289813.0, + "step": 1053 + }, + { + "epoch": 0.11574785855479904, + "grad_norm": 2.25246262550354, + "learning_rate": 1.9271595900439243e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6998156309127808, + "num_tokens": 27316845.0, + "step": 1054 + }, + { + "epoch": 0.1158576762574127, + "grad_norm": 2.5441043376922607, + "learning_rate": 1.928989751098097e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7069264650344849, + "num_tokens": 27335786.0, + "step": 1055 + }, + { + "epoch": 0.11596749396002635, + "grad_norm": 2.312333583831787, + "learning_rate": 1.9308199121522694e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7093783020973206, + "num_tokens": 27358915.0, + "step": 1056 + }, + { + "epoch": 0.11607731166264001, + "grad_norm": 2.6660900115966797, + "learning_rate": 1.9326500732064425e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6951675415039062, + "num_tokens": 27378640.0, + "step": 1057 + }, + { + "epoch": 0.11618712936525367, + "grad_norm": 1.9705004692077637, + "learning_rate": 1.934480234260615e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7246311902999878, + "num_tokens": 27406497.0, + "step": 1058 + }, + { + "epoch": 0.11629694706786733, + "grad_norm": 2.2746238708496094, + "learning_rate": 1.936310395314788e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.704575777053833, + "num_tokens": 27429740.0, + "step": 1059 + }, + { + "epoch": 0.11640676477048101, + "grad_norm": 2.215670347213745, + "learning_rate": 1.9381405563689606e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6943389773368835, + "num_tokens": 27453491.0, + "step": 1060 + }, + { + "epoch": 0.11651658247309467, + "grad_norm": 2.219334125518799, + "learning_rate": 1.9399707174231332e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6935729384422302, + "num_tokens": 27482883.0, + "step": 1061 + }, + { + "epoch": 0.11662640017570833, + "grad_norm": 2.2947847843170166, + "learning_rate": 1.9418008784773062e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6938014626502991, + "num_tokens": 27508021.0, + "step": 1062 + }, + { + "epoch": 0.11673621787832199, + "grad_norm": 2.0276782512664795, + "learning_rate": 1.943631039531479e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6933159232139587, + "num_tokens": 27540182.0, + "step": 1063 + }, + { + "epoch": 0.11684603558093565, + "grad_norm": 2.2139968872070312, + "learning_rate": 1.9454612005856514e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6900852918624878, + "num_tokens": 27565637.0, + "step": 1064 + }, + { + "epoch": 0.11695585328354931, + "grad_norm": 1.9727627038955688, + "learning_rate": 1.9472913616398244e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.699055552482605, + "num_tokens": 27598960.0, + "step": 1065 + }, + { + "epoch": 0.11706567098616297, + "grad_norm": 2.3489999771118164, + "learning_rate": 1.949121522693997e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6991604566574097, + "num_tokens": 27624292.0, + "step": 1066 + }, + { + "epoch": 0.11717548868877663, + "grad_norm": 2.1532156467437744, + "learning_rate": 1.95095168374817e-06, + "loss": 1.1091, + "mean_token_accuracy": 0.676337480545044, + "num_tokens": 27652104.0, + "step": 1067 + }, + { + "epoch": 0.11728530639139029, + "grad_norm": 2.446214199066162, + "learning_rate": 1.9527818448023426e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7038745284080505, + "num_tokens": 27675585.0, + "step": 1068 + }, + { + "epoch": 0.11739512409400395, + "grad_norm": 2.3833463191986084, + "learning_rate": 1.9546120058565156e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6997545957565308, + "num_tokens": 27697871.0, + "step": 1069 + }, + { + "epoch": 0.11750494179661762, + "grad_norm": 2.3606717586517334, + "learning_rate": 1.956442166910688e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7066409587860107, + "num_tokens": 27722419.0, + "step": 1070 + }, + { + "epoch": 0.11761475949923128, + "grad_norm": 2.248912811279297, + "learning_rate": 1.958272327964861e-06, + "loss": 1.0756, + "mean_token_accuracy": 0.6787667870521545, + "num_tokens": 27749658.0, + "step": 1071 + }, + { + "epoch": 0.11772457720184494, + "grad_norm": 2.395555257797241, + "learning_rate": 1.960102489019034e-06, + "loss": 1.0858, + "mean_token_accuracy": 0.6816682815551758, + "num_tokens": 27775556.0, + "step": 1072 + }, + { + "epoch": 0.1178343949044586, + "grad_norm": 2.175070285797119, + "learning_rate": 1.961932650073207e-06, + "loss": 1.0808, + "mean_token_accuracy": 0.6756857633590698, + "num_tokens": 27802975.0, + "step": 1073 + }, + { + "epoch": 0.11794421260707226, + "grad_norm": 2.3812038898468018, + "learning_rate": 1.9637628111273794e-06, + "loss": 1.1068, + "mean_token_accuracy": 0.6739549040794373, + "num_tokens": 27827101.0, + "step": 1074 + }, + { + "epoch": 0.11805403030968592, + "grad_norm": 2.5437817573547363, + "learning_rate": 1.9655929721815524e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7032386064529419, + "num_tokens": 27848508.0, + "step": 1075 + }, + { + "epoch": 0.11816384801229958, + "grad_norm": 2.1240975856781006, + "learning_rate": 1.967423133235725e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7002719640731812, + "num_tokens": 27876958.0, + "step": 1076 + }, + { + "epoch": 0.11827366571491324, + "grad_norm": 2.350580930709839, + "learning_rate": 1.9692532942898976e-06, + "loss": 1.0862, + "mean_token_accuracy": 0.6811988353729248, + "num_tokens": 27901150.0, + "step": 1077 + }, + { + "epoch": 0.1183834834175269, + "grad_norm": 2.0694987773895264, + "learning_rate": 1.9710834553440706e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7177320718765259, + "num_tokens": 27929112.0, + "step": 1078 + }, + { + "epoch": 0.11849330112014057, + "grad_norm": 2.096269369125366, + "learning_rate": 1.972913616398243e-06, + "loss": 1.1029, + "mean_token_accuracy": 0.673615038394928, + "num_tokens": 27958193.0, + "step": 1079 + }, + { + "epoch": 0.11860311882275423, + "grad_norm": 2.362203359603882, + "learning_rate": 1.9747437774524158e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6889214515686035, + "num_tokens": 27983248.0, + "step": 1080 + }, + { + "epoch": 0.1187129365253679, + "grad_norm": 2.3301286697387695, + "learning_rate": 1.9765739385065888e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6928797960281372, + "num_tokens": 28006461.0, + "step": 1081 + }, + { + "epoch": 0.11882275422798155, + "grad_norm": 2.5222041606903076, + "learning_rate": 1.9784040995607614e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6734554171562195, + "num_tokens": 28027541.0, + "step": 1082 + }, + { + "epoch": 0.11893257193059521, + "grad_norm": 2.2934303283691406, + "learning_rate": 1.9802342606149344e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7088513374328613, + "num_tokens": 28050626.0, + "step": 1083 + }, + { + "epoch": 0.11904238963320887, + "grad_norm": 2.3869073390960693, + "learning_rate": 1.982064421669107e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7064129114151001, + "num_tokens": 28074116.0, + "step": 1084 + }, + { + "epoch": 0.11915220733582253, + "grad_norm": 2.4842309951782227, + "learning_rate": 1.9838945827232795e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7060992121696472, + "num_tokens": 28096727.0, + "step": 1085 + }, + { + "epoch": 0.11926202503843619, + "grad_norm": 2.301386594772339, + "learning_rate": 1.9857247437774525e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6958421468734741, + "num_tokens": 28122639.0, + "step": 1086 + }, + { + "epoch": 0.11937184274104985, + "grad_norm": 2.4786524772644043, + "learning_rate": 1.987554904831625e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6936922073364258, + "num_tokens": 28145817.0, + "step": 1087 + }, + { + "epoch": 0.11948166044366351, + "grad_norm": 2.145728349685669, + "learning_rate": 1.989385065885798e-06, + "loss": 1.0915, + "mean_token_accuracy": 0.6774666905403137, + "num_tokens": 28174439.0, + "step": 1088 + }, + { + "epoch": 0.11959147814627719, + "grad_norm": 2.1287481784820557, + "learning_rate": 1.9912152269399707e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7079463005065918, + "num_tokens": 28201289.0, + "step": 1089 + }, + { + "epoch": 0.11970129584889085, + "grad_norm": 2.2502424716949463, + "learning_rate": 1.9930453879941437e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7164593935012817, + "num_tokens": 28226893.0, + "step": 1090 + }, + { + "epoch": 0.1198111135515045, + "grad_norm": 2.42466139793396, + "learning_rate": 1.9948755490483163e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7136176824569702, + "num_tokens": 28247697.0, + "step": 1091 + }, + { + "epoch": 0.11992093125411817, + "grad_norm": 2.2155537605285645, + "learning_rate": 1.9967057101024893e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7071223855018616, + "num_tokens": 28274287.0, + "step": 1092 + }, + { + "epoch": 0.12003074895673183, + "grad_norm": 2.301508665084839, + "learning_rate": 1.998535871156662e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.712818443775177, + "num_tokens": 28297021.0, + "step": 1093 + }, + { + "epoch": 0.12014056665934549, + "grad_norm": 2.3961522579193115, + "learning_rate": 2.000366032210835e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6876732110977173, + "num_tokens": 28319866.0, + "step": 1094 + }, + { + "epoch": 0.12025038436195915, + "grad_norm": 2.211437225341797, + "learning_rate": 2.0021961932650075e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6880162954330444, + "num_tokens": 28344370.0, + "step": 1095 + }, + { + "epoch": 0.1203602020645728, + "grad_norm": 2.120204210281372, + "learning_rate": 2.00402635431918e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7025144696235657, + "num_tokens": 28375383.0, + "step": 1096 + }, + { + "epoch": 0.12047001976718646, + "grad_norm": 2.057502031326294, + "learning_rate": 2.005856515373353e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6838334202766418, + "num_tokens": 28407153.0, + "step": 1097 + }, + { + "epoch": 0.12057983746980014, + "grad_norm": 2.2199902534484863, + "learning_rate": 2.0076866764275257e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.690180778503418, + "num_tokens": 28433871.0, + "step": 1098 + }, + { + "epoch": 0.1206896551724138, + "grad_norm": 2.367612600326538, + "learning_rate": 2.0095168374816987e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6980170011520386, + "num_tokens": 28458368.0, + "step": 1099 + }, + { + "epoch": 0.12079947287502746, + "grad_norm": 2.110137939453125, + "learning_rate": 2.0113469985358713e-06, + "loss": 1.028, + "mean_token_accuracy": 0.704051673412323, + "num_tokens": 28486445.0, + "step": 1100 + }, + { + "epoch": 0.12090929057764112, + "grad_norm": 1.9698398113250732, + "learning_rate": 2.013177159590044e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.711382269859314, + "num_tokens": 28518946.0, + "step": 1101 + }, + { + "epoch": 0.12101910828025478, + "grad_norm": 2.12910532951355, + "learning_rate": 2.015007320644217e-06, + "loss": 1.1, + "mean_token_accuracy": 0.6781628727912903, + "num_tokens": 28552649.0, + "step": 1102 + }, + { + "epoch": 0.12112892598286844, + "grad_norm": 2.935833692550659, + "learning_rate": 2.0168374816983895e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6893024444580078, + "num_tokens": 28568233.0, + "step": 1103 + }, + { + "epoch": 0.1212387436854821, + "grad_norm": 2.369194269180298, + "learning_rate": 2.018667642752562e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7067152857780457, + "num_tokens": 28591718.0, + "step": 1104 + }, + { + "epoch": 0.12134856138809576, + "grad_norm": 2.306283473968506, + "learning_rate": 2.020497803806735e-06, + "loss": 1.1321, + "mean_token_accuracy": 0.6613104343414307, + "num_tokens": 28618442.0, + "step": 1105 + }, + { + "epoch": 0.12145837909070942, + "grad_norm": 2.1768288612365723, + "learning_rate": 2.0223279648609077e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7021183371543884, + "num_tokens": 28644673.0, + "step": 1106 + }, + { + "epoch": 0.12156819679332308, + "grad_norm": 2.195185422897339, + "learning_rate": 2.0241581259150807e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6991317272186279, + "num_tokens": 28672577.0, + "step": 1107 + }, + { + "epoch": 0.12167801449593675, + "grad_norm": 2.3938088417053223, + "learning_rate": 2.0259882869692537e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7026547193527222, + "num_tokens": 28695398.0, + "step": 1108 + }, + { + "epoch": 0.12178783219855041, + "grad_norm": 2.1618988513946533, + "learning_rate": 2.0278184480234263e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6945311427116394, + "num_tokens": 28722550.0, + "step": 1109 + }, + { + "epoch": 0.12189764990116407, + "grad_norm": 2.3172826766967773, + "learning_rate": 2.0296486090775993e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7061413526535034, + "num_tokens": 28747396.0, + "step": 1110 + }, + { + "epoch": 0.12200746760377773, + "grad_norm": 2.122255563735962, + "learning_rate": 2.031478770131772e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7023608088493347, + "num_tokens": 28774466.0, + "step": 1111 + }, + { + "epoch": 0.12211728530639139, + "grad_norm": 2.1957693099975586, + "learning_rate": 2.0333089311859445e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7016764879226685, + "num_tokens": 28800172.0, + "step": 1112 + }, + { + "epoch": 0.12222710300900505, + "grad_norm": 2.4760098457336426, + "learning_rate": 2.0351390922401175e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.6827907562255859, + "num_tokens": 28823348.0, + "step": 1113 + }, + { + "epoch": 0.12233692071161871, + "grad_norm": 2.005991220474243, + "learning_rate": 2.03696925329429e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6899328827857971, + "num_tokens": 28853584.0, + "step": 1114 + }, + { + "epoch": 0.12244673841423237, + "grad_norm": 2.6282801628112793, + "learning_rate": 2.038799414348463e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6894341707229614, + "num_tokens": 28872997.0, + "step": 1115 + }, + { + "epoch": 0.12255655611684603, + "grad_norm": 1.9005225896835327, + "learning_rate": 2.0406295754026357e-06, + "loss": 1.0946, + "mean_token_accuracy": 0.6772177219390869, + "num_tokens": 28909835.0, + "step": 1116 + }, + { + "epoch": 0.1226663738194597, + "grad_norm": 2.27182674407959, + "learning_rate": 2.0424597364568082e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7104592323303223, + "num_tokens": 28934511.0, + "step": 1117 + }, + { + "epoch": 0.12277619152207336, + "grad_norm": 1.9880099296569824, + "learning_rate": 2.0442898975109812e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6979694366455078, + "num_tokens": 28966687.0, + "step": 1118 + }, + { + "epoch": 0.12288600922468702, + "grad_norm": 2.5652480125427246, + "learning_rate": 2.046120058565154e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7084904909133911, + "num_tokens": 28985221.0, + "step": 1119 + }, + { + "epoch": 0.12299582692730068, + "grad_norm": 2.3238375186920166, + "learning_rate": 2.0479502196193264e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6929726600646973, + "num_tokens": 29010174.0, + "step": 1120 + }, + { + "epoch": 0.12310564462991434, + "grad_norm": 2.0711047649383545, + "learning_rate": 2.0497803806734994e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6883833408355713, + "num_tokens": 29041979.0, + "step": 1121 + }, + { + "epoch": 0.123215462332528, + "grad_norm": 2.367980718612671, + "learning_rate": 2.051610541727672e-06, + "loss": 1.0911, + "mean_token_accuracy": 0.6741581559181213, + "num_tokens": 29067203.0, + "step": 1122 + }, + { + "epoch": 0.12332528003514166, + "grad_norm": 2.1605312824249268, + "learning_rate": 2.053440702781845e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6935395002365112, + "num_tokens": 29094075.0, + "step": 1123 + }, + { + "epoch": 0.12343509773775532, + "grad_norm": 2.272432327270508, + "learning_rate": 2.0552708638360176e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7059911489486694, + "num_tokens": 29117553.0, + "step": 1124 + }, + { + "epoch": 0.12354491544036898, + "grad_norm": 2.355090379714966, + "learning_rate": 2.0571010248901906e-06, + "loss": 1.0935, + "mean_token_accuracy": 0.674971878528595, + "num_tokens": 29143345.0, + "step": 1125 + }, + { + "epoch": 0.12365473314298264, + "grad_norm": 2.4052226543426514, + "learning_rate": 2.0589311859443632e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.6880959272384644, + "num_tokens": 29168125.0, + "step": 1126 + }, + { + "epoch": 0.12376455084559632, + "grad_norm": 2.324096202850342, + "learning_rate": 2.0607613469985362e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6818134784698486, + "num_tokens": 29195632.0, + "step": 1127 + }, + { + "epoch": 0.12387436854820998, + "grad_norm": 1.9422985315322876, + "learning_rate": 2.062591508052709e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6939585208892822, + "num_tokens": 29225865.0, + "step": 1128 + }, + { + "epoch": 0.12398418625082364, + "grad_norm": 2.293689727783203, + "learning_rate": 2.064421669106882e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.681877613067627, + "num_tokens": 29253536.0, + "step": 1129 + }, + { + "epoch": 0.1240940039534373, + "grad_norm": 2.319983720779419, + "learning_rate": 2.0662518301610544e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7050809264183044, + "num_tokens": 29278415.0, + "step": 1130 + }, + { + "epoch": 0.12420382165605096, + "grad_norm": 2.3053996562957764, + "learning_rate": 2.0680819912152274e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7034938931465149, + "num_tokens": 29304523.0, + "step": 1131 + }, + { + "epoch": 0.12431363935866462, + "grad_norm": 2.245884418487549, + "learning_rate": 2.0699121522694e-06, + "loss": 1.1365, + "mean_token_accuracy": 0.6664113998413086, + "num_tokens": 29333562.0, + "step": 1132 + }, + { + "epoch": 0.12442345706127828, + "grad_norm": 2.2191781997680664, + "learning_rate": 2.0717423133235726e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6914233565330505, + "num_tokens": 29360365.0, + "step": 1133 + }, + { + "epoch": 0.12453327476389194, + "grad_norm": 2.4595603942871094, + "learning_rate": 2.0735724743777456e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.734110951423645, + "num_tokens": 29381600.0, + "step": 1134 + }, + { + "epoch": 0.1246430924665056, + "grad_norm": 2.1228199005126953, + "learning_rate": 2.075402635431918e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6910964846611023, + "num_tokens": 29410673.0, + "step": 1135 + }, + { + "epoch": 0.12475291016911927, + "grad_norm": 2.472696304321289, + "learning_rate": 2.0772327964860908e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7349643707275391, + "num_tokens": 29430243.0, + "step": 1136 + }, + { + "epoch": 0.12486272787173293, + "grad_norm": 2.23372745513916, + "learning_rate": 2.0790629575402638e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6954523324966431, + "num_tokens": 29455281.0, + "step": 1137 + }, + { + "epoch": 0.12497254557434659, + "grad_norm": 2.488889217376709, + "learning_rate": 2.0808931185944364e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.694970428943634, + "num_tokens": 29479749.0, + "step": 1138 + }, + { + "epoch": 0.12508236327696023, + "grad_norm": 2.1290206909179688, + "learning_rate": 2.0827232796486094e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6926644444465637, + "num_tokens": 29513461.0, + "step": 1139 + }, + { + "epoch": 0.1251921809795739, + "grad_norm": 2.452226161956787, + "learning_rate": 2.084553440702782e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6872435808181763, + "num_tokens": 29535638.0, + "step": 1140 + }, + { + "epoch": 0.12530199868218758, + "grad_norm": 2.0260274410247803, + "learning_rate": 2.0863836017569546e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6895515322685242, + "num_tokens": 29569728.0, + "step": 1141 + }, + { + "epoch": 0.12541181638480123, + "grad_norm": 2.4461510181427, + "learning_rate": 2.0882137628111276e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.6992164850234985, + "num_tokens": 29593233.0, + "step": 1142 + }, + { + "epoch": 0.1255216340874149, + "grad_norm": 2.1846871376037598, + "learning_rate": 2.0900439238653e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6846155524253845, + "num_tokens": 29622928.0, + "step": 1143 + }, + { + "epoch": 0.12563145179002855, + "grad_norm": 2.4431512355804443, + "learning_rate": 2.091874084919473e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6990246772766113, + "num_tokens": 29646152.0, + "step": 1144 + }, + { + "epoch": 0.12574126949264222, + "grad_norm": 2.4805266857147217, + "learning_rate": 2.0937042459736457e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6858934164047241, + "num_tokens": 29668476.0, + "step": 1145 + }, + { + "epoch": 0.12585108719525587, + "grad_norm": 2.4701671600341797, + "learning_rate": 2.0955344070278188e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.6764212846755981, + "num_tokens": 29693096.0, + "step": 1146 + }, + { + "epoch": 0.12596090489786954, + "grad_norm": 2.1994240283966064, + "learning_rate": 2.0973645680819913e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7051869630813599, + "num_tokens": 29718583.0, + "step": 1147 + }, + { + "epoch": 0.1260707226004832, + "grad_norm": 2.628814935684204, + "learning_rate": 2.0991947291361644e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7019226551055908, + "num_tokens": 29740776.0, + "step": 1148 + }, + { + "epoch": 0.12618054030309686, + "grad_norm": 2.091095447540283, + "learning_rate": 2.101024890190337e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.686343252658844, + "num_tokens": 29770908.0, + "step": 1149 + }, + { + "epoch": 0.1262903580057105, + "grad_norm": 2.1578855514526367, + "learning_rate": 2.10285505124451e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6942316889762878, + "num_tokens": 29799660.0, + "step": 1150 + }, + { + "epoch": 0.12640017570832418, + "grad_norm": 2.454463005065918, + "learning_rate": 2.1046852122986825e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7100183963775635, + "num_tokens": 29821406.0, + "step": 1151 + }, + { + "epoch": 0.12650999341093785, + "grad_norm": 2.0207414627075195, + "learning_rate": 2.106515373352855e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6858803033828735, + "num_tokens": 29852311.0, + "step": 1152 + }, + { + "epoch": 0.1266198111135515, + "grad_norm": 2.1298491954803467, + "learning_rate": 2.108345534407028e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6988835334777832, + "num_tokens": 29878974.0, + "step": 1153 + }, + { + "epoch": 0.12672962881616517, + "grad_norm": 2.4329471588134766, + "learning_rate": 2.1101756954612007e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7250717282295227, + "num_tokens": 29900135.0, + "step": 1154 + }, + { + "epoch": 0.12683944651877882, + "grad_norm": 2.0266058444976807, + "learning_rate": 2.1120058565153737e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.6862987875938416, + "num_tokens": 29931322.0, + "step": 1155 + }, + { + "epoch": 0.1269492642213925, + "grad_norm": 2.265700101852417, + "learning_rate": 2.1138360175695463e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7039483785629272, + "num_tokens": 29954354.0, + "step": 1156 + }, + { + "epoch": 0.12705908192400614, + "grad_norm": 2.2096211910247803, + "learning_rate": 2.115666178623719e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.681883692741394, + "num_tokens": 29982041.0, + "step": 1157 + }, + { + "epoch": 0.1271688996266198, + "grad_norm": 2.296351671218872, + "learning_rate": 2.117496339677892e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7114946842193604, + "num_tokens": 30006936.0, + "step": 1158 + }, + { + "epoch": 0.12727871732923346, + "grad_norm": 2.1981306076049805, + "learning_rate": 2.1193265007320645e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6868925094604492, + "num_tokens": 30034138.0, + "step": 1159 + }, + { + "epoch": 0.12738853503184713, + "grad_norm": 2.061732530593872, + "learning_rate": 2.121156661786237e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6903946399688721, + "num_tokens": 30066370.0, + "step": 1160 + }, + { + "epoch": 0.1274983527344608, + "grad_norm": 2.2526891231536865, + "learning_rate": 2.12298682284041e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6839644312858582, + "num_tokens": 30093585.0, + "step": 1161 + }, + { + "epoch": 0.12760817043707445, + "grad_norm": 2.3739633560180664, + "learning_rate": 2.1248169838945827e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6863799095153809, + "num_tokens": 30118307.0, + "step": 1162 + }, + { + "epoch": 0.12771798813968813, + "grad_norm": 2.351100444793701, + "learning_rate": 2.1266471449487557e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6857290267944336, + "num_tokens": 30142446.0, + "step": 1163 + }, + { + "epoch": 0.12782780584230177, + "grad_norm": 2.231083393096924, + "learning_rate": 2.1284773060029283e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.714133620262146, + "num_tokens": 30170360.0, + "step": 1164 + }, + { + "epoch": 0.12793762354491545, + "grad_norm": 2.0589606761932373, + "learning_rate": 2.1303074670571013e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7073901295661926, + "num_tokens": 30198162.0, + "step": 1165 + }, + { + "epoch": 0.1280474412475291, + "grad_norm": 2.7581541538238525, + "learning_rate": 2.132137628111274e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.722052276134491, + "num_tokens": 30216040.0, + "step": 1166 + }, + { + "epoch": 0.12815725895014277, + "grad_norm": 2.1895439624786377, + "learning_rate": 2.133967789165447e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.6975749135017395, + "num_tokens": 30243768.0, + "step": 1167 + }, + { + "epoch": 0.1282670766527564, + "grad_norm": 2.2987444400787354, + "learning_rate": 2.1357979502196195e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6992309093475342, + "num_tokens": 30267287.0, + "step": 1168 + }, + { + "epoch": 0.12837689435537009, + "grad_norm": 2.160353422164917, + "learning_rate": 2.1376281112737925e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6842266321182251, + "num_tokens": 30296049.0, + "step": 1169 + }, + { + "epoch": 0.12848671205798376, + "grad_norm": 2.2085065841674805, + "learning_rate": 2.139458272327965e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7008923888206482, + "num_tokens": 30324318.0, + "step": 1170 + }, + { + "epoch": 0.1285965297605974, + "grad_norm": 2.0221941471099854, + "learning_rate": 2.141288433382138e-06, + "loss": 1.083, + "mean_token_accuracy": 0.6790465116500854, + "num_tokens": 30356554.0, + "step": 1171 + }, + { + "epoch": 0.12870634746321108, + "grad_norm": 2.368760824203491, + "learning_rate": 2.1431185944363107e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.690437376499176, + "num_tokens": 30382766.0, + "step": 1172 + }, + { + "epoch": 0.12881616516582473, + "grad_norm": 1.9775431156158447, + "learning_rate": 2.1449487554904833e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6929912567138672, + "num_tokens": 30414120.0, + "step": 1173 + }, + { + "epoch": 0.1289259828684384, + "grad_norm": 2.2971792221069336, + "learning_rate": 2.1467789165446563e-06, + "loss": 1.0555, + "mean_token_accuracy": 0.6827515959739685, + "num_tokens": 30441365.0, + "step": 1174 + }, + { + "epoch": 0.12903580057105205, + "grad_norm": 2.3689987659454346, + "learning_rate": 2.148609077598829e-06, + "loss": 1.0804, + "mean_token_accuracy": 0.6797046661376953, + "num_tokens": 30466343.0, + "step": 1175 + }, + { + "epoch": 0.12914561827366572, + "grad_norm": 2.099839448928833, + "learning_rate": 2.1504392386530014e-06, + "loss": 1.07, + "mean_token_accuracy": 0.6844382286071777, + "num_tokens": 30499977.0, + "step": 1176 + }, + { + "epoch": 0.12925543597627936, + "grad_norm": 2.139263868331909, + "learning_rate": 2.1522693997071744e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7081284523010254, + "num_tokens": 30528235.0, + "step": 1177 + }, + { + "epoch": 0.12936525367889304, + "grad_norm": 2.139904260635376, + "learning_rate": 2.154099560761347e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7148005366325378, + "num_tokens": 30554590.0, + "step": 1178 + }, + { + "epoch": 0.1294750713815067, + "grad_norm": 2.125391721725464, + "learning_rate": 2.15592972181552e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6975334882736206, + "num_tokens": 30581064.0, + "step": 1179 + }, + { + "epoch": 0.12958488908412036, + "grad_norm": 1.9700446128845215, + "learning_rate": 2.1577598828696926e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6800177097320557, + "num_tokens": 30613096.0, + "step": 1180 + }, + { + "epoch": 0.12969470678673403, + "grad_norm": 2.255540132522583, + "learning_rate": 2.1595900439238652e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6765243411064148, + "num_tokens": 30639323.0, + "step": 1181 + }, + { + "epoch": 0.12980452448934768, + "grad_norm": 2.3890886306762695, + "learning_rate": 2.1614202049780382e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6895264387130737, + "num_tokens": 30664024.0, + "step": 1182 + }, + { + "epoch": 0.12991434219196135, + "grad_norm": 2.176637887954712, + "learning_rate": 2.163250366032211e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6991443634033203, + "num_tokens": 30688819.0, + "step": 1183 + }, + { + "epoch": 0.130024159894575, + "grad_norm": 2.18302583694458, + "learning_rate": 2.165080527086384e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.688910961151123, + "num_tokens": 30715583.0, + "step": 1184 + }, + { + "epoch": 0.13013397759718867, + "grad_norm": 2.3429813385009766, + "learning_rate": 2.1669106881405564e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7087903022766113, + "num_tokens": 30740172.0, + "step": 1185 + }, + { + "epoch": 0.13024379529980232, + "grad_norm": 2.3639142513275146, + "learning_rate": 2.1687408491947294e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7198450565338135, + "num_tokens": 30762807.0, + "step": 1186 + }, + { + "epoch": 0.130353613002416, + "grad_norm": 2.189924955368042, + "learning_rate": 2.170571010248902e-06, + "loss": 1.1267, + "mean_token_accuracy": 0.6683456897735596, + "num_tokens": 30793966.0, + "step": 1187 + }, + { + "epoch": 0.13046343070502964, + "grad_norm": 2.0863897800445557, + "learning_rate": 2.172401171303075e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.6749801635742188, + "num_tokens": 30826100.0, + "step": 1188 + }, + { + "epoch": 0.1305732484076433, + "grad_norm": 2.413950204849243, + "learning_rate": 2.1742313323572476e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.6988364458084106, + "num_tokens": 30849690.0, + "step": 1189 + }, + { + "epoch": 0.13068306611025698, + "grad_norm": 2.434093952178955, + "learning_rate": 2.1760614934114206e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6976240873336792, + "num_tokens": 30871565.0, + "step": 1190 + }, + { + "epoch": 0.13079288381287063, + "grad_norm": 2.275385856628418, + "learning_rate": 2.177891654465593e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6926507949829102, + "num_tokens": 30897711.0, + "step": 1191 + }, + { + "epoch": 0.1309027015154843, + "grad_norm": 2.169175148010254, + "learning_rate": 2.1797218155197658e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6926087141036987, + "num_tokens": 30925653.0, + "step": 1192 + }, + { + "epoch": 0.13101251921809795, + "grad_norm": 2.153585195541382, + "learning_rate": 2.181551976573939e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6738969087600708, + "num_tokens": 30955387.0, + "step": 1193 + }, + { + "epoch": 0.13112233692071162, + "grad_norm": 2.1776680946350098, + "learning_rate": 2.1833821376281114e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6950995326042175, + "num_tokens": 30981370.0, + "step": 1194 + }, + { + "epoch": 0.13123215462332527, + "grad_norm": 2.5139808654785156, + "learning_rate": 2.1852122986822844e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6987833380699158, + "num_tokens": 31002829.0, + "step": 1195 + }, + { + "epoch": 0.13134197232593894, + "grad_norm": 2.0868186950683594, + "learning_rate": 2.187042459736457e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6989164352416992, + "num_tokens": 31033226.0, + "step": 1196 + }, + { + "epoch": 0.1314517900285526, + "grad_norm": 2.110016345977783, + "learning_rate": 2.1888726207906296e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6936566829681396, + "num_tokens": 31061794.0, + "step": 1197 + }, + { + "epoch": 0.13156160773116626, + "grad_norm": 2.2603137493133545, + "learning_rate": 2.1907027818448026e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6913881301879883, + "num_tokens": 31087167.0, + "step": 1198 + }, + { + "epoch": 0.13167142543377994, + "grad_norm": 2.2884271144866943, + "learning_rate": 2.192532942898975e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6966596245765686, + "num_tokens": 31112644.0, + "step": 1199 + }, + { + "epoch": 0.13178124313639358, + "grad_norm": 2.1837756633758545, + "learning_rate": 2.1943631039531477e-06, + "loss": 1.0769, + "mean_token_accuracy": 0.6841261386871338, + "num_tokens": 31143599.0, + "step": 1200 + }, + { + "epoch": 0.13189106083900726, + "grad_norm": 2.509824514389038, + "learning_rate": 2.1961932650073208e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7100014686584473, + "num_tokens": 31165148.0, + "step": 1201 + }, + { + "epoch": 0.1320008785416209, + "grad_norm": 1.8918324708938599, + "learning_rate": 2.1980234260614933e-06, + "loss": 1.1418, + "mean_token_accuracy": 0.6706090569496155, + "num_tokens": 31199542.0, + "step": 1202 + }, + { + "epoch": 0.13211069624423458, + "grad_norm": 2.122774600982666, + "learning_rate": 2.1998535871156664e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6935091018676758, + "num_tokens": 31227590.0, + "step": 1203 + }, + { + "epoch": 0.13222051394684822, + "grad_norm": 2.0222067832946777, + "learning_rate": 2.201683748169839e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7028124332427979, + "num_tokens": 31258465.0, + "step": 1204 + }, + { + "epoch": 0.1323303316494619, + "grad_norm": 2.0785601139068604, + "learning_rate": 2.203513909224012e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6898006200790405, + "num_tokens": 31287256.0, + "step": 1205 + }, + { + "epoch": 0.13244014935207554, + "grad_norm": 2.216792583465576, + "learning_rate": 2.205344070278185e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6845569014549255, + "num_tokens": 31316314.0, + "step": 1206 + }, + { + "epoch": 0.13254996705468922, + "grad_norm": 1.970212459564209, + "learning_rate": 2.2071742313323575e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7059272527694702, + "num_tokens": 31345928.0, + "step": 1207 + }, + { + "epoch": 0.1326597847573029, + "grad_norm": 2.21850848197937, + "learning_rate": 2.20900439238653e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6800894141197205, + "num_tokens": 31371261.0, + "step": 1208 + }, + { + "epoch": 0.13276960245991654, + "grad_norm": 2.005889415740967, + "learning_rate": 2.210834553440703e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.6825562715530396, + "num_tokens": 31402966.0, + "step": 1209 + }, + { + "epoch": 0.1328794201625302, + "grad_norm": 2.264354944229126, + "learning_rate": 2.2126647144948757e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6907984018325806, + "num_tokens": 31426832.0, + "step": 1210 + }, + { + "epoch": 0.13298923786514386, + "grad_norm": 2.251542091369629, + "learning_rate": 2.2144948755490487e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6852911710739136, + "num_tokens": 31452174.0, + "step": 1211 + }, + { + "epoch": 0.13309905556775753, + "grad_norm": 2.5070111751556396, + "learning_rate": 2.2163250366032213e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7075403928756714, + "num_tokens": 31473197.0, + "step": 1212 + }, + { + "epoch": 0.13320887327037118, + "grad_norm": 2.6628713607788086, + "learning_rate": 2.218155197657394e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7040296196937561, + "num_tokens": 31493088.0, + "step": 1213 + }, + { + "epoch": 0.13331869097298485, + "grad_norm": 2.1444835662841797, + "learning_rate": 2.219985358711567e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7049341797828674, + "num_tokens": 31519700.0, + "step": 1214 + }, + { + "epoch": 0.1334285086755985, + "grad_norm": 2.484896421432495, + "learning_rate": 2.2218155197657395e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6826748847961426, + "num_tokens": 31541301.0, + "step": 1215 + }, + { + "epoch": 0.13353832637821217, + "grad_norm": 1.9919801950454712, + "learning_rate": 2.223645680819912e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.681725263595581, + "num_tokens": 31573952.0, + "step": 1216 + }, + { + "epoch": 0.13364814408082584, + "grad_norm": 2.1346051692962646, + "learning_rate": 2.225475841874085e-06, + "loss": 1.1067, + "mean_token_accuracy": 0.6783560514450073, + "num_tokens": 31602175.0, + "step": 1217 + }, + { + "epoch": 0.1337579617834395, + "grad_norm": 2.182558298110962, + "learning_rate": 2.2273060029282577e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6899198293685913, + "num_tokens": 31628665.0, + "step": 1218 + }, + { + "epoch": 0.13386777948605316, + "grad_norm": 2.3286702632904053, + "learning_rate": 2.2291361639824307e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6899181008338928, + "num_tokens": 31654193.0, + "step": 1219 + }, + { + "epoch": 0.1339775971886668, + "grad_norm": 2.0854876041412354, + "learning_rate": 2.2309663250366033e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6840939521789551, + "num_tokens": 31684405.0, + "step": 1220 + }, + { + "epoch": 0.13408741489128048, + "grad_norm": 2.223271369934082, + "learning_rate": 2.2327964860907763e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7095369100570679, + "num_tokens": 31712807.0, + "step": 1221 + }, + { + "epoch": 0.13419723259389413, + "grad_norm": 2.4296987056732178, + "learning_rate": 2.234626647144949e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6876481771469116, + "num_tokens": 31736077.0, + "step": 1222 + }, + { + "epoch": 0.1343070502965078, + "grad_norm": 2.5672004222869873, + "learning_rate": 2.236456808199122e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7020493149757385, + "num_tokens": 31755774.0, + "step": 1223 + }, + { + "epoch": 0.13441686799912145, + "grad_norm": 2.3730363845825195, + "learning_rate": 2.2382869692532945e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6804203987121582, + "num_tokens": 31779216.0, + "step": 1224 + }, + { + "epoch": 0.13452668570173512, + "grad_norm": 2.0714597702026367, + "learning_rate": 2.2401171303074675e-06, + "loss": 1.1239, + "mean_token_accuracy": 0.6698597073554993, + "num_tokens": 31809476.0, + "step": 1225 + }, + { + "epoch": 0.13463650340434877, + "grad_norm": 2.185687780380249, + "learning_rate": 2.24194729136164e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6854480504989624, + "num_tokens": 31838224.0, + "step": 1226 + }, + { + "epoch": 0.13474632110696244, + "grad_norm": 2.138728141784668, + "learning_rate": 2.243777452415813e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7028270363807678, + "num_tokens": 31867281.0, + "step": 1227 + }, + { + "epoch": 0.13485613880957611, + "grad_norm": 2.2407093048095703, + "learning_rate": 2.2456076134699857e-06, + "loss": 1.0933, + "mean_token_accuracy": 0.6706656813621521, + "num_tokens": 31893677.0, + "step": 1228 + }, + { + "epoch": 0.13496595651218976, + "grad_norm": 2.374077320098877, + "learning_rate": 2.2474377745241583e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7060999870300293, + "num_tokens": 31918844.0, + "step": 1229 + }, + { + "epoch": 0.13507577421480343, + "grad_norm": 2.4214119911193848, + "learning_rate": 2.2492679355783313e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6940968036651611, + "num_tokens": 31941061.0, + "step": 1230 + }, + { + "epoch": 0.13518559191741708, + "grad_norm": 2.377474784851074, + "learning_rate": 2.251098096632504e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.694957435131073, + "num_tokens": 31963630.0, + "step": 1231 + }, + { + "epoch": 0.13529540962003075, + "grad_norm": 2.489093780517578, + "learning_rate": 2.2529282576866764e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6973589658737183, + "num_tokens": 31985309.0, + "step": 1232 + }, + { + "epoch": 0.1354052273226444, + "grad_norm": 2.3032705783843994, + "learning_rate": 2.2547584187408495e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7087502479553223, + "num_tokens": 32008457.0, + "step": 1233 + }, + { + "epoch": 0.13551504502525807, + "grad_norm": 2.5084946155548096, + "learning_rate": 2.256588579795022e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7021132707595825, + "num_tokens": 32030345.0, + "step": 1234 + }, + { + "epoch": 0.13562486272787172, + "grad_norm": 2.027836799621582, + "learning_rate": 2.258418740849195e-06, + "loss": 1.0824, + "mean_token_accuracy": 0.6749637126922607, + "num_tokens": 32065454.0, + "step": 1235 + }, + { + "epoch": 0.1357346804304854, + "grad_norm": 1.9865363836288452, + "learning_rate": 2.2602489019033676e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.683239221572876, + "num_tokens": 32097740.0, + "step": 1236 + }, + { + "epoch": 0.13584449813309907, + "grad_norm": 1.9224867820739746, + "learning_rate": 2.2620790629575402e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7000406980514526, + "num_tokens": 32128646.0, + "step": 1237 + }, + { + "epoch": 0.1359543158357127, + "grad_norm": 2.3518118858337402, + "learning_rate": 2.2639092240117132e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6875183582305908, + "num_tokens": 32152618.0, + "step": 1238 + }, + { + "epoch": 0.1360641335383264, + "grad_norm": 2.131664991378784, + "learning_rate": 2.265739385065886e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6874587535858154, + "num_tokens": 32182916.0, + "step": 1239 + }, + { + "epoch": 0.13617395124094003, + "grad_norm": 2.0263686180114746, + "learning_rate": 2.267569546120059e-06, + "loss": 1.133, + "mean_token_accuracy": 0.6705540418624878, + "num_tokens": 32215022.0, + "step": 1240 + }, + { + "epoch": 0.1362837689435537, + "grad_norm": 2.35532283782959, + "learning_rate": 2.2693997071742314e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6985280513763428, + "num_tokens": 32236882.0, + "step": 1241 + }, + { + "epoch": 0.13639358664616735, + "grad_norm": 1.9860312938690186, + "learning_rate": 2.2712298682284044e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6850335597991943, + "num_tokens": 32267483.0, + "step": 1242 + }, + { + "epoch": 0.13650340434878103, + "grad_norm": 2.1852457523345947, + "learning_rate": 2.273060029282577e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.6852450370788574, + "num_tokens": 32295967.0, + "step": 1243 + }, + { + "epoch": 0.13661322205139467, + "grad_norm": 2.2215442657470703, + "learning_rate": 2.27489019033675e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7026128768920898, + "num_tokens": 32320955.0, + "step": 1244 + }, + { + "epoch": 0.13672303975400835, + "grad_norm": 2.227112293243408, + "learning_rate": 2.2767203513909226e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6855790019035339, + "num_tokens": 32347699.0, + "step": 1245 + }, + { + "epoch": 0.13683285745662202, + "grad_norm": 2.1016459465026855, + "learning_rate": 2.2785505124450956e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6802054047584534, + "num_tokens": 32374442.0, + "step": 1246 + }, + { + "epoch": 0.13694267515923567, + "grad_norm": 2.310878276824951, + "learning_rate": 2.2803806734992682e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6962617039680481, + "num_tokens": 32398347.0, + "step": 1247 + }, + { + "epoch": 0.13705249286184934, + "grad_norm": 2.5054845809936523, + "learning_rate": 2.282210834553441e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.6765617728233337, + "num_tokens": 32422743.0, + "step": 1248 + }, + { + "epoch": 0.13716231056446299, + "grad_norm": 2.293088912963867, + "learning_rate": 2.284040995607614e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6762463450431824, + "num_tokens": 32450065.0, + "step": 1249 + }, + { + "epoch": 0.13727212826707666, + "grad_norm": 2.2258684635162354, + "learning_rate": 2.2858711566617864e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6880013942718506, + "num_tokens": 32474213.0, + "step": 1250 + }, + { + "epoch": 0.1373819459696903, + "grad_norm": 2.478745937347412, + "learning_rate": 2.2877013177159594e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7300089597702026, + "num_tokens": 32492540.0, + "step": 1251 + }, + { + "epoch": 0.13749176367230398, + "grad_norm": 2.398437261581421, + "learning_rate": 2.289531478770132e-06, + "loss": 1.1091, + "mean_token_accuracy": 0.6752479672431946, + "num_tokens": 32517007.0, + "step": 1252 + }, + { + "epoch": 0.13760158137491763, + "grad_norm": 2.257821798324585, + "learning_rate": 2.2913616398243046e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6885647773742676, + "num_tokens": 32541854.0, + "step": 1253 + }, + { + "epoch": 0.1377113990775313, + "grad_norm": 2.204458236694336, + "learning_rate": 2.2931918008784776e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7226125001907349, + "num_tokens": 32568189.0, + "step": 1254 + }, + { + "epoch": 0.13782121678014497, + "grad_norm": 2.065256118774414, + "learning_rate": 2.29502196193265e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6799665689468384, + "num_tokens": 32600446.0, + "step": 1255 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 2.3268513679504395, + "learning_rate": 2.2968521229868228e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6792080402374268, + "num_tokens": 32626897.0, + "step": 1256 + }, + { + "epoch": 0.1380408521853723, + "grad_norm": 2.0581414699554443, + "learning_rate": 2.2986822840409958e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7097935080528259, + "num_tokens": 32655622.0, + "step": 1257 + }, + { + "epoch": 0.13815066988798594, + "grad_norm": 2.5308640003204346, + "learning_rate": 2.3005124450951684e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6963598728179932, + "num_tokens": 32677641.0, + "step": 1258 + }, + { + "epoch": 0.1382604875905996, + "grad_norm": 2.2498035430908203, + "learning_rate": 2.3023426061493414e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6715493202209473, + "num_tokens": 32705632.0, + "step": 1259 + }, + { + "epoch": 0.13837030529321326, + "grad_norm": 2.265198230743408, + "learning_rate": 2.304172767203514e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7123140692710876, + "num_tokens": 32728729.0, + "step": 1260 + }, + { + "epoch": 0.13848012299582693, + "grad_norm": 2.3978073596954346, + "learning_rate": 2.306002928257687e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6941030025482178, + "num_tokens": 32753274.0, + "step": 1261 + }, + { + "epoch": 0.13858994069844058, + "grad_norm": 2.5119452476501465, + "learning_rate": 2.3078330893118596e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6957090497016907, + "num_tokens": 32774178.0, + "step": 1262 + }, + { + "epoch": 0.13869975840105425, + "grad_norm": 2.288571357727051, + "learning_rate": 2.3096632503660326e-06, + "loss": 1.0677, + "mean_token_accuracy": 0.6792876720428467, + "num_tokens": 32802096.0, + "step": 1263 + }, + { + "epoch": 0.1388095761036679, + "grad_norm": 2.058440685272217, + "learning_rate": 2.311493411420205e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6852511763572693, + "num_tokens": 32830001.0, + "step": 1264 + }, + { + "epoch": 0.13891939380628157, + "grad_norm": 2.044395923614502, + "learning_rate": 2.313323572474378e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.710379958152771, + "num_tokens": 32858063.0, + "step": 1265 + }, + { + "epoch": 0.13902921150889525, + "grad_norm": 2.306445360183716, + "learning_rate": 2.3151537335285507e-06, + "loss": 1.0706, + "mean_token_accuracy": 0.6816022396087646, + "num_tokens": 32883132.0, + "step": 1266 + }, + { + "epoch": 0.1391390292115089, + "grad_norm": 2.22678542137146, + "learning_rate": 2.3169838945827238e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6789505481719971, + "num_tokens": 32912027.0, + "step": 1267 + }, + { + "epoch": 0.13924884691412256, + "grad_norm": 2.4692249298095703, + "learning_rate": 2.3188140556368963e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.700782060623169, + "num_tokens": 32932300.0, + "step": 1268 + }, + { + "epoch": 0.1393586646167362, + "grad_norm": 2.2306504249572754, + "learning_rate": 2.320644216691069e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6924867630004883, + "num_tokens": 32959544.0, + "step": 1269 + }, + { + "epoch": 0.13946848231934988, + "grad_norm": 2.278602361679077, + "learning_rate": 2.322474377745242e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6976694464683533, + "num_tokens": 32983433.0, + "step": 1270 + }, + { + "epoch": 0.13957830002196353, + "grad_norm": 2.0011019706726074, + "learning_rate": 2.3243045387994145e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7340721487998962, + "num_tokens": 33009964.0, + "step": 1271 + }, + { + "epoch": 0.1396881177245772, + "grad_norm": 2.085068941116333, + "learning_rate": 2.326134699853587e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6873713135719299, + "num_tokens": 33040575.0, + "step": 1272 + }, + { + "epoch": 0.13979793542719085, + "grad_norm": 2.1718032360076904, + "learning_rate": 2.32796486090776e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.690811038017273, + "num_tokens": 33069131.0, + "step": 1273 + }, + { + "epoch": 0.13990775312980452, + "grad_norm": 2.281536340713501, + "learning_rate": 2.3297950219619327e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6915951371192932, + "num_tokens": 33095204.0, + "step": 1274 + }, + { + "epoch": 0.1400175708324182, + "grad_norm": 2.386096239089966, + "learning_rate": 2.3316251830161057e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7280296087265015, + "num_tokens": 33116477.0, + "step": 1275 + }, + { + "epoch": 0.14012738853503184, + "grad_norm": 1.978624939918518, + "learning_rate": 2.3334553440702783e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6925024390220642, + "num_tokens": 33148401.0, + "step": 1276 + }, + { + "epoch": 0.14023720623764552, + "grad_norm": 2.814506769180298, + "learning_rate": 2.335285505124451e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.7101265788078308, + "num_tokens": 33165359.0, + "step": 1277 + }, + { + "epoch": 0.14034702394025916, + "grad_norm": 2.2628839015960693, + "learning_rate": 2.337115666178624e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7022802829742432, + "num_tokens": 33188654.0, + "step": 1278 + }, + { + "epoch": 0.14045684164287284, + "grad_norm": 2.274216413497925, + "learning_rate": 2.3389458272327965e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6938676834106445, + "num_tokens": 33214100.0, + "step": 1279 + }, + { + "epoch": 0.14056665934548648, + "grad_norm": 2.015063762664795, + "learning_rate": 2.3407759882869695e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6982985734939575, + "num_tokens": 33242795.0, + "step": 1280 + }, + { + "epoch": 0.14067647704810016, + "grad_norm": 2.5188963413238525, + "learning_rate": 2.342606149341142e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6845548748970032, + "num_tokens": 33264403.0, + "step": 1281 + }, + { + "epoch": 0.1407862947507138, + "grad_norm": 2.444199323654175, + "learning_rate": 2.344436310395315e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7211350798606873, + "num_tokens": 33285273.0, + "step": 1282 + }, + { + "epoch": 0.14089611245332748, + "grad_norm": 2.339609146118164, + "learning_rate": 2.3462664714494877e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6817419528961182, + "num_tokens": 33307836.0, + "step": 1283 + }, + { + "epoch": 0.14100593015594115, + "grad_norm": 2.4580130577087402, + "learning_rate": 2.3480966325036607e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.711622953414917, + "num_tokens": 33329426.0, + "step": 1284 + }, + { + "epoch": 0.1411157478585548, + "grad_norm": 2.4334805011749268, + "learning_rate": 2.3499267935578333e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7208842635154724, + "num_tokens": 33349687.0, + "step": 1285 + }, + { + "epoch": 0.14122556556116847, + "grad_norm": 2.2505412101745605, + "learning_rate": 2.3517569546120063e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7133666276931763, + "num_tokens": 33376086.0, + "step": 1286 + }, + { + "epoch": 0.14133538326378212, + "grad_norm": 2.192307472229004, + "learning_rate": 2.353587115666179e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6790611147880554, + "num_tokens": 33406799.0, + "step": 1287 + }, + { + "epoch": 0.1414452009663958, + "grad_norm": 2.4711689949035645, + "learning_rate": 2.3554172767203515e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6954246759414673, + "num_tokens": 33428603.0, + "step": 1288 + }, + { + "epoch": 0.14155501866900944, + "grad_norm": 2.3582465648651123, + "learning_rate": 2.3572474377745245e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7078817486763, + "num_tokens": 33450690.0, + "step": 1289 + }, + { + "epoch": 0.1416648363716231, + "grad_norm": 2.301276683807373, + "learning_rate": 2.359077598828697e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7000041007995605, + "num_tokens": 33474234.0, + "step": 1290 + }, + { + "epoch": 0.14177465407423676, + "grad_norm": 2.1922335624694824, + "learning_rate": 2.36090775988287e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6878629922866821, + "num_tokens": 33500731.0, + "step": 1291 + }, + { + "epoch": 0.14188447177685043, + "grad_norm": 2.368556022644043, + "learning_rate": 2.3627379209370427e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7086682319641113, + "num_tokens": 33522773.0, + "step": 1292 + }, + { + "epoch": 0.1419942894794641, + "grad_norm": 2.008967161178589, + "learning_rate": 2.3645680819912152e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6887302398681641, + "num_tokens": 33552883.0, + "step": 1293 + }, + { + "epoch": 0.14210410718207775, + "grad_norm": 2.4068779945373535, + "learning_rate": 2.3663982430453883e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6884760856628418, + "num_tokens": 33574454.0, + "step": 1294 + }, + { + "epoch": 0.14221392488469142, + "grad_norm": 2.0818824768066406, + "learning_rate": 2.368228404099561e-06, + "loss": 1.0858, + "mean_token_accuracy": 0.6752647161483765, + "num_tokens": 33604993.0, + "step": 1295 + }, + { + "epoch": 0.14232374258730507, + "grad_norm": 2.142552137374878, + "learning_rate": 2.3700585651537334e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7166759967803955, + "num_tokens": 33631163.0, + "step": 1296 + }, + { + "epoch": 0.14243356028991874, + "grad_norm": 2.6399118900299072, + "learning_rate": 2.3718887262079064e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6928911209106445, + "num_tokens": 33652855.0, + "step": 1297 + }, + { + "epoch": 0.1425433779925324, + "grad_norm": 2.185497999191284, + "learning_rate": 2.373718887262079e-06, + "loss": 1.135, + "mean_token_accuracy": 0.6654244661331177, + "num_tokens": 33680677.0, + "step": 1298 + }, + { + "epoch": 0.14265319569514606, + "grad_norm": 2.3083431720733643, + "learning_rate": 2.375549048316252e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6831337213516235, + "num_tokens": 33704689.0, + "step": 1299 + }, + { + "epoch": 0.1427630133977597, + "grad_norm": 2.138943910598755, + "learning_rate": 2.3773792093704246e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7026702165603638, + "num_tokens": 33731408.0, + "step": 1300 + }, + { + "epoch": 0.14287283110037338, + "grad_norm": 2.2690277099609375, + "learning_rate": 2.3792093704245976e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.711244523525238, + "num_tokens": 33754941.0, + "step": 1301 + }, + { + "epoch": 0.14298264880298703, + "grad_norm": 2.2409250736236572, + "learning_rate": 2.3810395314787702e-06, + "loss": 1.1181, + "mean_token_accuracy": 0.6664736866950989, + "num_tokens": 33781326.0, + "step": 1302 + }, + { + "epoch": 0.1430924665056007, + "grad_norm": 2.636568546295166, + "learning_rate": 2.3828696925329432e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6937167644500732, + "num_tokens": 33802512.0, + "step": 1303 + }, + { + "epoch": 0.14320228420821438, + "grad_norm": 2.1740779876708984, + "learning_rate": 2.384699853587116e-06, + "loss": 1.0819, + "mean_token_accuracy": 0.6768412590026855, + "num_tokens": 33831740.0, + "step": 1304 + }, + { + "epoch": 0.14331210191082802, + "grad_norm": 2.4432005882263184, + "learning_rate": 2.386530014641289e-06, + "loss": 1.0861, + "mean_token_accuracy": 0.6729085445404053, + "num_tokens": 33854353.0, + "step": 1305 + }, + { + "epoch": 0.1434219196134417, + "grad_norm": 2.1464438438415527, + "learning_rate": 2.3883601756954614e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6888905763626099, + "num_tokens": 33881483.0, + "step": 1306 + }, + { + "epoch": 0.14353173731605534, + "grad_norm": 2.068896532058716, + "learning_rate": 2.3901903367496344e-06, + "loss": 1.1173, + "mean_token_accuracy": 0.6687901020050049, + "num_tokens": 33912147.0, + "step": 1307 + }, + { + "epoch": 0.14364155501866901, + "grad_norm": 2.3973469734191895, + "learning_rate": 2.392020497803807e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7337806224822998, + "num_tokens": 33933892.0, + "step": 1308 + }, + { + "epoch": 0.14375137272128266, + "grad_norm": 2.3410775661468506, + "learning_rate": 2.3938506588579796e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7096223831176758, + "num_tokens": 33956499.0, + "step": 1309 + }, + { + "epoch": 0.14386119042389633, + "grad_norm": 2.357816457748413, + "learning_rate": 2.3956808199121526e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6971372365951538, + "num_tokens": 33981125.0, + "step": 1310 + }, + { + "epoch": 0.14397100812650998, + "grad_norm": 2.520930528640747, + "learning_rate": 2.397510980966325e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7256479263305664, + "num_tokens": 33999636.0, + "step": 1311 + }, + { + "epoch": 0.14408082582912365, + "grad_norm": 2.062826633453369, + "learning_rate": 2.3993411420204978e-06, + "loss": 1.1065, + "mean_token_accuracy": 0.6717388033866882, + "num_tokens": 34030802.0, + "step": 1312 + }, + { + "epoch": 0.14419064353173733, + "grad_norm": 2.194932460784912, + "learning_rate": 2.4011713030746708e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.6714177131652832, + "num_tokens": 34059183.0, + "step": 1313 + }, + { + "epoch": 0.14430046123435097, + "grad_norm": 2.2190756797790527, + "learning_rate": 2.4030014641288434e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6929826736450195, + "num_tokens": 34083634.0, + "step": 1314 + }, + { + "epoch": 0.14441027893696465, + "grad_norm": 2.316394567489624, + "learning_rate": 2.4048316251830164e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6803874969482422, + "num_tokens": 34106450.0, + "step": 1315 + }, + { + "epoch": 0.1445200966395783, + "grad_norm": 2.2585456371307373, + "learning_rate": 2.406661786237189e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6916943788528442, + "num_tokens": 34131753.0, + "step": 1316 + }, + { + "epoch": 0.14462991434219197, + "grad_norm": 2.4494688510894775, + "learning_rate": 2.4084919472913616e-06, + "loss": 1.0759, + "mean_token_accuracy": 0.6796731948852539, + "num_tokens": 34154741.0, + "step": 1317 + }, + { + "epoch": 0.1447397320448056, + "grad_norm": 2.28983473777771, + "learning_rate": 2.4103221083455346e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6942539215087891, + "num_tokens": 34181490.0, + "step": 1318 + }, + { + "epoch": 0.1448495497474193, + "grad_norm": 2.2445785999298096, + "learning_rate": 2.4121522693997076e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6922234296798706, + "num_tokens": 34207315.0, + "step": 1319 + }, + { + "epoch": 0.14495936745003293, + "grad_norm": 2.1640777587890625, + "learning_rate": 2.41398243045388e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7061817646026611, + "num_tokens": 34234234.0, + "step": 1320 + }, + { + "epoch": 0.1450691851526466, + "grad_norm": 2.244619131088257, + "learning_rate": 2.415812591508053e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6841515302658081, + "num_tokens": 34259352.0, + "step": 1321 + }, + { + "epoch": 0.14517900285526028, + "grad_norm": 1.9899272918701172, + "learning_rate": 2.4176427525622258e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7075968980789185, + "num_tokens": 34287313.0, + "step": 1322 + }, + { + "epoch": 0.14528882055787393, + "grad_norm": 2.221970796585083, + "learning_rate": 2.4194729136163988e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7191588282585144, + "num_tokens": 34309792.0, + "step": 1323 + }, + { + "epoch": 0.1453986382604876, + "grad_norm": 2.2893645763397217, + "learning_rate": 2.4213030746705714e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6966896057128906, + "num_tokens": 34334808.0, + "step": 1324 + }, + { + "epoch": 0.14550845596310125, + "grad_norm": 2.51731276512146, + "learning_rate": 2.423133235724744e-06, + "loss": 1.1318, + "mean_token_accuracy": 0.6632756590843201, + "num_tokens": 34356285.0, + "step": 1325 + }, + { + "epoch": 0.14561827366571492, + "grad_norm": 2.031623125076294, + "learning_rate": 2.424963396778917e-06, + "loss": 1.024, + "mean_token_accuracy": 0.7031073570251465, + "num_tokens": 34385477.0, + "step": 1326 + }, + { + "epoch": 0.14572809136832857, + "grad_norm": 1.845399260520935, + "learning_rate": 2.4267935578330895e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6917180418968201, + "num_tokens": 34421609.0, + "step": 1327 + }, + { + "epoch": 0.14583790907094224, + "grad_norm": 2.4171946048736572, + "learning_rate": 2.428623718887262e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6895829439163208, + "num_tokens": 34443782.0, + "step": 1328 + }, + { + "epoch": 0.14594772677355589, + "grad_norm": 2.424198627471924, + "learning_rate": 2.430453879941435e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6993970274925232, + "num_tokens": 34464480.0, + "step": 1329 + }, + { + "epoch": 0.14605754447616956, + "grad_norm": 2.136680841445923, + "learning_rate": 2.4322840409956077e-06, + "loss": 1.0944, + "mean_token_accuracy": 0.6747960448265076, + "num_tokens": 34494323.0, + "step": 1330 + }, + { + "epoch": 0.14616736217878323, + "grad_norm": 2.093043804168701, + "learning_rate": 2.4341142020497807e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6898530721664429, + "num_tokens": 34523507.0, + "step": 1331 + }, + { + "epoch": 0.14627717988139688, + "grad_norm": 2.1080613136291504, + "learning_rate": 2.4359443631039533e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.69505774974823, + "num_tokens": 34549237.0, + "step": 1332 + }, + { + "epoch": 0.14638699758401055, + "grad_norm": 2.3231117725372314, + "learning_rate": 2.437774524158126e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7028049826622009, + "num_tokens": 34570999.0, + "step": 1333 + }, + { + "epoch": 0.1464968152866242, + "grad_norm": 2.239185094833374, + "learning_rate": 2.439604685212299e-06, + "loss": 1.1268, + "mean_token_accuracy": 0.679801881313324, + "num_tokens": 34596433.0, + "step": 1334 + }, + { + "epoch": 0.14660663298923787, + "grad_norm": 2.14313006401062, + "learning_rate": 2.4414348462664715e-06, + "loss": 1.0958, + "mean_token_accuracy": 0.6741917133331299, + "num_tokens": 34623859.0, + "step": 1335 + }, + { + "epoch": 0.14671645069185152, + "grad_norm": 2.110891819000244, + "learning_rate": 2.4432650073206445e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7051664590835571, + "num_tokens": 34651076.0, + "step": 1336 + }, + { + "epoch": 0.1468262683944652, + "grad_norm": 2.2161333560943604, + "learning_rate": 2.445095168374817e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.6714804172515869, + "num_tokens": 34675326.0, + "step": 1337 + }, + { + "epoch": 0.14693608609707884, + "grad_norm": 2.1160595417022705, + "learning_rate": 2.44692532942899e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7039876580238342, + "num_tokens": 34702287.0, + "step": 1338 + }, + { + "epoch": 0.1470459037996925, + "grad_norm": 2.078218698501587, + "learning_rate": 2.4487554904831627e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7058489322662354, + "num_tokens": 34731227.0, + "step": 1339 + }, + { + "epoch": 0.14715572150230616, + "grad_norm": 2.3591740131378174, + "learning_rate": 2.4505856515373357e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6936054229736328, + "num_tokens": 34753917.0, + "step": 1340 + }, + { + "epoch": 0.14726553920491983, + "grad_norm": 1.956846833229065, + "learning_rate": 2.4524158125915083e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.686643123626709, + "num_tokens": 34785104.0, + "step": 1341 + }, + { + "epoch": 0.1473753569075335, + "grad_norm": 2.536295175552368, + "learning_rate": 2.4542459736456813e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7018128037452698, + "num_tokens": 34804108.0, + "step": 1342 + }, + { + "epoch": 0.14748517461014715, + "grad_norm": 2.1932179927825928, + "learning_rate": 2.456076134699854e-06, + "loss": 1.0955, + "mean_token_accuracy": 0.6824416518211365, + "num_tokens": 34831927.0, + "step": 1343 + }, + { + "epoch": 0.14759499231276083, + "grad_norm": 2.410285472869873, + "learning_rate": 2.4579062957540265e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7053417563438416, + "num_tokens": 34854553.0, + "step": 1344 + }, + { + "epoch": 0.14770481001537447, + "grad_norm": 2.1987788677215576, + "learning_rate": 2.4597364568081995e-06, + "loss": 1.0866, + "mean_token_accuracy": 0.6748734712600708, + "num_tokens": 34879994.0, + "step": 1345 + }, + { + "epoch": 0.14781462771798815, + "grad_norm": 2.1979246139526367, + "learning_rate": 2.461566617862372e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6988741755485535, + "num_tokens": 34906857.0, + "step": 1346 + }, + { + "epoch": 0.1479244454206018, + "grad_norm": 2.05989933013916, + "learning_rate": 2.463396778916545e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6730645895004272, + "num_tokens": 34936096.0, + "step": 1347 + }, + { + "epoch": 0.14803426312321546, + "grad_norm": 2.114393472671509, + "learning_rate": 2.4652269399707177e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6769541501998901, + "num_tokens": 34962729.0, + "step": 1348 + }, + { + "epoch": 0.1481440808258291, + "grad_norm": 2.0828328132629395, + "learning_rate": 2.4670571010248903e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7062388062477112, + "num_tokens": 34989748.0, + "step": 1349 + }, + { + "epoch": 0.14825389852844278, + "grad_norm": 2.3290936946868896, + "learning_rate": 2.4688872620790633e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7077726125717163, + "num_tokens": 35011608.0, + "step": 1350 + }, + { + "epoch": 0.14836371623105646, + "grad_norm": 1.991438388824463, + "learning_rate": 2.470717423133236e-06, + "loss": 1.0922, + "mean_token_accuracy": 0.6893184185028076, + "num_tokens": 35043016.0, + "step": 1351 + }, + { + "epoch": 0.1484735339336701, + "grad_norm": 2.3471953868865967, + "learning_rate": 2.4725475841874084e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7092400789260864, + "num_tokens": 35065249.0, + "step": 1352 + }, + { + "epoch": 0.14858335163628378, + "grad_norm": 2.140767812728882, + "learning_rate": 2.4743777452415815e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7083533406257629, + "num_tokens": 35093151.0, + "step": 1353 + }, + { + "epoch": 0.14869316933889742, + "grad_norm": 2.4269728660583496, + "learning_rate": 2.476207906295754e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7078220844268799, + "num_tokens": 35115040.0, + "step": 1354 + }, + { + "epoch": 0.1488029870415111, + "grad_norm": 2.1354830265045166, + "learning_rate": 2.478038067349927e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7131104469299316, + "num_tokens": 35141834.0, + "step": 1355 + }, + { + "epoch": 0.14891280474412474, + "grad_norm": 2.427126407623291, + "learning_rate": 2.4798682284040996e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6782247424125671, + "num_tokens": 35164157.0, + "step": 1356 + }, + { + "epoch": 0.14902262244673842, + "grad_norm": 2.505530834197998, + "learning_rate": 2.4816983894582726e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7209064364433289, + "num_tokens": 35185899.0, + "step": 1357 + }, + { + "epoch": 0.14913244014935206, + "grad_norm": 2.532642126083374, + "learning_rate": 2.4835285505124452e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7100531458854675, + "num_tokens": 35208042.0, + "step": 1358 + }, + { + "epoch": 0.14924225785196574, + "grad_norm": 2.0279431343078613, + "learning_rate": 2.4853587115666182e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.6848219037055969, + "num_tokens": 35237068.0, + "step": 1359 + }, + { + "epoch": 0.1493520755545794, + "grad_norm": 2.085508346557617, + "learning_rate": 2.487188872620791e-06, + "loss": 1.1016, + "mean_token_accuracy": 0.6783141493797302, + "num_tokens": 35265577.0, + "step": 1360 + }, + { + "epoch": 0.14946189325719306, + "grad_norm": 2.0993173122406006, + "learning_rate": 2.489019033674964e-06, + "loss": 1.1079, + "mean_token_accuracy": 0.6745380759239197, + "num_tokens": 35294941.0, + "step": 1361 + }, + { + "epoch": 0.14957171095980673, + "grad_norm": 2.312624454498291, + "learning_rate": 2.4908491947291364e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6801397204399109, + "num_tokens": 35321365.0, + "step": 1362 + }, + { + "epoch": 0.14968152866242038, + "grad_norm": 2.1568448543548584, + "learning_rate": 2.4926793557833094e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7135015726089478, + "num_tokens": 35350149.0, + "step": 1363 + }, + { + "epoch": 0.14979134636503405, + "grad_norm": 2.0093564987182617, + "learning_rate": 2.494509516837482e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6908528804779053, + "num_tokens": 35380953.0, + "step": 1364 + }, + { + "epoch": 0.1499011640676477, + "grad_norm": 2.0246024131774902, + "learning_rate": 2.4963396778916546e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6864714622497559, + "num_tokens": 35409111.0, + "step": 1365 + }, + { + "epoch": 0.15001098177026137, + "grad_norm": 2.061462879180908, + "learning_rate": 2.4981698389458276e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7012562155723572, + "num_tokens": 35438669.0, + "step": 1366 + }, + { + "epoch": 0.15012079947287502, + "grad_norm": 2.086948871612549, + "learning_rate": 2.5e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7037235498428345, + "num_tokens": 35470104.0, + "step": 1367 + }, + { + "epoch": 0.1502306171754887, + "grad_norm": 2.4781370162963867, + "learning_rate": 2.501830161054173e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7045226693153381, + "num_tokens": 35491122.0, + "step": 1368 + }, + { + "epoch": 0.15034043487810236, + "grad_norm": 2.3295035362243652, + "learning_rate": 2.5036603221083454e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6907371282577515, + "num_tokens": 35514546.0, + "step": 1369 + }, + { + "epoch": 0.150450252580716, + "grad_norm": 2.165935754776001, + "learning_rate": 2.505490483162519e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6979226469993591, + "num_tokens": 35541356.0, + "step": 1370 + }, + { + "epoch": 0.15056007028332968, + "grad_norm": 2.259469985961914, + "learning_rate": 2.5073206442166914e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6977667808532715, + "num_tokens": 35569142.0, + "step": 1371 + }, + { + "epoch": 0.15066988798594333, + "grad_norm": 2.2213947772979736, + "learning_rate": 2.509150805270864e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7055498361587524, + "num_tokens": 35595895.0, + "step": 1372 + }, + { + "epoch": 0.150779705688557, + "grad_norm": 1.9895195960998535, + "learning_rate": 2.5109809663250366e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6919187903404236, + "num_tokens": 35624006.0, + "step": 1373 + }, + { + "epoch": 0.15088952339117065, + "grad_norm": 2.4168386459350586, + "learning_rate": 2.5128111273792096e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7109134197235107, + "num_tokens": 35645685.0, + "step": 1374 + }, + { + "epoch": 0.15099934109378432, + "grad_norm": 2.0465595722198486, + "learning_rate": 2.514641288433382e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6920981407165527, + "num_tokens": 35678159.0, + "step": 1375 + }, + { + "epoch": 0.15110915879639797, + "grad_norm": 2.172358989715576, + "learning_rate": 2.516471449487555e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7010783553123474, + "num_tokens": 35705670.0, + "step": 1376 + }, + { + "epoch": 0.15121897649901164, + "grad_norm": 2.3156723976135254, + "learning_rate": 2.5183016105417278e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7002772092819214, + "num_tokens": 35730335.0, + "step": 1377 + }, + { + "epoch": 0.1513287942016253, + "grad_norm": 2.491236925125122, + "learning_rate": 2.5201317715959008e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6950870752334595, + "num_tokens": 35753570.0, + "step": 1378 + }, + { + "epoch": 0.15143861190423896, + "grad_norm": 2.5136923789978027, + "learning_rate": 2.5219619326500734e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7239966988563538, + "num_tokens": 35772292.0, + "step": 1379 + }, + { + "epoch": 0.15154842960685264, + "grad_norm": 2.406546115875244, + "learning_rate": 2.523792093704246e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7126295566558838, + "num_tokens": 35800237.0, + "step": 1380 + }, + { + "epoch": 0.15165824730946628, + "grad_norm": 2.1200239658355713, + "learning_rate": 2.525622254758419e-06, + "loss": 1.07, + "mean_token_accuracy": 0.6858015656471252, + "num_tokens": 35831887.0, + "step": 1381 + }, + { + "epoch": 0.15176806501207996, + "grad_norm": 2.189910650253296, + "learning_rate": 2.527452415812592e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.705876886844635, + "num_tokens": 35858461.0, + "step": 1382 + }, + { + "epoch": 0.1518778827146936, + "grad_norm": 2.182133913040161, + "learning_rate": 2.5292825768667646e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6874865293502808, + "num_tokens": 35886441.0, + "step": 1383 + }, + { + "epoch": 0.15198770041730728, + "grad_norm": 2.299339532852173, + "learning_rate": 2.531112737920937e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6906741857528687, + "num_tokens": 35912987.0, + "step": 1384 + }, + { + "epoch": 0.15209751811992092, + "grad_norm": 2.5440526008605957, + "learning_rate": 2.5329428989751097e-06, + "loss": 1.0909, + "mean_token_accuracy": 0.6719093322753906, + "num_tokens": 35935698.0, + "step": 1385 + }, + { + "epoch": 0.1522073358225346, + "grad_norm": 2.3547327518463135, + "learning_rate": 2.534773060029283e-06, + "loss": 1.1196, + "mean_token_accuracy": 0.6644778251647949, + "num_tokens": 35959459.0, + "step": 1386 + }, + { + "epoch": 0.15231715352514824, + "grad_norm": 2.107483386993408, + "learning_rate": 2.5366032210834557e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7087364196777344, + "num_tokens": 35986192.0, + "step": 1387 + }, + { + "epoch": 0.15242697122776191, + "grad_norm": 2.2540957927703857, + "learning_rate": 2.5384333821376283e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6789106130599976, + "num_tokens": 36012036.0, + "step": 1388 + }, + { + "epoch": 0.1525367889303756, + "grad_norm": 2.1246161460876465, + "learning_rate": 2.540263543191801e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6787903308868408, + "num_tokens": 36039846.0, + "step": 1389 + }, + { + "epoch": 0.15264660663298923, + "grad_norm": 2.2108993530273438, + "learning_rate": 2.5420937042459735e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6879018545150757, + "num_tokens": 36064549.0, + "step": 1390 + }, + { + "epoch": 0.1527564243356029, + "grad_norm": 2.1841182708740234, + "learning_rate": 2.5439238653001465e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7095442414283752, + "num_tokens": 36091110.0, + "step": 1391 + }, + { + "epoch": 0.15286624203821655, + "grad_norm": 2.114388942718506, + "learning_rate": 2.5457540263543195e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6861073970794678, + "num_tokens": 36117458.0, + "step": 1392 + }, + { + "epoch": 0.15297605974083023, + "grad_norm": 2.531613349914551, + "learning_rate": 2.547584187408492e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6890339851379395, + "num_tokens": 36138500.0, + "step": 1393 + }, + { + "epoch": 0.15308587744344387, + "grad_norm": 2.218052387237549, + "learning_rate": 2.5494143484626647e-06, + "loss": 1.1375, + "mean_token_accuracy": 0.6657212376594543, + "num_tokens": 36169991.0, + "step": 1394 + }, + { + "epoch": 0.15319569514605755, + "grad_norm": 1.8786872625350952, + "learning_rate": 2.5512445095168377e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6827516555786133, + "num_tokens": 36205553.0, + "step": 1395 + }, + { + "epoch": 0.1533055128486712, + "grad_norm": 2.227012872695923, + "learning_rate": 2.5530746705710103e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6907913088798523, + "num_tokens": 36231549.0, + "step": 1396 + }, + { + "epoch": 0.15341533055128487, + "grad_norm": 2.0935399532318115, + "learning_rate": 2.5549048316251833e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7156105637550354, + "num_tokens": 36258395.0, + "step": 1397 + }, + { + "epoch": 0.15352514825389854, + "grad_norm": 2.4895408153533936, + "learning_rate": 2.556734992679356e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7113910913467407, + "num_tokens": 36278218.0, + "step": 1398 + }, + { + "epoch": 0.1536349659565122, + "grad_norm": 2.035550355911255, + "learning_rate": 2.558565153733529e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7374144792556763, + "num_tokens": 36304579.0, + "step": 1399 + }, + { + "epoch": 0.15374478365912586, + "grad_norm": 2.120218276977539, + "learning_rate": 2.5603953147877015e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6868056654930115, + "num_tokens": 36331481.0, + "step": 1400 + }, + { + "epoch": 0.1538546013617395, + "grad_norm": 2.304147720336914, + "learning_rate": 2.562225475841874e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7124870419502258, + "num_tokens": 36354441.0, + "step": 1401 + }, + { + "epoch": 0.15396441906435318, + "grad_norm": 2.2369189262390137, + "learning_rate": 2.5640556368960475e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6948302984237671, + "num_tokens": 36378275.0, + "step": 1402 + }, + { + "epoch": 0.15407423676696683, + "grad_norm": 2.361011266708374, + "learning_rate": 2.56588579795022e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6763975620269775, + "num_tokens": 36401383.0, + "step": 1403 + }, + { + "epoch": 0.1541840544695805, + "grad_norm": 2.536511182785034, + "learning_rate": 2.5677159590043927e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7266851663589478, + "num_tokens": 36421670.0, + "step": 1404 + }, + { + "epoch": 0.15429387217219415, + "grad_norm": 2.326209783554077, + "learning_rate": 2.5695461200585653e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6961161494255066, + "num_tokens": 36445800.0, + "step": 1405 + }, + { + "epoch": 0.15440368987480782, + "grad_norm": 2.1066927909851074, + "learning_rate": 2.571376281112738e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6956274509429932, + "num_tokens": 36476123.0, + "step": 1406 + }, + { + "epoch": 0.1545135075774215, + "grad_norm": 2.3016529083251953, + "learning_rate": 2.5732064421669104e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7144458293914795, + "num_tokens": 36498691.0, + "step": 1407 + }, + { + "epoch": 0.15462332528003514, + "grad_norm": 2.070742607116699, + "learning_rate": 2.575036603221084e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6936459541320801, + "num_tokens": 36528322.0, + "step": 1408 + }, + { + "epoch": 0.1547331429826488, + "grad_norm": 2.0308947563171387, + "learning_rate": 2.5768667642752565e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7187577486038208, + "num_tokens": 36557297.0, + "step": 1409 + }, + { + "epoch": 0.15484296068526246, + "grad_norm": 2.3579583168029785, + "learning_rate": 2.578696925329429e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7044415473937988, + "num_tokens": 36580021.0, + "step": 1410 + }, + { + "epoch": 0.15495277838787613, + "grad_norm": 2.33017897605896, + "learning_rate": 2.5805270863836016e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.719251275062561, + "num_tokens": 36602471.0, + "step": 1411 + }, + { + "epoch": 0.15506259609048978, + "grad_norm": 2.400315046310425, + "learning_rate": 2.5823572474377746e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6991158127784729, + "num_tokens": 36624862.0, + "step": 1412 + }, + { + "epoch": 0.15517241379310345, + "grad_norm": 2.213747978210449, + "learning_rate": 2.5841874084919477e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6974704265594482, + "num_tokens": 36651929.0, + "step": 1413 + }, + { + "epoch": 0.1552822314957171, + "grad_norm": 2.224292039871216, + "learning_rate": 2.5860175695461202e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7049716114997864, + "num_tokens": 36676933.0, + "step": 1414 + }, + { + "epoch": 0.15539204919833077, + "grad_norm": 2.2941973209381104, + "learning_rate": 2.587847730600293e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.678737998008728, + "num_tokens": 36702439.0, + "step": 1415 + }, + { + "epoch": 0.15550186690094442, + "grad_norm": 2.5546810626983643, + "learning_rate": 2.589677891654466e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.7016631364822388, + "num_tokens": 36725759.0, + "step": 1416 + }, + { + "epoch": 0.1556116846035581, + "grad_norm": 2.34897518157959, + "learning_rate": 2.5915080527086384e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.6992653608322144, + "num_tokens": 36749790.0, + "step": 1417 + }, + { + "epoch": 0.15572150230617177, + "grad_norm": 2.2858028411865234, + "learning_rate": 2.5933382137628114e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7024091482162476, + "num_tokens": 36773879.0, + "step": 1418 + }, + { + "epoch": 0.1558313200087854, + "grad_norm": 2.134408473968506, + "learning_rate": 2.5951683748169844e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.689852237701416, + "num_tokens": 36800233.0, + "step": 1419 + }, + { + "epoch": 0.15594113771139909, + "grad_norm": 2.292659282684326, + "learning_rate": 2.596998535871157e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6940276622772217, + "num_tokens": 36824510.0, + "step": 1420 + }, + { + "epoch": 0.15605095541401273, + "grad_norm": 1.8717964887619019, + "learning_rate": 2.5988286969253296e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.6897857189178467, + "num_tokens": 36860322.0, + "step": 1421 + }, + { + "epoch": 0.1561607731166264, + "grad_norm": 2.272193670272827, + "learning_rate": 2.600658857979502e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6947736740112305, + "num_tokens": 36883275.0, + "step": 1422 + }, + { + "epoch": 0.15627059081924005, + "grad_norm": 2.3551383018493652, + "learning_rate": 2.602489019033675e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7090581059455872, + "num_tokens": 36904741.0, + "step": 1423 + }, + { + "epoch": 0.15638040852185373, + "grad_norm": 1.98539400100708, + "learning_rate": 2.6043191800878482e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.6779860854148865, + "num_tokens": 36937089.0, + "step": 1424 + }, + { + "epoch": 0.15649022622446737, + "grad_norm": 2.297515630722046, + "learning_rate": 2.606149341142021e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7169976234436035, + "num_tokens": 36959613.0, + "step": 1425 + }, + { + "epoch": 0.15660004392708105, + "grad_norm": 2.3174307346343994, + "learning_rate": 2.6079795021961934e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6991119384765625, + "num_tokens": 36982451.0, + "step": 1426 + }, + { + "epoch": 0.15670986162969472, + "grad_norm": 2.353186845779419, + "learning_rate": 2.609809663250366e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7050420045852661, + "num_tokens": 37005406.0, + "step": 1427 + }, + { + "epoch": 0.15681967933230836, + "grad_norm": 2.1577224731445312, + "learning_rate": 2.6116398243045386e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7111042737960815, + "num_tokens": 37031574.0, + "step": 1428 + }, + { + "epoch": 0.15692949703492204, + "grad_norm": 2.1189215183258057, + "learning_rate": 2.613469985358712e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7028323411941528, + "num_tokens": 37055163.0, + "step": 1429 + }, + { + "epoch": 0.15703931473753568, + "grad_norm": 2.2047629356384277, + "learning_rate": 2.6153001464128846e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.707983136177063, + "num_tokens": 37078690.0, + "step": 1430 + }, + { + "epoch": 0.15714913244014936, + "grad_norm": 2.064277410507202, + "learning_rate": 2.617130307467057e-06, + "loss": 1.1034, + "mean_token_accuracy": 0.6767673492431641, + "num_tokens": 37108849.0, + "step": 1431 + }, + { + "epoch": 0.157258950142763, + "grad_norm": 2.0284297466278076, + "learning_rate": 2.61896046852123e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6877479553222656, + "num_tokens": 37138463.0, + "step": 1432 + }, + { + "epoch": 0.15736876784537668, + "grad_norm": 2.2367143630981445, + "learning_rate": 2.6207906295754028e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7035460472106934, + "num_tokens": 37163034.0, + "step": 1433 + }, + { + "epoch": 0.15747858554799032, + "grad_norm": 2.0013084411621094, + "learning_rate": 2.622620790629576e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6893184185028076, + "num_tokens": 37193952.0, + "step": 1434 + }, + { + "epoch": 0.157588403250604, + "grad_norm": 2.170074939727783, + "learning_rate": 2.6244509516837484e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7108353972434998, + "num_tokens": 37219171.0, + "step": 1435 + }, + { + "epoch": 0.15769822095321767, + "grad_norm": 2.232684373855591, + "learning_rate": 2.6262811127379214e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6943187117576599, + "num_tokens": 37244150.0, + "step": 1436 + }, + { + "epoch": 0.15780803865583132, + "grad_norm": 2.661872625350952, + "learning_rate": 2.628111273792094e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.731339693069458, + "num_tokens": 37262362.0, + "step": 1437 + }, + { + "epoch": 0.157917856358445, + "grad_norm": 2.161461353302002, + "learning_rate": 2.6299414348462666e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.686684787273407, + "num_tokens": 37288481.0, + "step": 1438 + }, + { + "epoch": 0.15802767406105864, + "grad_norm": 2.0192887783050537, + "learning_rate": 2.631771595900439e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7058324217796326, + "num_tokens": 37321035.0, + "step": 1439 + }, + { + "epoch": 0.1581374917636723, + "grad_norm": 2.096848726272583, + "learning_rate": 2.6336017569546126e-06, + "loss": 1.0983, + "mean_token_accuracy": 0.6808937788009644, + "num_tokens": 37347449.0, + "step": 1440 + }, + { + "epoch": 0.15824730946628596, + "grad_norm": 2.099132776260376, + "learning_rate": 2.635431918008785e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.674557089805603, + "num_tokens": 37377460.0, + "step": 1441 + }, + { + "epoch": 0.15835712716889963, + "grad_norm": 2.5306613445281982, + "learning_rate": 2.6372620790629578e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7081570029258728, + "num_tokens": 37396511.0, + "step": 1442 + }, + { + "epoch": 0.15846694487151328, + "grad_norm": 1.972656488418579, + "learning_rate": 2.6390922401171303e-06, + "loss": 1.0946, + "mean_token_accuracy": 0.676482081413269, + "num_tokens": 37430860.0, + "step": 1443 + }, + { + "epoch": 0.15857676257412695, + "grad_norm": 2.0907578468322754, + "learning_rate": 2.640922401171303e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6869416236877441, + "num_tokens": 37460323.0, + "step": 1444 + }, + { + "epoch": 0.15868658027674062, + "grad_norm": 2.4631969928741455, + "learning_rate": 2.6427525622254764e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7037059664726257, + "num_tokens": 37482858.0, + "step": 1445 + }, + { + "epoch": 0.15879639797935427, + "grad_norm": 2.1252756118774414, + "learning_rate": 2.644582723279649e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.700869619846344, + "num_tokens": 37509510.0, + "step": 1446 + }, + { + "epoch": 0.15890621568196794, + "grad_norm": 2.2103524208068848, + "learning_rate": 2.6464128843338215e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6948665976524353, + "num_tokens": 37536815.0, + "step": 1447 + }, + { + "epoch": 0.1590160333845816, + "grad_norm": 2.2460386753082275, + "learning_rate": 2.648243045387994e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.7009316682815552, + "num_tokens": 37562293.0, + "step": 1448 + }, + { + "epoch": 0.15912585108719526, + "grad_norm": 2.271881341934204, + "learning_rate": 2.650073206442167e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7160661220550537, + "num_tokens": 37586741.0, + "step": 1449 + }, + { + "epoch": 0.1592356687898089, + "grad_norm": 2.1021642684936523, + "learning_rate": 2.65190336749634e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6990692019462585, + "num_tokens": 37613943.0, + "step": 1450 + }, + { + "epoch": 0.15934548649242258, + "grad_norm": 2.0242698192596436, + "learning_rate": 2.6537335285505127e-06, + "loss": 1.0823, + "mean_token_accuracy": 0.6787436008453369, + "num_tokens": 37644801.0, + "step": 1451 + }, + { + "epoch": 0.15945530419503623, + "grad_norm": 2.2851874828338623, + "learning_rate": 2.6555636896046853e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7093566060066223, + "num_tokens": 37666534.0, + "step": 1452 + }, + { + "epoch": 0.1595651218976499, + "grad_norm": 2.243915557861328, + "learning_rate": 2.6573938506588583e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7002880573272705, + "num_tokens": 37690095.0, + "step": 1453 + }, + { + "epoch": 0.15967493960026355, + "grad_norm": 2.1129794120788574, + "learning_rate": 2.659224011713031e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6797378063201904, + "num_tokens": 37718658.0, + "step": 1454 + }, + { + "epoch": 0.15978475730287722, + "grad_norm": 2.3510913848876953, + "learning_rate": 2.6610541727672035e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6833558082580566, + "num_tokens": 37743598.0, + "step": 1455 + }, + { + "epoch": 0.1598945750054909, + "grad_norm": 2.1636710166931152, + "learning_rate": 2.6628843338213765e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7179781198501587, + "num_tokens": 37768132.0, + "step": 1456 + }, + { + "epoch": 0.16000439270810454, + "grad_norm": 1.9546117782592773, + "learning_rate": 2.6647144948755495e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6925958395004272, + "num_tokens": 37801387.0, + "step": 1457 + }, + { + "epoch": 0.16011421041071822, + "grad_norm": 2.315795421600342, + "learning_rate": 2.666544655929722e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7022901773452759, + "num_tokens": 37827579.0, + "step": 1458 + }, + { + "epoch": 0.16022402811333186, + "grad_norm": 2.1902832984924316, + "learning_rate": 2.6683748169838947e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.703871488571167, + "num_tokens": 37852099.0, + "step": 1459 + }, + { + "epoch": 0.16033384581594554, + "grad_norm": 2.2182796001434326, + "learning_rate": 2.6702049780380673e-06, + "loss": 1.0555, + "mean_token_accuracy": 0.6800968050956726, + "num_tokens": 37878884.0, + "step": 1460 + }, + { + "epoch": 0.16044366351855918, + "grad_norm": 2.0974128246307373, + "learning_rate": 2.6720351390922407e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6984515190124512, + "num_tokens": 37905891.0, + "step": 1461 + }, + { + "epoch": 0.16055348122117286, + "grad_norm": 2.6234092712402344, + "learning_rate": 2.6738653001464133e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7057195901870728, + "num_tokens": 37927100.0, + "step": 1462 + }, + { + "epoch": 0.1606632989237865, + "grad_norm": 2.6072006225585938, + "learning_rate": 2.675695461200586e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6965261697769165, + "num_tokens": 37945287.0, + "step": 1463 + }, + { + "epoch": 0.16077311662640018, + "grad_norm": 2.28218936920166, + "learning_rate": 2.6775256222547585e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7303868532180786, + "num_tokens": 37967651.0, + "step": 1464 + }, + { + "epoch": 0.16088293432901385, + "grad_norm": 2.0145137310028076, + "learning_rate": 2.679355783308931e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6897598505020142, + "num_tokens": 38001800.0, + "step": 1465 + }, + { + "epoch": 0.1609927520316275, + "grad_norm": 2.3988068103790283, + "learning_rate": 2.6811859443631045e-06, + "loss": 1.1137, + "mean_token_accuracy": 0.6681543588638306, + "num_tokens": 38027901.0, + "step": 1466 + }, + { + "epoch": 0.16110256973424117, + "grad_norm": 2.6210289001464844, + "learning_rate": 2.683016105417277e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7115687727928162, + "num_tokens": 38046664.0, + "step": 1467 + }, + { + "epoch": 0.16121238743685481, + "grad_norm": 2.2199442386627197, + "learning_rate": 2.6848462664714497e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6805918216705322, + "num_tokens": 38071222.0, + "step": 1468 + }, + { + "epoch": 0.1613222051394685, + "grad_norm": 2.310394287109375, + "learning_rate": 2.6866764275256222e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7110298871994019, + "num_tokens": 38092672.0, + "step": 1469 + }, + { + "epoch": 0.16143202284208213, + "grad_norm": 2.2493669986724854, + "learning_rate": 2.6885065885797953e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6857689023017883, + "num_tokens": 38117730.0, + "step": 1470 + }, + { + "epoch": 0.1615418405446958, + "grad_norm": 2.0036861896514893, + "learning_rate": 2.690336749633968e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.690962553024292, + "num_tokens": 38147648.0, + "step": 1471 + }, + { + "epoch": 0.16165165824730945, + "grad_norm": 2.4037725925445557, + "learning_rate": 2.692166910688141e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.69370436668396, + "num_tokens": 38173420.0, + "step": 1472 + }, + { + "epoch": 0.16176147594992313, + "grad_norm": 2.2738940715789795, + "learning_rate": 2.6939970717423134e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7105502486228943, + "num_tokens": 38196439.0, + "step": 1473 + }, + { + "epoch": 0.1618712936525368, + "grad_norm": 2.161802053451538, + "learning_rate": 2.6958272327964865e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.69379061460495, + "num_tokens": 38219811.0, + "step": 1474 + }, + { + "epoch": 0.16198111135515045, + "grad_norm": 2.4074978828430176, + "learning_rate": 2.697657393850659e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7080605030059814, + "num_tokens": 38241769.0, + "step": 1475 + }, + { + "epoch": 0.16209092905776412, + "grad_norm": 2.1320602893829346, + "learning_rate": 2.6994875549048316e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7072888016700745, + "num_tokens": 38269796.0, + "step": 1476 + }, + { + "epoch": 0.16220074676037777, + "grad_norm": 2.4353175163269043, + "learning_rate": 2.7013177159590046e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6880637407302856, + "num_tokens": 38291742.0, + "step": 1477 + }, + { + "epoch": 0.16231056446299144, + "grad_norm": 2.1559484004974365, + "learning_rate": 2.7031478770131776e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7115877866744995, + "num_tokens": 38319062.0, + "step": 1478 + }, + { + "epoch": 0.1624203821656051, + "grad_norm": 2.0199618339538574, + "learning_rate": 2.7049780380673502e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6963120698928833, + "num_tokens": 38349640.0, + "step": 1479 + }, + { + "epoch": 0.16253019986821876, + "grad_norm": 2.4099433422088623, + "learning_rate": 2.706808199121523e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6826168298721313, + "num_tokens": 38372246.0, + "step": 1480 + }, + { + "epoch": 0.1626400175708324, + "grad_norm": 2.484513759613037, + "learning_rate": 2.7086383601756954e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7015805244445801, + "num_tokens": 38394180.0, + "step": 1481 + }, + { + "epoch": 0.16274983527344608, + "grad_norm": 2.1635923385620117, + "learning_rate": 2.710468521229869e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6880767345428467, + "num_tokens": 38420650.0, + "step": 1482 + }, + { + "epoch": 0.16285965297605975, + "grad_norm": 2.3749122619628906, + "learning_rate": 2.7122986822840414e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6848814487457275, + "num_tokens": 38444016.0, + "step": 1483 + }, + { + "epoch": 0.1629694706786734, + "grad_norm": 2.4687771797180176, + "learning_rate": 2.714128843338214e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7149032950401306, + "num_tokens": 38465802.0, + "step": 1484 + }, + { + "epoch": 0.16307928838128707, + "grad_norm": 2.3485612869262695, + "learning_rate": 2.7159590043923866e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6790962219238281, + "num_tokens": 38489517.0, + "step": 1485 + }, + { + "epoch": 0.16318910608390072, + "grad_norm": 2.177985906600952, + "learning_rate": 2.717789165446559e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.6849035620689392, + "num_tokens": 38515735.0, + "step": 1486 + }, + { + "epoch": 0.1632989237865144, + "grad_norm": 2.3413619995117188, + "learning_rate": 2.719619326500732e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7092251181602478, + "num_tokens": 38541234.0, + "step": 1487 + }, + { + "epoch": 0.16340874148912804, + "grad_norm": 2.3443264961242676, + "learning_rate": 2.721449487554905e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7221572995185852, + "num_tokens": 38564492.0, + "step": 1488 + }, + { + "epoch": 0.1635185591917417, + "grad_norm": 2.34098744392395, + "learning_rate": 2.723279648609078e-06, + "loss": 1.03, + "mean_token_accuracy": 0.682853102684021, + "num_tokens": 38588977.0, + "step": 1489 + }, + { + "epoch": 0.16362837689435536, + "grad_norm": 2.257901430130005, + "learning_rate": 2.7251098096632504e-06, + "loss": 1.093, + "mean_token_accuracy": 0.6759153604507446, + "num_tokens": 38615650.0, + "step": 1490 + }, + { + "epoch": 0.16373819459696903, + "grad_norm": 2.3543503284454346, + "learning_rate": 2.7269399707174234e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6751024127006531, + "num_tokens": 38642346.0, + "step": 1491 + }, + { + "epoch": 0.16384801229958268, + "grad_norm": 2.79250431060791, + "learning_rate": 2.728770131771596e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7390066385269165, + "num_tokens": 38659575.0, + "step": 1492 + }, + { + "epoch": 0.16395783000219635, + "grad_norm": 2.544329881668091, + "learning_rate": 2.730600292825769e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.702866792678833, + "num_tokens": 38681232.0, + "step": 1493 + }, + { + "epoch": 0.16406764770481003, + "grad_norm": 2.2478537559509277, + "learning_rate": 2.7324304538799416e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.7010893821716309, + "num_tokens": 38708816.0, + "step": 1494 + }, + { + "epoch": 0.16417746540742367, + "grad_norm": 2.32033109664917, + "learning_rate": 2.7342606149341146e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6866710782051086, + "num_tokens": 38733366.0, + "step": 1495 + }, + { + "epoch": 0.16428728311003735, + "grad_norm": 1.9994724988937378, + "learning_rate": 2.736090775988287e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.711756706237793, + "num_tokens": 38761810.0, + "step": 1496 + }, + { + "epoch": 0.164397100812651, + "grad_norm": 2.095846176147461, + "learning_rate": 2.7379209370424598e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6884886622428894, + "num_tokens": 38792470.0, + "step": 1497 + }, + { + "epoch": 0.16450691851526467, + "grad_norm": 2.1466989517211914, + "learning_rate": 2.7397510980966328e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6875504851341248, + "num_tokens": 38820143.0, + "step": 1498 + }, + { + "epoch": 0.1646167362178783, + "grad_norm": 2.2216110229492188, + "learning_rate": 2.7415812591508058e-06, + "loss": 0.994, + "mean_token_accuracy": 0.699878990650177, + "num_tokens": 38846071.0, + "step": 1499 + }, + { + "epoch": 0.16472655392049199, + "grad_norm": 2.1108171939849854, + "learning_rate": 2.7434114202049784e-06, + "loss": 1.136, + "mean_token_accuracy": 0.666589081287384, + "num_tokens": 38877105.0, + "step": 1500 + }, + { + "epoch": 0.16483637162310563, + "grad_norm": 2.5047380924224854, + "learning_rate": 2.745241581259151e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.6976335048675537, + "num_tokens": 38899406.0, + "step": 1501 + }, + { + "epoch": 0.1649461893257193, + "grad_norm": 2.268362283706665, + "learning_rate": 2.7470717423133235e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.711104154586792, + "num_tokens": 38924717.0, + "step": 1502 + }, + { + "epoch": 0.16505600702833298, + "grad_norm": 2.3498713970184326, + "learning_rate": 2.748901903367496e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6911882162094116, + "num_tokens": 38949909.0, + "step": 1503 + }, + { + "epoch": 0.16516582473094663, + "grad_norm": 2.4200994968414307, + "learning_rate": 2.7507320644216696e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6863524317741394, + "num_tokens": 38973529.0, + "step": 1504 + }, + { + "epoch": 0.1652756424335603, + "grad_norm": 2.524986982345581, + "learning_rate": 2.752562225475842e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7004796266555786, + "num_tokens": 38997949.0, + "step": 1505 + }, + { + "epoch": 0.16538546013617395, + "grad_norm": 2.6260011196136475, + "learning_rate": 2.7543923865300147e-06, + "loss": 1.1168, + "mean_token_accuracy": 0.6669563055038452, + "num_tokens": 39022632.0, + "step": 1506 + }, + { + "epoch": 0.16549527783878762, + "grad_norm": 2.287240505218506, + "learning_rate": 2.7562225475841873e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6918671131134033, + "num_tokens": 39047251.0, + "step": 1507 + }, + { + "epoch": 0.16560509554140126, + "grad_norm": 2.2132153511047363, + "learning_rate": 2.7580527086383603e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6973628401756287, + "num_tokens": 39073344.0, + "step": 1508 + }, + { + "epoch": 0.16571491324401494, + "grad_norm": 2.3461244106292725, + "learning_rate": 2.7598828696925333e-06, + "loss": 1.1051, + "mean_token_accuracy": 0.6758475303649902, + "num_tokens": 39099161.0, + "step": 1509 + }, + { + "epoch": 0.16582473094662858, + "grad_norm": 2.1190545558929443, + "learning_rate": 2.761713030746706e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6962766647338867, + "num_tokens": 39125449.0, + "step": 1510 + }, + { + "epoch": 0.16593454864924226, + "grad_norm": 2.046700954437256, + "learning_rate": 2.7635431918008785e-06, + "loss": 1.0811, + "mean_token_accuracy": 0.6787540316581726, + "num_tokens": 39156017.0, + "step": 1511 + }, + { + "epoch": 0.16604436635185593, + "grad_norm": 2.0531201362609863, + "learning_rate": 2.7653733528550515e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6775758862495422, + "num_tokens": 39186458.0, + "step": 1512 + }, + { + "epoch": 0.16615418405446958, + "grad_norm": 2.0739660263061523, + "learning_rate": 2.767203513909224e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6865783333778381, + "num_tokens": 39218287.0, + "step": 1513 + }, + { + "epoch": 0.16626400175708325, + "grad_norm": 2.419356346130371, + "learning_rate": 2.769033674963397e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.6983739137649536, + "num_tokens": 39240996.0, + "step": 1514 + }, + { + "epoch": 0.1663738194596969, + "grad_norm": 2.0914955139160156, + "learning_rate": 2.77086383601757e-06, + "loss": 1.1566, + "mean_token_accuracy": 0.6566485166549683, + "num_tokens": 39271371.0, + "step": 1515 + }, + { + "epoch": 0.16648363716231057, + "grad_norm": 2.11674427986145, + "learning_rate": 2.7726939970717427e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6871393918991089, + "num_tokens": 39299610.0, + "step": 1516 + }, + { + "epoch": 0.16659345486492422, + "grad_norm": 2.2099225521087646, + "learning_rate": 2.7745241581259153e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6815978288650513, + "num_tokens": 39327502.0, + "step": 1517 + }, + { + "epoch": 0.1667032725675379, + "grad_norm": 2.3944263458251953, + "learning_rate": 2.776354319180088e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.697371244430542, + "num_tokens": 39351916.0, + "step": 1518 + }, + { + "epoch": 0.16681309027015154, + "grad_norm": 2.659062623977661, + "learning_rate": 2.7781844802342605e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7060168385505676, + "num_tokens": 39370562.0, + "step": 1519 + }, + { + "epoch": 0.1669229079727652, + "grad_norm": 2.179893732070923, + "learning_rate": 2.780014641288434e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7199444770812988, + "num_tokens": 39394470.0, + "step": 1520 + }, + { + "epoch": 0.16703272567537888, + "grad_norm": 2.3406221866607666, + "learning_rate": 2.7818448023426065e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6896792054176331, + "num_tokens": 39419759.0, + "step": 1521 + }, + { + "epoch": 0.16714254337799253, + "grad_norm": 2.258524179458618, + "learning_rate": 2.783674963396779e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7036597728729248, + "num_tokens": 39446037.0, + "step": 1522 + }, + { + "epoch": 0.1672523610806062, + "grad_norm": 2.4026131629943848, + "learning_rate": 2.7855051244509517e-06, + "loss": 1.0855, + "mean_token_accuracy": 0.6766629219055176, + "num_tokens": 39471103.0, + "step": 1523 + }, + { + "epoch": 0.16736217878321985, + "grad_norm": 2.1544811725616455, + "learning_rate": 2.7873352855051243e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6765662431716919, + "num_tokens": 39499587.0, + "step": 1524 + }, + { + "epoch": 0.16747199648583352, + "grad_norm": 2.4095582962036133, + "learning_rate": 2.7891654465592977e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.689505398273468, + "num_tokens": 39522962.0, + "step": 1525 + }, + { + "epoch": 0.16758181418844717, + "grad_norm": 2.109687089920044, + "learning_rate": 2.7909956076134703e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7018101811408997, + "num_tokens": 39547851.0, + "step": 1526 + }, + { + "epoch": 0.16769163189106084, + "grad_norm": 2.168135166168213, + "learning_rate": 2.792825768667643e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6755383014678955, + "num_tokens": 39577310.0, + "step": 1527 + }, + { + "epoch": 0.1678014495936745, + "grad_norm": 2.395987033843994, + "learning_rate": 2.7946559297218154e-06, + "loss": 1.013, + "mean_token_accuracy": 0.705145001411438, + "num_tokens": 39599725.0, + "step": 1528 + }, + { + "epoch": 0.16791126729628816, + "grad_norm": 2.1941020488739014, + "learning_rate": 2.7964860907759885e-06, + "loss": 1.0631, + "mean_token_accuracy": 0.6740774512290955, + "num_tokens": 39626992.0, + "step": 1529 + }, + { + "epoch": 0.1680210849989018, + "grad_norm": 2.300809383392334, + "learning_rate": 2.7983162518301615e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6913208365440369, + "num_tokens": 39653520.0, + "step": 1530 + }, + { + "epoch": 0.16813090270151548, + "grad_norm": 2.3967623710632324, + "learning_rate": 2.800146412884334e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7241743803024292, + "num_tokens": 39675485.0, + "step": 1531 + }, + { + "epoch": 0.16824072040412916, + "grad_norm": 2.1288671493530273, + "learning_rate": 2.801976573938507e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6970498561859131, + "num_tokens": 39703344.0, + "step": 1532 + }, + { + "epoch": 0.1683505381067428, + "grad_norm": 2.044753074645996, + "learning_rate": 2.8038067349926796e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6890419721603394, + "num_tokens": 39728863.0, + "step": 1533 + }, + { + "epoch": 0.16846035580935648, + "grad_norm": 2.036008834838867, + "learning_rate": 2.8056368960468522e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.688673734664917, + "num_tokens": 39758283.0, + "step": 1534 + }, + { + "epoch": 0.16857017351197012, + "grad_norm": 2.047936201095581, + "learning_rate": 2.807467057101025e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7079783082008362, + "num_tokens": 39785143.0, + "step": 1535 + }, + { + "epoch": 0.1686799912145838, + "grad_norm": 1.9880812168121338, + "learning_rate": 2.8092972181551983e-06, + "loss": 1.1012, + "mean_token_accuracy": 0.6723222732543945, + "num_tokens": 39817247.0, + "step": 1536 + }, + { + "epoch": 0.16878980891719744, + "grad_norm": 2.3282077312469482, + "learning_rate": 2.811127379209371e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6962630748748779, + "num_tokens": 39840302.0, + "step": 1537 + }, + { + "epoch": 0.16889962661981112, + "grad_norm": 2.316814422607422, + "learning_rate": 2.8129575402635434e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6938046216964722, + "num_tokens": 39863917.0, + "step": 1538 + }, + { + "epoch": 0.16900944432242476, + "grad_norm": 2.2396926879882812, + "learning_rate": 2.814787701317716e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6892727613449097, + "num_tokens": 39888976.0, + "step": 1539 + }, + { + "epoch": 0.16911926202503844, + "grad_norm": 1.9575475454330444, + "learning_rate": 2.8166178623718886e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7257990837097168, + "num_tokens": 39919144.0, + "step": 1540 + }, + { + "epoch": 0.1692290797276521, + "grad_norm": 2.3493523597717285, + "learning_rate": 2.818448023426062e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7187691330909729, + "num_tokens": 39941248.0, + "step": 1541 + }, + { + "epoch": 0.16933889743026576, + "grad_norm": 2.392916440963745, + "learning_rate": 2.8202781844802346e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6885866522789001, + "num_tokens": 39964161.0, + "step": 1542 + }, + { + "epoch": 0.16944871513287943, + "grad_norm": 2.065659999847412, + "learning_rate": 2.822108345534407e-06, + "loss": 1.0834, + "mean_token_accuracy": 0.6775798797607422, + "num_tokens": 39993366.0, + "step": 1543 + }, + { + "epoch": 0.16955853283549308, + "grad_norm": 2.2595772743225098, + "learning_rate": 2.82393850658858e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.697597861289978, + "num_tokens": 40019746.0, + "step": 1544 + }, + { + "epoch": 0.16966835053810675, + "grad_norm": 2.0884056091308594, + "learning_rate": 2.8257686676427524e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6931101679801941, + "num_tokens": 40047064.0, + "step": 1545 + }, + { + "epoch": 0.1697781682407204, + "grad_norm": 2.329158067703247, + "learning_rate": 2.827598828696926e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.6854268312454224, + "num_tokens": 40071457.0, + "step": 1546 + }, + { + "epoch": 0.16988798594333407, + "grad_norm": 2.2462568283081055, + "learning_rate": 2.8294289897510984e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6784666776657104, + "num_tokens": 40096513.0, + "step": 1547 + }, + { + "epoch": 0.16999780364594771, + "grad_norm": 2.2917017936706543, + "learning_rate": 2.831259150805271e-06, + "loss": 1.1179, + "mean_token_accuracy": 0.6736029386520386, + "num_tokens": 40123154.0, + "step": 1548 + }, + { + "epoch": 0.1701076213485614, + "grad_norm": 2.3530728816986084, + "learning_rate": 2.833089311859444e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6857683658599854, + "num_tokens": 40147223.0, + "step": 1549 + }, + { + "epoch": 0.17021743905117506, + "grad_norm": 2.5071775913238525, + "learning_rate": 2.8349194729136166e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6940321922302246, + "num_tokens": 40167668.0, + "step": 1550 + }, + { + "epoch": 0.1703272567537887, + "grad_norm": 2.2934019565582275, + "learning_rate": 2.836749633967789e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6911518573760986, + "num_tokens": 40191618.0, + "step": 1551 + }, + { + "epoch": 0.17043707445640238, + "grad_norm": 2.0707056522369385, + "learning_rate": 2.838579795021962e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6835110783576965, + "num_tokens": 40219578.0, + "step": 1552 + }, + { + "epoch": 0.17054689215901603, + "grad_norm": 2.7457950115203857, + "learning_rate": 2.840409956076135e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7043265104293823, + "num_tokens": 40237532.0, + "step": 1553 + }, + { + "epoch": 0.1706567098616297, + "grad_norm": 2.369720697402954, + "learning_rate": 2.8422401171303078e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6845760345458984, + "num_tokens": 40261906.0, + "step": 1554 + }, + { + "epoch": 0.17076652756424335, + "grad_norm": 2.3212482929229736, + "learning_rate": 2.8440702781844804e-06, + "loss": 1.12, + "mean_token_accuracy": 0.6639068126678467, + "num_tokens": 40288916.0, + "step": 1555 + }, + { + "epoch": 0.17087634526685702, + "grad_norm": 2.1371195316314697, + "learning_rate": 2.845900439238653e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6984075903892517, + "num_tokens": 40314949.0, + "step": 1556 + }, + { + "epoch": 0.17098616296947067, + "grad_norm": 2.0957350730895996, + "learning_rate": 2.8477306002928264e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6869897246360779, + "num_tokens": 40343427.0, + "step": 1557 + }, + { + "epoch": 0.17109598067208434, + "grad_norm": 2.2535934448242188, + "learning_rate": 2.849560761346999e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.6935319900512695, + "num_tokens": 40366447.0, + "step": 1558 + }, + { + "epoch": 0.17120579837469801, + "grad_norm": 2.312224864959717, + "learning_rate": 2.8513909224011716e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6838529109954834, + "num_tokens": 40391765.0, + "step": 1559 + }, + { + "epoch": 0.17131561607731166, + "grad_norm": 2.2818443775177, + "learning_rate": 2.853221083455344e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.698883056640625, + "num_tokens": 40415065.0, + "step": 1560 + }, + { + "epoch": 0.17142543377992533, + "grad_norm": 2.617403268814087, + "learning_rate": 2.8550512445095167e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7212207317352295, + "num_tokens": 40434950.0, + "step": 1561 + }, + { + "epoch": 0.17153525148253898, + "grad_norm": 2.4538421630859375, + "learning_rate": 2.85688140556369e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6961959600448608, + "num_tokens": 40455229.0, + "step": 1562 + }, + { + "epoch": 0.17164506918515265, + "grad_norm": 2.0820419788360596, + "learning_rate": 2.8587115666178628e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6875244975090027, + "num_tokens": 40486420.0, + "step": 1563 + }, + { + "epoch": 0.1717548868877663, + "grad_norm": 2.1913814544677734, + "learning_rate": 2.8605417276720353e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6884431838989258, + "num_tokens": 40514334.0, + "step": 1564 + }, + { + "epoch": 0.17186470459037997, + "grad_norm": 2.124478340148926, + "learning_rate": 2.862371888726208e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7068994045257568, + "num_tokens": 40540773.0, + "step": 1565 + }, + { + "epoch": 0.17197452229299362, + "grad_norm": 2.1058921813964844, + "learning_rate": 2.864202049780381e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6878135204315186, + "num_tokens": 40568934.0, + "step": 1566 + }, + { + "epoch": 0.1720843399956073, + "grad_norm": 2.180572748184204, + "learning_rate": 2.8660322108345535e-06, + "loss": 1.1258, + "mean_token_accuracy": 0.6683669686317444, + "num_tokens": 40598330.0, + "step": 1567 + }, + { + "epoch": 0.17219415769822094, + "grad_norm": 2.030095100402832, + "learning_rate": 2.8678623718887265e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7057343125343323, + "num_tokens": 40631135.0, + "step": 1568 + }, + { + "epoch": 0.1723039754008346, + "grad_norm": 2.240778684616089, + "learning_rate": 2.869692532942899e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6929984092712402, + "num_tokens": 40656345.0, + "step": 1569 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 2.1889450550079346, + "learning_rate": 2.871522693997072e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6974100470542908, + "num_tokens": 40682328.0, + "step": 1570 + }, + { + "epoch": 0.17252361080606193, + "grad_norm": 2.120103120803833, + "learning_rate": 2.8733528550512447e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6934479475021362, + "num_tokens": 40710682.0, + "step": 1571 + }, + { + "epoch": 0.1726334285086756, + "grad_norm": 2.290236234664917, + "learning_rate": 2.8751830161054173e-06, + "loss": 1.0918, + "mean_token_accuracy": 0.6744712591171265, + "num_tokens": 40735484.0, + "step": 1572 + }, + { + "epoch": 0.17274324621128925, + "grad_norm": 2.110139846801758, + "learning_rate": 2.8770131771595903e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7159264087677002, + "num_tokens": 40761613.0, + "step": 1573 + }, + { + "epoch": 0.17285306391390293, + "grad_norm": 2.107703924179077, + "learning_rate": 2.8788433382137633e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7173802256584167, + "num_tokens": 40788325.0, + "step": 1574 + }, + { + "epoch": 0.17296288161651657, + "grad_norm": 2.203383207321167, + "learning_rate": 2.880673499267936e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6902881860733032, + "num_tokens": 40815423.0, + "step": 1575 + }, + { + "epoch": 0.17307269931913025, + "grad_norm": 2.250103712081909, + "learning_rate": 2.8825036603221085e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6823422312736511, + "num_tokens": 40841881.0, + "step": 1576 + }, + { + "epoch": 0.1731825170217439, + "grad_norm": 1.94341242313385, + "learning_rate": 2.884333821376281e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.693337082862854, + "num_tokens": 40873147.0, + "step": 1577 + }, + { + "epoch": 0.17329233472435757, + "grad_norm": 2.1588432788848877, + "learning_rate": 2.8861639824304545e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6960384845733643, + "num_tokens": 40898361.0, + "step": 1578 + }, + { + "epoch": 0.17340215242697124, + "grad_norm": 2.1430320739746094, + "learning_rate": 2.887994143484627e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7041295766830444, + "num_tokens": 40925613.0, + "step": 1579 + }, + { + "epoch": 0.17351197012958489, + "grad_norm": 2.173098564147949, + "learning_rate": 2.8898243045387997e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.6981871128082275, + "num_tokens": 40949831.0, + "step": 1580 + }, + { + "epoch": 0.17362178783219856, + "grad_norm": 2.12495756149292, + "learning_rate": 2.8916544655929723e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.6912436485290527, + "num_tokens": 40978300.0, + "step": 1581 + }, + { + "epoch": 0.1737316055348122, + "grad_norm": 2.2030155658721924, + "learning_rate": 2.893484626647145e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6972644329071045, + "num_tokens": 41005198.0, + "step": 1582 + }, + { + "epoch": 0.17384142323742588, + "grad_norm": 2.058119535446167, + "learning_rate": 2.895314787701318e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7110321521759033, + "num_tokens": 41032666.0, + "step": 1583 + }, + { + "epoch": 0.17395124094003953, + "grad_norm": 2.0595951080322266, + "learning_rate": 2.897144948755491e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7048076391220093, + "num_tokens": 41060591.0, + "step": 1584 + }, + { + "epoch": 0.1740610586426532, + "grad_norm": 2.018462896347046, + "learning_rate": 2.8989751098096635e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6878602504730225, + "num_tokens": 41094295.0, + "step": 1585 + }, + { + "epoch": 0.17417087634526685, + "grad_norm": 2.5230064392089844, + "learning_rate": 2.900805270863836e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6956015825271606, + "num_tokens": 41116363.0, + "step": 1586 + }, + { + "epoch": 0.17428069404788052, + "grad_norm": 2.0071990489959717, + "learning_rate": 2.902635431918009e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6825167536735535, + "num_tokens": 41147723.0, + "step": 1587 + }, + { + "epoch": 0.1743905117504942, + "grad_norm": 2.2744569778442383, + "learning_rate": 2.9044655929721817e-06, + "loss": 1.0766, + "mean_token_accuracy": 0.6757466793060303, + "num_tokens": 41172355.0, + "step": 1588 + }, + { + "epoch": 0.17450032945310784, + "grad_norm": 2.0774409770965576, + "learning_rate": 2.9062957540263547e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6799015998840332, + "num_tokens": 41200370.0, + "step": 1589 + }, + { + "epoch": 0.1746101471557215, + "grad_norm": 2.4555716514587402, + "learning_rate": 2.9081259150805273e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.694113552570343, + "num_tokens": 41221123.0, + "step": 1590 + }, + { + "epoch": 0.17471996485833516, + "grad_norm": 2.261439085006714, + "learning_rate": 2.9099560761347003e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7183189392089844, + "num_tokens": 41246997.0, + "step": 1591 + }, + { + "epoch": 0.17482978256094883, + "grad_norm": 2.0980849266052246, + "learning_rate": 2.911786237188873e-06, + "loss": 1.1469, + "mean_token_accuracy": 0.6695288419723511, + "num_tokens": 41278634.0, + "step": 1592 + }, + { + "epoch": 0.17493960026356248, + "grad_norm": 2.4096148014068604, + "learning_rate": 2.9136163982430454e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7009278535842896, + "num_tokens": 41300328.0, + "step": 1593 + }, + { + "epoch": 0.17504941796617615, + "grad_norm": 2.308710813522339, + "learning_rate": 2.9154465592972184e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7104536294937134, + "num_tokens": 41326391.0, + "step": 1594 + }, + { + "epoch": 0.1751592356687898, + "grad_norm": 2.015385627746582, + "learning_rate": 2.9172767203513915e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7145084142684937, + "num_tokens": 41353485.0, + "step": 1595 + }, + { + "epoch": 0.17526905337140347, + "grad_norm": 2.242550849914551, + "learning_rate": 2.919106881405564e-06, + "loss": 1.026, + "mean_token_accuracy": 0.689821720123291, + "num_tokens": 41380722.0, + "step": 1596 + }, + { + "epoch": 0.17537887107401715, + "grad_norm": 2.056703805923462, + "learning_rate": 2.9209370424597366e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7035424709320068, + "num_tokens": 41410234.0, + "step": 1597 + }, + { + "epoch": 0.1754886887766308, + "grad_norm": 2.1686699390411377, + "learning_rate": 2.9227672035139092e-06, + "loss": 1.0821, + "mean_token_accuracy": 0.6799809336662292, + "num_tokens": 41438126.0, + "step": 1598 + }, + { + "epoch": 0.17559850647924446, + "grad_norm": 2.17712140083313, + "learning_rate": 2.924597364568082e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6908514499664307, + "num_tokens": 41465200.0, + "step": 1599 + }, + { + "epoch": 0.1757083241818581, + "grad_norm": 2.2164552211761475, + "learning_rate": 2.9264275256222552e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6862298250198364, + "num_tokens": 41491310.0, + "step": 1600 + }, + { + "epoch": 0.17581814188447178, + "grad_norm": 2.1355926990509033, + "learning_rate": 2.928257686676428e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6913068294525146, + "num_tokens": 41518929.0, + "step": 1601 + }, + { + "epoch": 0.17592795958708543, + "grad_norm": 2.3055408000946045, + "learning_rate": 2.9300878477306004e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6817469000816345, + "num_tokens": 41542283.0, + "step": 1602 + }, + { + "epoch": 0.1760377772896991, + "grad_norm": 2.3508358001708984, + "learning_rate": 2.931918008784773e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.7013658881187439, + "num_tokens": 41564122.0, + "step": 1603 + }, + { + "epoch": 0.17614759499231275, + "grad_norm": 1.9199676513671875, + "learning_rate": 2.933748169838946e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.686355710029602, + "num_tokens": 41596726.0, + "step": 1604 + }, + { + "epoch": 0.17625741269492642, + "grad_norm": 2.216503858566284, + "learning_rate": 2.935578330893119e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6967681646347046, + "num_tokens": 41622673.0, + "step": 1605 + }, + { + "epoch": 0.17636723039754007, + "grad_norm": 2.2793045043945312, + "learning_rate": 2.9374084919472916e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6885421276092529, + "num_tokens": 41647658.0, + "step": 1606 + }, + { + "epoch": 0.17647704810015374, + "grad_norm": 2.057997226715088, + "learning_rate": 2.939238653001464e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6845511198043823, + "num_tokens": 41677313.0, + "step": 1607 + }, + { + "epoch": 0.17658686580276742, + "grad_norm": 1.9891082048416138, + "learning_rate": 2.941068814055637e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6967030167579651, + "num_tokens": 41707969.0, + "step": 1608 + }, + { + "epoch": 0.17669668350538106, + "grad_norm": 2.3359036445617676, + "learning_rate": 2.9428989751098098e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6772107481956482, + "num_tokens": 41733125.0, + "step": 1609 + }, + { + "epoch": 0.17680650120799474, + "grad_norm": 2.0212416648864746, + "learning_rate": 2.944729136163983e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6954628825187683, + "num_tokens": 41762159.0, + "step": 1610 + }, + { + "epoch": 0.17691631891060838, + "grad_norm": 2.267965316772461, + "learning_rate": 2.9465592972181554e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7149994373321533, + "num_tokens": 41786749.0, + "step": 1611 + }, + { + "epoch": 0.17702613661322206, + "grad_norm": 2.152148723602295, + "learning_rate": 2.9483894582723284e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6998703479766846, + "num_tokens": 41814633.0, + "step": 1612 + }, + { + "epoch": 0.1771359543158357, + "grad_norm": 2.0111935138702393, + "learning_rate": 2.950219619326501e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7016980051994324, + "num_tokens": 41845628.0, + "step": 1613 + }, + { + "epoch": 0.17724577201844938, + "grad_norm": 2.37174391746521, + "learning_rate": 2.9520497803806736e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7187953591346741, + "num_tokens": 41866599.0, + "step": 1614 + }, + { + "epoch": 0.17735558972106302, + "grad_norm": 1.8913394212722778, + "learning_rate": 2.953879941434846e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6968308687210083, + "num_tokens": 41901939.0, + "step": 1615 + }, + { + "epoch": 0.1774654074236767, + "grad_norm": 2.2385880947113037, + "learning_rate": 2.9557101024890196e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7026015520095825, + "num_tokens": 41928251.0, + "step": 1616 + }, + { + "epoch": 0.17757522512629037, + "grad_norm": 2.330214023590088, + "learning_rate": 2.957540263543192e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7038345336914062, + "num_tokens": 41951873.0, + "step": 1617 + }, + { + "epoch": 0.17768504282890402, + "grad_norm": 2.1326096057891846, + "learning_rate": 2.9593704245973648e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.713241457939148, + "num_tokens": 41975346.0, + "step": 1618 + }, + { + "epoch": 0.1777948605315177, + "grad_norm": 2.3266234397888184, + "learning_rate": 2.9612005856515373e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6999162435531616, + "num_tokens": 41999161.0, + "step": 1619 + }, + { + "epoch": 0.17790467823413134, + "grad_norm": 2.186534881591797, + "learning_rate": 2.96303074670571e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6972827911376953, + "num_tokens": 42026962.0, + "step": 1620 + }, + { + "epoch": 0.178014495936745, + "grad_norm": 2.4225265979766846, + "learning_rate": 2.9648609077598834e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6878612041473389, + "num_tokens": 42050651.0, + "step": 1621 + }, + { + "epoch": 0.17812431363935866, + "grad_norm": 2.1076247692108154, + "learning_rate": 2.966691068814056e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6790317296981812, + "num_tokens": 42079390.0, + "step": 1622 + }, + { + "epoch": 0.17823413134197233, + "grad_norm": 2.4026894569396973, + "learning_rate": 2.9685212298682285e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.688254177570343, + "num_tokens": 42102611.0, + "step": 1623 + }, + { + "epoch": 0.17834394904458598, + "grad_norm": 2.1603033542633057, + "learning_rate": 2.970351390922401e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7096785306930542, + "num_tokens": 42130997.0, + "step": 1624 + }, + { + "epoch": 0.17845376674719965, + "grad_norm": 2.223130464553833, + "learning_rate": 2.972181551976574e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6866651773452759, + "num_tokens": 42157097.0, + "step": 1625 + }, + { + "epoch": 0.17856358444981332, + "grad_norm": 2.3802196979522705, + "learning_rate": 2.974011713030747e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7120363712310791, + "num_tokens": 42181215.0, + "step": 1626 + }, + { + "epoch": 0.17867340215242697, + "grad_norm": 1.9399644136428833, + "learning_rate": 2.9758418740849197e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7085534930229187, + "num_tokens": 42211678.0, + "step": 1627 + }, + { + "epoch": 0.17878321985504064, + "grad_norm": 2.3958003520965576, + "learning_rate": 2.9776720351390927e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6867527961730957, + "num_tokens": 42232738.0, + "step": 1628 + }, + { + "epoch": 0.1788930375576543, + "grad_norm": 2.3442416191101074, + "learning_rate": 2.9795021961932653e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6871390342712402, + "num_tokens": 42255847.0, + "step": 1629 + }, + { + "epoch": 0.17900285526026796, + "grad_norm": 2.3883957862854004, + "learning_rate": 2.981332357247438e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7014085054397583, + "num_tokens": 42279032.0, + "step": 1630 + }, + { + "epoch": 0.1791126729628816, + "grad_norm": 2.267061471939087, + "learning_rate": 2.9831625183016105e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6896479725837708, + "num_tokens": 42302746.0, + "step": 1631 + }, + { + "epoch": 0.17922249066549528, + "grad_norm": 2.1900746822357178, + "learning_rate": 2.984992679355784e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7047721147537231, + "num_tokens": 42328746.0, + "step": 1632 + }, + { + "epoch": 0.17933230836810893, + "grad_norm": 2.6297714710235596, + "learning_rate": 2.9868228404099565e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7045285701751709, + "num_tokens": 42348322.0, + "step": 1633 + }, + { + "epoch": 0.1794421260707226, + "grad_norm": 2.201323986053467, + "learning_rate": 2.988653001464129e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7252564430236816, + "num_tokens": 42370433.0, + "step": 1634 + }, + { + "epoch": 0.17955194377333628, + "grad_norm": 2.088491678237915, + "learning_rate": 2.9904831625183017e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6982265710830688, + "num_tokens": 42397735.0, + "step": 1635 + }, + { + "epoch": 0.17966176147594992, + "grad_norm": 2.130380153656006, + "learning_rate": 2.9923133235724743e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.701711893081665, + "num_tokens": 42425829.0, + "step": 1636 + }, + { + "epoch": 0.1797715791785636, + "grad_norm": 2.0121560096740723, + "learning_rate": 2.9941434846266477e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6727714538574219, + "num_tokens": 42457395.0, + "step": 1637 + }, + { + "epoch": 0.17988139688117724, + "grad_norm": 2.360743761062622, + "learning_rate": 2.9959736456808203e-06, + "loss": 1.0904, + "mean_token_accuracy": 0.6762146949768066, + "num_tokens": 42481041.0, + "step": 1638 + }, + { + "epoch": 0.17999121458379091, + "grad_norm": 1.9135502576828003, + "learning_rate": 2.997803806734993e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6788856387138367, + "num_tokens": 42516363.0, + "step": 1639 + }, + { + "epoch": 0.18010103228640456, + "grad_norm": 2.1203997135162354, + "learning_rate": 2.9996339677891655e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6904340386390686, + "num_tokens": 42543729.0, + "step": 1640 + }, + { + "epoch": 0.18021084998901823, + "grad_norm": 2.157008171081543, + "learning_rate": 3.001464128843338e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6791058778762817, + "num_tokens": 42572446.0, + "step": 1641 + }, + { + "epoch": 0.18032066769163188, + "grad_norm": 2.507128953933716, + "learning_rate": 3.0032942898975115e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.702054500579834, + "num_tokens": 42590881.0, + "step": 1642 + }, + { + "epoch": 0.18043048539424555, + "grad_norm": 2.046281099319458, + "learning_rate": 3.005124450951684e-06, + "loss": 1.111, + "mean_token_accuracy": 0.6690924167633057, + "num_tokens": 42622658.0, + "step": 1643 + }, + { + "epoch": 0.1805403030968592, + "grad_norm": 2.2646496295928955, + "learning_rate": 3.0069546120058567e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7046193480491638, + "num_tokens": 42648535.0, + "step": 1644 + }, + { + "epoch": 0.18065012079947287, + "grad_norm": 2.2067949771881104, + "learning_rate": 3.0087847730600297e-06, + "loss": 1.149, + "mean_token_accuracy": 0.6640856266021729, + "num_tokens": 42677745.0, + "step": 1645 + }, + { + "epoch": 0.18075993850208655, + "grad_norm": 2.156942844390869, + "learning_rate": 3.0106149341142023e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.6732012629508972, + "num_tokens": 42706544.0, + "step": 1646 + }, + { + "epoch": 0.1808697562047002, + "grad_norm": 1.7882695198059082, + "learning_rate": 3.012445095168375e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6881676912307739, + "num_tokens": 42743276.0, + "step": 1647 + }, + { + "epoch": 0.18097957390731387, + "grad_norm": 2.098496675491333, + "learning_rate": 3.014275256222548e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7025209069252014, + "num_tokens": 42770651.0, + "step": 1648 + }, + { + "epoch": 0.1810893916099275, + "grad_norm": 2.4447686672210693, + "learning_rate": 3.016105417276721e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6938438415527344, + "num_tokens": 42792649.0, + "step": 1649 + }, + { + "epoch": 0.1811992093125412, + "grad_norm": 2.008249282836914, + "learning_rate": 3.0179355783308935e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7154269218444824, + "num_tokens": 42820868.0, + "step": 1650 + }, + { + "epoch": 0.18130902701515483, + "grad_norm": 2.5223236083984375, + "learning_rate": 3.019765739385066e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6862491369247437, + "num_tokens": 42841875.0, + "step": 1651 + }, + { + "epoch": 0.1814188447177685, + "grad_norm": 2.517122745513916, + "learning_rate": 3.0215959004392386e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6746059060096741, + "num_tokens": 42864576.0, + "step": 1652 + }, + { + "epoch": 0.18152866242038215, + "grad_norm": 1.964185118675232, + "learning_rate": 3.023426061493412e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6869503259658813, + "num_tokens": 42897077.0, + "step": 1653 + }, + { + "epoch": 0.18163848012299583, + "grad_norm": 2.2969610691070557, + "learning_rate": 3.0252562225475847e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6829937696456909, + "num_tokens": 42921381.0, + "step": 1654 + }, + { + "epoch": 0.1817482978256095, + "grad_norm": 2.4583396911621094, + "learning_rate": 3.0270863836017572e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7214930653572083, + "num_tokens": 42941255.0, + "step": 1655 + }, + { + "epoch": 0.18185811552822315, + "grad_norm": 2.1855249404907227, + "learning_rate": 3.02891654465593e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6739941239356995, + "num_tokens": 42968236.0, + "step": 1656 + }, + { + "epoch": 0.18196793323083682, + "grad_norm": 2.0336802005767822, + "learning_rate": 3.0307467057101024e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6943436861038208, + "num_tokens": 42994938.0, + "step": 1657 + }, + { + "epoch": 0.18207775093345047, + "grad_norm": 2.0354504585266113, + "learning_rate": 3.032576866764276e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6952322721481323, + "num_tokens": 43024661.0, + "step": 1658 + }, + { + "epoch": 0.18218756863606414, + "grad_norm": 2.306112051010132, + "learning_rate": 3.0344070278184484e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6985463500022888, + "num_tokens": 43050851.0, + "step": 1659 + }, + { + "epoch": 0.18229738633867779, + "grad_norm": 2.6589791774749756, + "learning_rate": 3.036237188872621e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7083687782287598, + "num_tokens": 43070091.0, + "step": 1660 + }, + { + "epoch": 0.18240720404129146, + "grad_norm": 2.12654972076416, + "learning_rate": 3.0380673499267936e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6846668124198914, + "num_tokens": 43101441.0, + "step": 1661 + }, + { + "epoch": 0.1825170217439051, + "grad_norm": 2.464442014694214, + "learning_rate": 3.0398975109809666e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6908200979232788, + "num_tokens": 43122183.0, + "step": 1662 + }, + { + "epoch": 0.18262683944651878, + "grad_norm": 2.2975070476531982, + "learning_rate": 3.041727672035139e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6978121995925903, + "num_tokens": 43144803.0, + "step": 1663 + }, + { + "epoch": 0.18273665714913245, + "grad_norm": 2.0334842205047607, + "learning_rate": 3.043557833089312e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7015068531036377, + "num_tokens": 43173759.0, + "step": 1664 + }, + { + "epoch": 0.1828464748517461, + "grad_norm": 2.3425416946411133, + "learning_rate": 3.045387994143485e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7091519236564636, + "num_tokens": 43196409.0, + "step": 1665 + }, + { + "epoch": 0.18295629255435977, + "grad_norm": 2.1251752376556396, + "learning_rate": 3.047218155197658e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7148640155792236, + "num_tokens": 43222680.0, + "step": 1666 + }, + { + "epoch": 0.18306611025697342, + "grad_norm": 2.0688836574554443, + "learning_rate": 3.0490483162518304e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7077338695526123, + "num_tokens": 43250767.0, + "step": 1667 + }, + { + "epoch": 0.1831759279595871, + "grad_norm": 2.057610034942627, + "learning_rate": 3.050878477306003e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6924569606781006, + "num_tokens": 43282283.0, + "step": 1668 + }, + { + "epoch": 0.18328574566220074, + "grad_norm": 2.2354910373687744, + "learning_rate": 3.052708638360176e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6825204491615295, + "num_tokens": 43311972.0, + "step": 1669 + }, + { + "epoch": 0.1833955633648144, + "grad_norm": 2.035853862762451, + "learning_rate": 3.054538799414349e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7055137157440186, + "num_tokens": 43341231.0, + "step": 1670 + }, + { + "epoch": 0.18350538106742806, + "grad_norm": 2.2507987022399902, + "learning_rate": 3.0563689604685216e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6876027584075928, + "num_tokens": 43365068.0, + "step": 1671 + }, + { + "epoch": 0.18361519877004173, + "grad_norm": 2.064499616622925, + "learning_rate": 3.058199121522694e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6924060583114624, + "num_tokens": 43397258.0, + "step": 1672 + }, + { + "epoch": 0.1837250164726554, + "grad_norm": 2.238001585006714, + "learning_rate": 3.0600292825768668e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6818021535873413, + "num_tokens": 43423227.0, + "step": 1673 + }, + { + "epoch": 0.18383483417526905, + "grad_norm": 2.1098265647888184, + "learning_rate": 3.06185944363104e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7269431948661804, + "num_tokens": 43446862.0, + "step": 1674 + }, + { + "epoch": 0.18394465187788273, + "grad_norm": 2.474803924560547, + "learning_rate": 3.0636896046852128e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7048572301864624, + "num_tokens": 43467582.0, + "step": 1675 + }, + { + "epoch": 0.18405446958049637, + "grad_norm": 2.0515902042388916, + "learning_rate": 3.0655197657393854e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6939406991004944, + "num_tokens": 43495547.0, + "step": 1676 + }, + { + "epoch": 0.18416428728311005, + "grad_norm": 2.293701171875, + "learning_rate": 3.067349926793558e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6916329860687256, + "num_tokens": 43519620.0, + "step": 1677 + }, + { + "epoch": 0.1842741049857237, + "grad_norm": 2.1076080799102783, + "learning_rate": 3.0691800878477305e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7079637050628662, + "num_tokens": 43545911.0, + "step": 1678 + }, + { + "epoch": 0.18438392268833736, + "grad_norm": 2.134338140487671, + "learning_rate": 3.0710102489019036e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6775963306427002, + "num_tokens": 43573136.0, + "step": 1679 + }, + { + "epoch": 0.184493740390951, + "grad_norm": 2.033724308013916, + "learning_rate": 3.0728404099560766e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6772313117980957, + "num_tokens": 43604295.0, + "step": 1680 + }, + { + "epoch": 0.18460355809356468, + "grad_norm": 2.001376152038574, + "learning_rate": 3.074670571010249e-06, + "loss": 1.0707, + "mean_token_accuracy": 0.6848035454750061, + "num_tokens": 43632573.0, + "step": 1681 + }, + { + "epoch": 0.18471337579617833, + "grad_norm": 2.0419673919677734, + "learning_rate": 3.0765007320644217e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.6709569096565247, + "num_tokens": 43662759.0, + "step": 1682 + }, + { + "epoch": 0.184823193498792, + "grad_norm": 2.1709461212158203, + "learning_rate": 3.0783308931185947e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7171335220336914, + "num_tokens": 43686567.0, + "step": 1683 + }, + { + "epoch": 0.18493301120140568, + "grad_norm": 1.9332276582717896, + "learning_rate": 3.0801610541727673e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6810851693153381, + "num_tokens": 43720274.0, + "step": 1684 + }, + { + "epoch": 0.18504282890401932, + "grad_norm": 2.143073320388794, + "learning_rate": 3.0819912152269403e-06, + "loss": 1.1086, + "mean_token_accuracy": 0.6672989726066589, + "num_tokens": 43749875.0, + "step": 1685 + }, + { + "epoch": 0.185152646606633, + "grad_norm": 2.086944103240967, + "learning_rate": 3.083821376281113e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6962335109710693, + "num_tokens": 43778568.0, + "step": 1686 + }, + { + "epoch": 0.18526246430924664, + "grad_norm": 2.128282308578491, + "learning_rate": 3.085651537335286e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7187325358390808, + "num_tokens": 43803176.0, + "step": 1687 + }, + { + "epoch": 0.18537228201186032, + "grad_norm": 2.3680689334869385, + "learning_rate": 3.0874816983894585e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7069246768951416, + "num_tokens": 43824033.0, + "step": 1688 + }, + { + "epoch": 0.18548209971447396, + "grad_norm": 2.2551779747009277, + "learning_rate": 3.089311859443631e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6915808916091919, + "num_tokens": 43847901.0, + "step": 1689 + }, + { + "epoch": 0.18559191741708764, + "grad_norm": 2.213463306427002, + "learning_rate": 3.091142020497804e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6933112144470215, + "num_tokens": 43873189.0, + "step": 1690 + }, + { + "epoch": 0.18570173511970128, + "grad_norm": 2.185182571411133, + "learning_rate": 3.092972181551977e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6911836862564087, + "num_tokens": 43901552.0, + "step": 1691 + }, + { + "epoch": 0.18581155282231496, + "grad_norm": 2.647010326385498, + "learning_rate": 3.0948023426061497e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7230653762817383, + "num_tokens": 43920657.0, + "step": 1692 + }, + { + "epoch": 0.18592137052492863, + "grad_norm": 2.4152638912200928, + "learning_rate": 3.0966325036603223e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6909716129302979, + "num_tokens": 43942884.0, + "step": 1693 + }, + { + "epoch": 0.18603118822754228, + "grad_norm": 2.310398578643799, + "learning_rate": 3.098462664714495e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7072887420654297, + "num_tokens": 43966339.0, + "step": 1694 + }, + { + "epoch": 0.18614100593015595, + "grad_norm": 2.7169432640075684, + "learning_rate": 3.1002928257686675e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7082935571670532, + "num_tokens": 43985284.0, + "step": 1695 + }, + { + "epoch": 0.1862508236327696, + "grad_norm": 2.485037088394165, + "learning_rate": 3.102122986822841e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6986506581306458, + "num_tokens": 44008348.0, + "step": 1696 + }, + { + "epoch": 0.18636064133538327, + "grad_norm": 2.2239694595336914, + "learning_rate": 3.1039531478770135e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6887339353561401, + "num_tokens": 44034861.0, + "step": 1697 + }, + { + "epoch": 0.18647045903799692, + "grad_norm": 2.0292508602142334, + "learning_rate": 3.105783308931186e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6883500218391418, + "num_tokens": 44064780.0, + "step": 1698 + }, + { + "epoch": 0.1865802767406106, + "grad_norm": 2.1049067974090576, + "learning_rate": 3.1076134699853587e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7181870937347412, + "num_tokens": 44093024.0, + "step": 1699 + }, + { + "epoch": 0.18669009444322424, + "grad_norm": 2.318338632583618, + "learning_rate": 3.1094436310395317e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7089828848838806, + "num_tokens": 44116362.0, + "step": 1700 + }, + { + "epoch": 0.1867999121458379, + "grad_norm": 2.0105321407318115, + "learning_rate": 3.1112737920937047e-06, + "loss": 1.1148, + "mean_token_accuracy": 0.6676714420318604, + "num_tokens": 44149784.0, + "step": 1701 + }, + { + "epoch": 0.18690972984845158, + "grad_norm": 2.367530107498169, + "learning_rate": 3.1131039531478773e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.709959089756012, + "num_tokens": 44172416.0, + "step": 1702 + }, + { + "epoch": 0.18701954755106523, + "grad_norm": 2.019791841506958, + "learning_rate": 3.11493411420205e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6934973001480103, + "num_tokens": 44204024.0, + "step": 1703 + }, + { + "epoch": 0.1871293652536789, + "grad_norm": 2.216155529022217, + "learning_rate": 3.116764275256223e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6934740543365479, + "num_tokens": 44232262.0, + "step": 1704 + }, + { + "epoch": 0.18723918295629255, + "grad_norm": 2.086803913116455, + "learning_rate": 3.1185944363103955e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6891929507255554, + "num_tokens": 44263111.0, + "step": 1705 + }, + { + "epoch": 0.18734900065890622, + "grad_norm": 2.366978406906128, + "learning_rate": 3.1204245973645685e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7132043838500977, + "num_tokens": 44288355.0, + "step": 1706 + }, + { + "epoch": 0.18745881836151987, + "grad_norm": 2.3855788707733154, + "learning_rate": 3.122254758418741e-06, + "loss": 1.1236, + "mean_token_accuracy": 0.6720728874206543, + "num_tokens": 44312836.0, + "step": 1707 + }, + { + "epoch": 0.18756863606413354, + "grad_norm": 2.35766339302063, + "learning_rate": 3.124084919472914e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.709685206413269, + "num_tokens": 44333439.0, + "step": 1708 + }, + { + "epoch": 0.1876784537667472, + "grad_norm": 1.999086856842041, + "learning_rate": 3.1259150805270867e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6852293610572815, + "num_tokens": 44361823.0, + "step": 1709 + }, + { + "epoch": 0.18778827146936086, + "grad_norm": 2.3187029361724854, + "learning_rate": 3.1277452415812592e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.6885772943496704, + "num_tokens": 44387963.0, + "step": 1710 + }, + { + "epoch": 0.18789808917197454, + "grad_norm": 2.193155288696289, + "learning_rate": 3.129575402635432e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6880714893341064, + "num_tokens": 44414590.0, + "step": 1711 + }, + { + "epoch": 0.18800790687458818, + "grad_norm": 2.143305540084839, + "learning_rate": 3.1314055636896053e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6867335438728333, + "num_tokens": 44444613.0, + "step": 1712 + }, + { + "epoch": 0.18811772457720186, + "grad_norm": 2.11274790763855, + "learning_rate": 3.133235724743778e-06, + "loss": 1.0862, + "mean_token_accuracy": 0.6745914220809937, + "num_tokens": 44476485.0, + "step": 1713 + }, + { + "epoch": 0.1882275422798155, + "grad_norm": 2.1155803203582764, + "learning_rate": 3.1350658857979504e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7070379257202148, + "num_tokens": 44505052.0, + "step": 1714 + }, + { + "epoch": 0.18833735998242918, + "grad_norm": 2.4163413047790527, + "learning_rate": 3.136896046852123e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6994929313659668, + "num_tokens": 44528495.0, + "step": 1715 + }, + { + "epoch": 0.18844717768504282, + "grad_norm": 2.1071808338165283, + "learning_rate": 3.1387262079062956e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6919505596160889, + "num_tokens": 44558169.0, + "step": 1716 + }, + { + "epoch": 0.1885569953876565, + "grad_norm": 2.0886428356170654, + "learning_rate": 3.140556368960469e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6903006434440613, + "num_tokens": 44585811.0, + "step": 1717 + }, + { + "epoch": 0.18866681309027014, + "grad_norm": 2.1119234561920166, + "learning_rate": 3.1423865300146416e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.691959023475647, + "num_tokens": 44614525.0, + "step": 1718 + }, + { + "epoch": 0.18877663079288381, + "grad_norm": 2.4675912857055664, + "learning_rate": 3.1442166910688142e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7192723751068115, + "num_tokens": 44633063.0, + "step": 1719 + }, + { + "epoch": 0.18888644849549746, + "grad_norm": 2.438150405883789, + "learning_rate": 3.146046852122987e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.6985613107681274, + "num_tokens": 44655308.0, + "step": 1720 + }, + { + "epoch": 0.18899626619811113, + "grad_norm": 2.0402352809906006, + "learning_rate": 3.14787701317716e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7117757797241211, + "num_tokens": 44681988.0, + "step": 1721 + }, + { + "epoch": 0.1891060839007248, + "grad_norm": 1.8278573751449585, + "learning_rate": 3.149707174231333e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.7003622055053711, + "num_tokens": 44716408.0, + "step": 1722 + }, + { + "epoch": 0.18921590160333845, + "grad_norm": 1.9430416822433472, + "learning_rate": 3.1515373352855054e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.678910493850708, + "num_tokens": 44747512.0, + "step": 1723 + }, + { + "epoch": 0.18932571930595213, + "grad_norm": 1.9379082918167114, + "learning_rate": 3.153367496339678e-06, + "loss": 1.0827, + "mean_token_accuracy": 0.6759161353111267, + "num_tokens": 44778573.0, + "step": 1724 + }, + { + "epoch": 0.18943553700856577, + "grad_norm": 2.304521083831787, + "learning_rate": 3.155197657393851e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6976348161697388, + "num_tokens": 44803196.0, + "step": 1725 + }, + { + "epoch": 0.18954535471117945, + "grad_norm": 2.2215113639831543, + "learning_rate": 3.1570278184480236e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7234750986099243, + "num_tokens": 44830002.0, + "step": 1726 + }, + { + "epoch": 0.1896551724137931, + "grad_norm": 2.3899824619293213, + "learning_rate": 3.158857979502196e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7064213156700134, + "num_tokens": 44851723.0, + "step": 1727 + }, + { + "epoch": 0.18976499011640677, + "grad_norm": 2.254493474960327, + "learning_rate": 3.1606881405563696e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7186726331710815, + "num_tokens": 44875312.0, + "step": 1728 + }, + { + "epoch": 0.1898748078190204, + "grad_norm": 2.1544110774993896, + "learning_rate": 3.162518301610542e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.6814364194869995, + "num_tokens": 44901654.0, + "step": 1729 + }, + { + "epoch": 0.1899846255216341, + "grad_norm": 2.0358800888061523, + "learning_rate": 3.1643484626647148e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6984742879867554, + "num_tokens": 44929436.0, + "step": 1730 + }, + { + "epoch": 0.19009444322424776, + "grad_norm": 2.3401272296905518, + "learning_rate": 3.1661786237188874e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6971147060394287, + "num_tokens": 44951109.0, + "step": 1731 + }, + { + "epoch": 0.1902042609268614, + "grad_norm": 2.1425933837890625, + "learning_rate": 3.16800878477306e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6958689093589783, + "num_tokens": 44977327.0, + "step": 1732 + }, + { + "epoch": 0.19031407862947508, + "grad_norm": 2.609468936920166, + "learning_rate": 3.1698389458272334e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6991864442825317, + "num_tokens": 44994618.0, + "step": 1733 + }, + { + "epoch": 0.19042389633208873, + "grad_norm": 2.180577039718628, + "learning_rate": 3.171669106881406e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7078145742416382, + "num_tokens": 45021875.0, + "step": 1734 + }, + { + "epoch": 0.1905337140347024, + "grad_norm": 2.007161855697632, + "learning_rate": 3.1734992679355786e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6976993083953857, + "num_tokens": 45051847.0, + "step": 1735 + }, + { + "epoch": 0.19064353173731605, + "grad_norm": 1.989575743675232, + "learning_rate": 3.175329428989751e-06, + "loss": 1.0747, + "mean_token_accuracy": 0.6757966876029968, + "num_tokens": 45080397.0, + "step": 1736 + }, + { + "epoch": 0.19075334943992972, + "grad_norm": 2.291334867477417, + "learning_rate": 3.1771595900439237e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6899840235710144, + "num_tokens": 45104995.0, + "step": 1737 + }, + { + "epoch": 0.19086316714254337, + "grad_norm": 1.9987719058990479, + "learning_rate": 3.178989751098097e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6783286333084106, + "num_tokens": 45135675.0, + "step": 1738 + }, + { + "epoch": 0.19097298484515704, + "grad_norm": 2.1310501098632812, + "learning_rate": 3.1808199121522698e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7066044807434082, + "num_tokens": 45160953.0, + "step": 1739 + }, + { + "epoch": 0.1910828025477707, + "grad_norm": 2.270097494125366, + "learning_rate": 3.1826500732064423e-06, + "loss": 1.0992, + "mean_token_accuracy": 0.6742687821388245, + "num_tokens": 45187508.0, + "step": 1740 + }, + { + "epoch": 0.19119262025038436, + "grad_norm": 2.3814947605133057, + "learning_rate": 3.1844802342606154e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7140913605690002, + "num_tokens": 45206853.0, + "step": 1741 + }, + { + "epoch": 0.19130243795299803, + "grad_norm": 2.302333116531372, + "learning_rate": 3.186310395314788e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.6802793741226196, + "num_tokens": 45229467.0, + "step": 1742 + }, + { + "epoch": 0.19141225565561168, + "grad_norm": 2.4115703105926514, + "learning_rate": 3.1881405563689605e-06, + "loss": 0.964, + "mean_token_accuracy": 0.6954561471939087, + "num_tokens": 45250359.0, + "step": 1743 + }, + { + "epoch": 0.19152207335822535, + "grad_norm": 2.3878626823425293, + "learning_rate": 3.1899707174231335e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7018159627914429, + "num_tokens": 45273670.0, + "step": 1744 + }, + { + "epoch": 0.191631891060839, + "grad_norm": 2.0221221446990967, + "learning_rate": 3.1918008784773065e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6975036263465881, + "num_tokens": 45301813.0, + "step": 1745 + }, + { + "epoch": 0.19174170876345267, + "grad_norm": 2.3267760276794434, + "learning_rate": 3.193631039531479e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6881515979766846, + "num_tokens": 45326307.0, + "step": 1746 + }, + { + "epoch": 0.19185152646606632, + "grad_norm": 2.2556731700897217, + "learning_rate": 3.1954612005856517e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6888217926025391, + "num_tokens": 45351950.0, + "step": 1747 + }, + { + "epoch": 0.19196134416868, + "grad_norm": 2.508246898651123, + "learning_rate": 3.1972913616398243e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7285383939743042, + "num_tokens": 45372669.0, + "step": 1748 + }, + { + "epoch": 0.19207116187129367, + "grad_norm": 2.128236770629883, + "learning_rate": 3.1991215226939977e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7098737359046936, + "num_tokens": 45397098.0, + "step": 1749 + }, + { + "epoch": 0.1921809795739073, + "grad_norm": 1.917565941810608, + "learning_rate": 3.2009516837481703e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6899658441543579, + "num_tokens": 45431060.0, + "step": 1750 + }, + { + "epoch": 0.19229079727652099, + "grad_norm": 2.3823370933532715, + "learning_rate": 3.202781844802343e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7082394361495972, + "num_tokens": 45452836.0, + "step": 1751 + }, + { + "epoch": 0.19240061497913463, + "grad_norm": 2.30193829536438, + "learning_rate": 3.2046120058565155e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7060320377349854, + "num_tokens": 45477505.0, + "step": 1752 + }, + { + "epoch": 0.1925104326817483, + "grad_norm": 2.11002516746521, + "learning_rate": 3.206442166910688e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6905845403671265, + "num_tokens": 45507561.0, + "step": 1753 + }, + { + "epoch": 0.19262025038436195, + "grad_norm": 2.4505183696746826, + "learning_rate": 3.2082723279648615e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7024536728858948, + "num_tokens": 45531649.0, + "step": 1754 + }, + { + "epoch": 0.19273006808697563, + "grad_norm": 2.2536754608154297, + "learning_rate": 3.210102489019034e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.709670901298523, + "num_tokens": 45556327.0, + "step": 1755 + }, + { + "epoch": 0.19283988578958927, + "grad_norm": 2.0896849632263184, + "learning_rate": 3.2119326500732067e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7055801153182983, + "num_tokens": 45582638.0, + "step": 1756 + }, + { + "epoch": 0.19294970349220295, + "grad_norm": 2.365875482559204, + "learning_rate": 3.2137628111273793e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6881653070449829, + "num_tokens": 45607681.0, + "step": 1757 + }, + { + "epoch": 0.1930595211948166, + "grad_norm": 2.0273141860961914, + "learning_rate": 3.2155929721815523e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6995218992233276, + "num_tokens": 45640042.0, + "step": 1758 + }, + { + "epoch": 0.19316933889743026, + "grad_norm": 2.0561063289642334, + "learning_rate": 3.217423133235725e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7039593458175659, + "num_tokens": 45668318.0, + "step": 1759 + }, + { + "epoch": 0.19327915660004394, + "grad_norm": 2.067667007446289, + "learning_rate": 3.219253294289898e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6762667298316956, + "num_tokens": 45696861.0, + "step": 1760 + }, + { + "epoch": 0.19338897430265758, + "grad_norm": 2.0776681900024414, + "learning_rate": 3.2210834553440705e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6846925020217896, + "num_tokens": 45724910.0, + "step": 1761 + }, + { + "epoch": 0.19349879200527126, + "grad_norm": 2.2134602069854736, + "learning_rate": 3.2229136163982435e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7121920585632324, + "num_tokens": 45750739.0, + "step": 1762 + }, + { + "epoch": 0.1936086097078849, + "grad_norm": 2.284613847732544, + "learning_rate": 3.224743777452416e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6904610991477966, + "num_tokens": 45773878.0, + "step": 1763 + }, + { + "epoch": 0.19371842741049858, + "grad_norm": 2.306007146835327, + "learning_rate": 3.2265739385065887e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7041307687759399, + "num_tokens": 45797970.0, + "step": 1764 + }, + { + "epoch": 0.19382824511311222, + "grad_norm": 2.400599956512451, + "learning_rate": 3.2284040995607617e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7027568817138672, + "num_tokens": 45820175.0, + "step": 1765 + }, + { + "epoch": 0.1939380628157259, + "grad_norm": 2.2502827644348145, + "learning_rate": 3.2302342606149347e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6918638944625854, + "num_tokens": 45844845.0, + "step": 1766 + }, + { + "epoch": 0.19404788051833954, + "grad_norm": 2.3168118000030518, + "learning_rate": 3.2320644216691073e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6978409290313721, + "num_tokens": 45868710.0, + "step": 1767 + }, + { + "epoch": 0.19415769822095322, + "grad_norm": 2.1056149005889893, + "learning_rate": 3.23389458272328e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6833765506744385, + "num_tokens": 45898390.0, + "step": 1768 + }, + { + "epoch": 0.1942675159235669, + "grad_norm": 2.363773822784424, + "learning_rate": 3.2357247437774524e-06, + "loss": 1.1264, + "mean_token_accuracy": 0.6744462251663208, + "num_tokens": 45923034.0, + "step": 1769 + }, + { + "epoch": 0.19437733362618054, + "grad_norm": 2.1863982677459717, + "learning_rate": 3.237554904831626e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6778645515441895, + "num_tokens": 45950391.0, + "step": 1770 + }, + { + "epoch": 0.1944871513287942, + "grad_norm": 2.1431005001068115, + "learning_rate": 3.2393850658857985e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7017661929130554, + "num_tokens": 45976010.0, + "step": 1771 + }, + { + "epoch": 0.19459696903140786, + "grad_norm": 2.414992570877075, + "learning_rate": 3.241215226939971e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7132588624954224, + "num_tokens": 45996647.0, + "step": 1772 + }, + { + "epoch": 0.19470678673402153, + "grad_norm": 2.1776885986328125, + "learning_rate": 3.2430453879941436e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6953365802764893, + "num_tokens": 46023428.0, + "step": 1773 + }, + { + "epoch": 0.19481660443663518, + "grad_norm": 1.9599109888076782, + "learning_rate": 3.2448755490483162e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6982336640357971, + "num_tokens": 46054396.0, + "step": 1774 + }, + { + "epoch": 0.19492642213924885, + "grad_norm": 1.9741806983947754, + "learning_rate": 3.2467057101024892e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6974703073501587, + "num_tokens": 46088722.0, + "step": 1775 + }, + { + "epoch": 0.1950362398418625, + "grad_norm": 2.141110420227051, + "learning_rate": 3.2485358711566622e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.687292218208313, + "num_tokens": 46115387.0, + "step": 1776 + }, + { + "epoch": 0.19514605754447617, + "grad_norm": 2.296621084213257, + "learning_rate": 3.250366032210835e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6974034309387207, + "num_tokens": 46138875.0, + "step": 1777 + }, + { + "epoch": 0.19525587524708984, + "grad_norm": 1.7674161195755005, + "learning_rate": 3.2521961932650074e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6980858445167542, + "num_tokens": 46175149.0, + "step": 1778 + }, + { + "epoch": 0.1953656929497035, + "grad_norm": 2.288111448287964, + "learning_rate": 3.2540263543191804e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6879154443740845, + "num_tokens": 46200848.0, + "step": 1779 + }, + { + "epoch": 0.19547551065231716, + "grad_norm": 2.108243703842163, + "learning_rate": 3.255856515373353e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6948730945587158, + "num_tokens": 46228060.0, + "step": 1780 + }, + { + "epoch": 0.1955853283549308, + "grad_norm": 2.2940938472747803, + "learning_rate": 3.257686676427526e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6943564414978027, + "num_tokens": 46252361.0, + "step": 1781 + }, + { + "epoch": 0.19569514605754448, + "grad_norm": 2.2514474391937256, + "learning_rate": 3.2595168374816986e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6911035180091858, + "num_tokens": 46275152.0, + "step": 1782 + }, + { + "epoch": 0.19580496376015813, + "grad_norm": 2.116591215133667, + "learning_rate": 3.2613469985358716e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7013342976570129, + "num_tokens": 46299938.0, + "step": 1783 + }, + { + "epoch": 0.1959147814627718, + "grad_norm": 2.288379430770874, + "learning_rate": 3.263177159590044e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6808187365531921, + "num_tokens": 46324647.0, + "step": 1784 + }, + { + "epoch": 0.19602459916538545, + "grad_norm": 2.2793679237365723, + "learning_rate": 3.265007320644217e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7176708579063416, + "num_tokens": 46349474.0, + "step": 1785 + }, + { + "epoch": 0.19613441686799912, + "grad_norm": 2.029381275177002, + "learning_rate": 3.26683748169839e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6888138055801392, + "num_tokens": 46376917.0, + "step": 1786 + }, + { + "epoch": 0.1962442345706128, + "grad_norm": 2.4241795539855957, + "learning_rate": 3.268667642752563e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7060725688934326, + "num_tokens": 46396858.0, + "step": 1787 + }, + { + "epoch": 0.19635405227322644, + "grad_norm": 2.119666814804077, + "learning_rate": 3.2704978038067354e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6960784792900085, + "num_tokens": 46422068.0, + "step": 1788 + }, + { + "epoch": 0.19646386997584012, + "grad_norm": 2.402958393096924, + "learning_rate": 3.272327964860908e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7021945714950562, + "num_tokens": 46443006.0, + "step": 1789 + }, + { + "epoch": 0.19657368767845376, + "grad_norm": 2.147533893585205, + "learning_rate": 3.2741581259150806e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6769611835479736, + "num_tokens": 46469927.0, + "step": 1790 + }, + { + "epoch": 0.19668350538106744, + "grad_norm": 2.3630361557006836, + "learning_rate": 3.275988286969253e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6999498009681702, + "num_tokens": 46492331.0, + "step": 1791 + }, + { + "epoch": 0.19679332308368108, + "grad_norm": 1.9698237180709839, + "learning_rate": 3.2778184480234266e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7267101407051086, + "num_tokens": 46521933.0, + "step": 1792 + }, + { + "epoch": 0.19690314078629476, + "grad_norm": 2.310732841491699, + "learning_rate": 3.279648609077599e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6891330480575562, + "num_tokens": 46546939.0, + "step": 1793 + }, + { + "epoch": 0.1970129584889084, + "grad_norm": 2.1365702152252197, + "learning_rate": 3.2814787701317718e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7340008616447449, + "num_tokens": 46570402.0, + "step": 1794 + }, + { + "epoch": 0.19712277619152208, + "grad_norm": 2.0344483852386475, + "learning_rate": 3.2833089311859443e-06, + "loss": 1.0924, + "mean_token_accuracy": 0.6728941202163696, + "num_tokens": 46600726.0, + "step": 1795 + }, + { + "epoch": 0.19723259389413572, + "grad_norm": 2.171438455581665, + "learning_rate": 3.2851390922401174e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6873220205307007, + "num_tokens": 46625479.0, + "step": 1796 + }, + { + "epoch": 0.1973424115967494, + "grad_norm": 2.2815961837768555, + "learning_rate": 3.2869692532942904e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7035756707191467, + "num_tokens": 46650620.0, + "step": 1797 + }, + { + "epoch": 0.19745222929936307, + "grad_norm": 2.505263566970825, + "learning_rate": 3.288799414348463e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7021595239639282, + "num_tokens": 46671926.0, + "step": 1798 + }, + { + "epoch": 0.19756204700197671, + "grad_norm": 2.154768705368042, + "learning_rate": 3.2906295754026355e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6910613775253296, + "num_tokens": 46700038.0, + "step": 1799 + }, + { + "epoch": 0.1976718647045904, + "grad_norm": 2.2229528427124023, + "learning_rate": 3.2924597364568086e-06, + "loss": 1.0967, + "mean_token_accuracy": 0.6799579858779907, + "num_tokens": 46728144.0, + "step": 1800 + }, + { + "epoch": 0.19778168240720403, + "grad_norm": 2.1375045776367188, + "learning_rate": 3.294289897510981e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.6900208592414856, + "num_tokens": 46758587.0, + "step": 1801 + }, + { + "epoch": 0.1978915001098177, + "grad_norm": 2.2974822521209717, + "learning_rate": 3.296120058565154e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6892304420471191, + "num_tokens": 46782988.0, + "step": 1802 + }, + { + "epoch": 0.19800131781243135, + "grad_norm": 2.231452703475952, + "learning_rate": 3.2979502196193267e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6811293959617615, + "num_tokens": 46808480.0, + "step": 1803 + }, + { + "epoch": 0.19811113551504503, + "grad_norm": 1.9667346477508545, + "learning_rate": 3.2997803806734997e-06, + "loss": 1.104, + "mean_token_accuracy": 0.6694022417068481, + "num_tokens": 46842060.0, + "step": 1804 + }, + { + "epoch": 0.19822095321765867, + "grad_norm": 2.2631940841674805, + "learning_rate": 3.3016105417276723e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.6784462928771973, + "num_tokens": 46866920.0, + "step": 1805 + }, + { + "epoch": 0.19833077092027235, + "grad_norm": 2.3414864540100098, + "learning_rate": 3.303440702781845e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7004315257072449, + "num_tokens": 46888451.0, + "step": 1806 + }, + { + "epoch": 0.19844058862288602, + "grad_norm": 2.4098055362701416, + "learning_rate": 3.3052708638360175e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7006291151046753, + "num_tokens": 46912144.0, + "step": 1807 + }, + { + "epoch": 0.19855040632549967, + "grad_norm": 2.098492383956909, + "learning_rate": 3.307101024890191e-06, + "loss": 1.0926, + "mean_token_accuracy": 0.6740326285362244, + "num_tokens": 46941357.0, + "step": 1808 + }, + { + "epoch": 0.19866022402811334, + "grad_norm": 2.06973934173584, + "learning_rate": 3.3089311859443635e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6952786445617676, + "num_tokens": 46967967.0, + "step": 1809 + }, + { + "epoch": 0.198770041730727, + "grad_norm": 2.3128116130828857, + "learning_rate": 3.310761346998536e-06, + "loss": 0.983, + "mean_token_accuracy": 0.6987296938896179, + "num_tokens": 46992011.0, + "step": 1810 + }, + { + "epoch": 0.19887985943334066, + "grad_norm": 2.0650405883789062, + "learning_rate": 3.3125915080527087e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7091321349143982, + "num_tokens": 47016303.0, + "step": 1811 + }, + { + "epoch": 0.1989896771359543, + "grad_norm": 2.290705442428589, + "learning_rate": 3.3144216691068813e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7022224068641663, + "num_tokens": 47040356.0, + "step": 1812 + }, + { + "epoch": 0.19909949483856798, + "grad_norm": 2.1740996837615967, + "learning_rate": 3.3162518301610547e-06, + "loss": 1.0911, + "mean_token_accuracy": 0.6705593466758728, + "num_tokens": 47068887.0, + "step": 1813 + }, + { + "epoch": 0.19920931254118163, + "grad_norm": 2.0084426403045654, + "learning_rate": 3.3180819912152273e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6909135580062866, + "num_tokens": 47099452.0, + "step": 1814 + }, + { + "epoch": 0.1993191302437953, + "grad_norm": 2.2503764629364014, + "learning_rate": 3.3199121522694e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7252925038337708, + "num_tokens": 47122239.0, + "step": 1815 + }, + { + "epoch": 0.19942894794640897, + "grad_norm": 2.4412994384765625, + "learning_rate": 3.3217423133235725e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6847517490386963, + "num_tokens": 47145197.0, + "step": 1816 + }, + { + "epoch": 0.19953876564902262, + "grad_norm": 2.027890682220459, + "learning_rate": 3.3235724743777455e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7194989323616028, + "num_tokens": 47172993.0, + "step": 1817 + }, + { + "epoch": 0.1996485833516363, + "grad_norm": 2.4671530723571777, + "learning_rate": 3.3254026354319185e-06, + "loss": 0.939, + "mean_token_accuracy": 0.713017463684082, + "num_tokens": 47192146.0, + "step": 1818 + }, + { + "epoch": 0.19975840105424994, + "grad_norm": 2.6748616695404053, + "learning_rate": 3.327232796486091e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.716408371925354, + "num_tokens": 47209652.0, + "step": 1819 + }, + { + "epoch": 0.1998682187568636, + "grad_norm": 1.9805490970611572, + "learning_rate": 3.3290629575402637e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6924034357070923, + "num_tokens": 47243909.0, + "step": 1820 + }, + { + "epoch": 0.19997803645947726, + "grad_norm": 2.1254186630249023, + "learning_rate": 3.3308931185944367e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.680048942565918, + "num_tokens": 47269832.0, + "step": 1821 + }, + { + "epoch": 0.20008785416209093, + "grad_norm": 2.258324384689331, + "learning_rate": 3.3327232796486093e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7240288853645325, + "num_tokens": 47292457.0, + "step": 1822 + }, + { + "epoch": 0.20019767186470458, + "grad_norm": 2.049610137939453, + "learning_rate": 3.334553440702782e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6907762289047241, + "num_tokens": 47321652.0, + "step": 1823 + }, + { + "epoch": 0.20030748956731825, + "grad_norm": 2.4924638271331787, + "learning_rate": 3.3363836017569553e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7044394016265869, + "num_tokens": 47343683.0, + "step": 1824 + }, + { + "epoch": 0.2004173072699319, + "grad_norm": 2.187422513961792, + "learning_rate": 3.338213762811128e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6993355751037598, + "num_tokens": 47370339.0, + "step": 1825 + }, + { + "epoch": 0.20052712497254557, + "grad_norm": 2.1250228881835938, + "learning_rate": 3.3400439238653005e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.711241602897644, + "num_tokens": 47395321.0, + "step": 1826 + }, + { + "epoch": 0.20063694267515925, + "grad_norm": 2.0079805850982666, + "learning_rate": 3.341874084919473e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7149333953857422, + "num_tokens": 47423095.0, + "step": 1827 + }, + { + "epoch": 0.2007467603777729, + "grad_norm": 2.2319672107696533, + "learning_rate": 3.3437042459736456e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6954727172851562, + "num_tokens": 47448340.0, + "step": 1828 + }, + { + "epoch": 0.20085657808038657, + "grad_norm": 2.4225237369537354, + "learning_rate": 3.345534407027819e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6904310584068298, + "num_tokens": 47472315.0, + "step": 1829 + }, + { + "epoch": 0.2009663957830002, + "grad_norm": 2.1429479122161865, + "learning_rate": 3.3473645680819917e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6960057020187378, + "num_tokens": 47500473.0, + "step": 1830 + }, + { + "epoch": 0.20107621348561389, + "grad_norm": 2.105905771255493, + "learning_rate": 3.3491947291361642e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6984102129936218, + "num_tokens": 47529949.0, + "step": 1831 + }, + { + "epoch": 0.20118603118822753, + "grad_norm": 1.9696871042251587, + "learning_rate": 3.351024890190337e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6928511261940002, + "num_tokens": 47558833.0, + "step": 1832 + }, + { + "epoch": 0.2012958488908412, + "grad_norm": 2.154557943344116, + "learning_rate": 3.3528550512445094e-06, + "loss": 1.1272, + "mean_token_accuracy": 0.6801561117172241, + "num_tokens": 47589951.0, + "step": 1833 + }, + { + "epoch": 0.20140566659345485, + "grad_norm": 2.0739858150482178, + "learning_rate": 3.354685212298683e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6894845962524414, + "num_tokens": 47620883.0, + "step": 1834 + }, + { + "epoch": 0.20151548429606853, + "grad_norm": 2.324120283126831, + "learning_rate": 3.3565153733528554e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7148708701133728, + "num_tokens": 47643060.0, + "step": 1835 + }, + { + "epoch": 0.2016253019986822, + "grad_norm": 2.084064245223999, + "learning_rate": 3.358345534407028e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6908804774284363, + "num_tokens": 47671869.0, + "step": 1836 + }, + { + "epoch": 0.20173511970129585, + "grad_norm": 2.041533946990967, + "learning_rate": 3.3601756954612006e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6740773916244507, + "num_tokens": 47705612.0, + "step": 1837 + }, + { + "epoch": 0.20184493740390952, + "grad_norm": 2.2999765872955322, + "learning_rate": 3.3620058565153736e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6959185600280762, + "num_tokens": 47731712.0, + "step": 1838 + }, + { + "epoch": 0.20195475510652316, + "grad_norm": 2.2877256870269775, + "learning_rate": 3.363836017569546e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7114570140838623, + "num_tokens": 47755936.0, + "step": 1839 + }, + { + "epoch": 0.20206457280913684, + "grad_norm": 2.0327675342559814, + "learning_rate": 3.3656661786237192e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6948362588882446, + "num_tokens": 47787665.0, + "step": 1840 + }, + { + "epoch": 0.20217439051175048, + "grad_norm": 2.194610595703125, + "learning_rate": 3.3674963396778922e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6925487518310547, + "num_tokens": 47811809.0, + "step": 1841 + }, + { + "epoch": 0.20228420821436416, + "grad_norm": 2.3054423332214355, + "learning_rate": 3.369326500732065e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6927096843719482, + "num_tokens": 47836202.0, + "step": 1842 + }, + { + "epoch": 0.2023940259169778, + "grad_norm": 2.3113696575164795, + "learning_rate": 3.3711566617862374e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7218228578567505, + "num_tokens": 47862237.0, + "step": 1843 + }, + { + "epoch": 0.20250384361959148, + "grad_norm": 2.332751512527466, + "learning_rate": 3.37298682284041e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6915042400360107, + "num_tokens": 47884743.0, + "step": 1844 + }, + { + "epoch": 0.20261366132220515, + "grad_norm": 2.0530693531036377, + "learning_rate": 3.3748169838945834e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6999973058700562, + "num_tokens": 47912595.0, + "step": 1845 + }, + { + "epoch": 0.2027234790248188, + "grad_norm": 2.220132350921631, + "learning_rate": 3.376647144948756e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.710909366607666, + "num_tokens": 47937929.0, + "step": 1846 + }, + { + "epoch": 0.20283329672743247, + "grad_norm": 2.2354938983917236, + "learning_rate": 3.3784773060029286e-06, + "loss": 1.0907, + "mean_token_accuracy": 0.6732510924339294, + "num_tokens": 47967703.0, + "step": 1847 + }, + { + "epoch": 0.20294311443004612, + "grad_norm": 2.4346964359283447, + "learning_rate": 3.380307467057101e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6846588850021362, + "num_tokens": 47991177.0, + "step": 1848 + }, + { + "epoch": 0.2030529321326598, + "grad_norm": 2.5442872047424316, + "learning_rate": 3.3821376281112738e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.6985134482383728, + "num_tokens": 48013460.0, + "step": 1849 + }, + { + "epoch": 0.20316274983527344, + "grad_norm": 2.1609556674957275, + "learning_rate": 3.383967789165447e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7088111639022827, + "num_tokens": 48040451.0, + "step": 1850 + }, + { + "epoch": 0.2032725675378871, + "grad_norm": 2.3511013984680176, + "learning_rate": 3.3857979502196198e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6904324293136597, + "num_tokens": 48063442.0, + "step": 1851 + }, + { + "epoch": 0.20338238524050076, + "grad_norm": 1.951958417892456, + "learning_rate": 3.3876281112737924e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.6746821403503418, + "num_tokens": 48095553.0, + "step": 1852 + }, + { + "epoch": 0.20349220294311443, + "grad_norm": 2.2291860580444336, + "learning_rate": 3.389458272327965e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6943528652191162, + "num_tokens": 48121310.0, + "step": 1853 + }, + { + "epoch": 0.2036020206457281, + "grad_norm": 2.402831554412842, + "learning_rate": 3.3912884333821375e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7048429250717163, + "num_tokens": 48145607.0, + "step": 1854 + }, + { + "epoch": 0.20371183834834175, + "grad_norm": 2.3932607173919678, + "learning_rate": 3.3931185944363106e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7256565093994141, + "num_tokens": 48166064.0, + "step": 1855 + }, + { + "epoch": 0.20382165605095542, + "grad_norm": 2.2787773609161377, + "learning_rate": 3.3949487554904836e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7046493291854858, + "num_tokens": 48190313.0, + "step": 1856 + }, + { + "epoch": 0.20393147375356907, + "grad_norm": 2.2107481956481934, + "learning_rate": 3.396778916544656e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7025256156921387, + "num_tokens": 48213556.0, + "step": 1857 + }, + { + "epoch": 0.20404129145618274, + "grad_norm": 2.1077840328216553, + "learning_rate": 3.398609077598829e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6922945976257324, + "num_tokens": 48244588.0, + "step": 1858 + }, + { + "epoch": 0.2041511091587964, + "grad_norm": 2.09393310546875, + "learning_rate": 3.4004392386530017e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7153570055961609, + "num_tokens": 48270945.0, + "step": 1859 + }, + { + "epoch": 0.20426092686141006, + "grad_norm": 1.9848655462265015, + "learning_rate": 3.4022693997071743e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6920541524887085, + "num_tokens": 48300979.0, + "step": 1860 + }, + { + "epoch": 0.2043707445640237, + "grad_norm": 2.0888280868530273, + "learning_rate": 3.4040995607613473e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.69422447681427, + "num_tokens": 48326875.0, + "step": 1861 + }, + { + "epoch": 0.20448056226663738, + "grad_norm": 2.1780405044555664, + "learning_rate": 3.4059297218155204e-06, + "loss": 1.1197, + "mean_token_accuracy": 0.6711124181747437, + "num_tokens": 48353663.0, + "step": 1862 + }, + { + "epoch": 0.20459037996925103, + "grad_norm": 2.1619858741760254, + "learning_rate": 3.407759882869693e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6954497694969177, + "num_tokens": 48380161.0, + "step": 1863 + }, + { + "epoch": 0.2047001976718647, + "grad_norm": 2.0438592433929443, + "learning_rate": 3.4095900439238655e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.692767322063446, + "num_tokens": 48411774.0, + "step": 1864 + }, + { + "epoch": 0.20481001537447838, + "grad_norm": 2.217703342437744, + "learning_rate": 3.411420204978038e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.6978050470352173, + "num_tokens": 48437657.0, + "step": 1865 + }, + { + "epoch": 0.20491983307709202, + "grad_norm": 2.223781108856201, + "learning_rate": 3.4132503660322115e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7084360122680664, + "num_tokens": 48462332.0, + "step": 1866 + }, + { + "epoch": 0.2050296507797057, + "grad_norm": 2.109811544418335, + "learning_rate": 3.415080527086384e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6866388320922852, + "num_tokens": 48490016.0, + "step": 1867 + }, + { + "epoch": 0.20513946848231934, + "grad_norm": 2.2493908405303955, + "learning_rate": 3.4169106881405567e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6963469982147217, + "num_tokens": 48516853.0, + "step": 1868 + }, + { + "epoch": 0.20524928618493302, + "grad_norm": 2.381124258041382, + "learning_rate": 3.4187408491947293e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7026046514511108, + "num_tokens": 48542496.0, + "step": 1869 + }, + { + "epoch": 0.20535910388754666, + "grad_norm": 2.150191068649292, + "learning_rate": 3.420571010248902e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6805409789085388, + "num_tokens": 48569317.0, + "step": 1870 + }, + { + "epoch": 0.20546892159016034, + "grad_norm": 1.9751912355422974, + "learning_rate": 3.422401171303075e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6859148144721985, + "num_tokens": 48601281.0, + "step": 1871 + }, + { + "epoch": 0.20557873929277398, + "grad_norm": 2.1021623611450195, + "learning_rate": 3.424231332357248e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7055008411407471, + "num_tokens": 48630247.0, + "step": 1872 + }, + { + "epoch": 0.20568855699538766, + "grad_norm": 2.3746042251586914, + "learning_rate": 3.4260614934114205e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7148284316062927, + "num_tokens": 48650060.0, + "step": 1873 + }, + { + "epoch": 0.20579837469800133, + "grad_norm": 2.3011913299560547, + "learning_rate": 3.427891654465593e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7155217528343201, + "num_tokens": 48671899.0, + "step": 1874 + }, + { + "epoch": 0.20590819240061498, + "grad_norm": 2.3982796669006348, + "learning_rate": 3.429721815519766e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7018528580665588, + "num_tokens": 48691950.0, + "step": 1875 + }, + { + "epoch": 0.20601801010322865, + "grad_norm": 2.1734516620635986, + "learning_rate": 3.4315519765739387e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7041801810264587, + "num_tokens": 48716166.0, + "step": 1876 + }, + { + "epoch": 0.2061278278058423, + "grad_norm": 2.2555463314056396, + "learning_rate": 3.4333821376281117e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6846138834953308, + "num_tokens": 48740908.0, + "step": 1877 + }, + { + "epoch": 0.20623764550845597, + "grad_norm": 2.470465660095215, + "learning_rate": 3.4352122986822843e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.700478196144104, + "num_tokens": 48762206.0, + "step": 1878 + }, + { + "epoch": 0.20634746321106961, + "grad_norm": 2.151503801345825, + "learning_rate": 3.4370424597364573e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6984230279922485, + "num_tokens": 48788974.0, + "step": 1879 + }, + { + "epoch": 0.2064572809136833, + "grad_norm": 2.2725563049316406, + "learning_rate": 3.43887262079063e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6879882216453552, + "num_tokens": 48814651.0, + "step": 1880 + }, + { + "epoch": 0.20656709861629693, + "grad_norm": 2.302283525466919, + "learning_rate": 3.4407027818448025e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6901258230209351, + "num_tokens": 48838890.0, + "step": 1881 + }, + { + "epoch": 0.2066769163189106, + "grad_norm": 2.347425699234009, + "learning_rate": 3.4425329428989755e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7138515114784241, + "num_tokens": 48861960.0, + "step": 1882 + }, + { + "epoch": 0.20678673402152428, + "grad_norm": 2.1046175956726074, + "learning_rate": 3.4443631039531485e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7087484002113342, + "num_tokens": 48890337.0, + "step": 1883 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 2.304823398590088, + "learning_rate": 3.446193265007321e-06, + "loss": 1.074, + "mean_token_accuracy": 0.6753579378128052, + "num_tokens": 48913513.0, + "step": 1884 + }, + { + "epoch": 0.2070063694267516, + "grad_norm": 2.063041925430298, + "learning_rate": 3.4480234260614937e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6920399069786072, + "num_tokens": 48940472.0, + "step": 1885 + }, + { + "epoch": 0.20711618712936525, + "grad_norm": 2.4317572116851807, + "learning_rate": 3.4498535871156662e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6957523226737976, + "num_tokens": 48962523.0, + "step": 1886 + }, + { + "epoch": 0.20722600483197892, + "grad_norm": 2.1688003540039062, + "learning_rate": 3.451683748169839e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.6749435663223267, + "num_tokens": 48988626.0, + "step": 1887 + }, + { + "epoch": 0.20733582253459257, + "grad_norm": 2.5408942699432373, + "learning_rate": 3.4535139092240123e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6899746656417847, + "num_tokens": 49009657.0, + "step": 1888 + }, + { + "epoch": 0.20744564023720624, + "grad_norm": 2.473069190979004, + "learning_rate": 3.455344070278185e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7027124762535095, + "num_tokens": 49031763.0, + "step": 1889 + }, + { + "epoch": 0.2075554579398199, + "grad_norm": 2.0611352920532227, + "learning_rate": 3.4571742313323574e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6904630064964294, + "num_tokens": 49059249.0, + "step": 1890 + }, + { + "epoch": 0.20766527564243356, + "grad_norm": 1.9815666675567627, + "learning_rate": 3.45900439238653e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7262849807739258, + "num_tokens": 49087438.0, + "step": 1891 + }, + { + "epoch": 0.20777509334504723, + "grad_norm": 2.596538543701172, + "learning_rate": 3.460834553440703e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7152668833732605, + "num_tokens": 49104910.0, + "step": 1892 + }, + { + "epoch": 0.20788491104766088, + "grad_norm": 2.2626426219940186, + "learning_rate": 3.462664714494876e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6889671087265015, + "num_tokens": 49127348.0, + "step": 1893 + }, + { + "epoch": 0.20799472875027455, + "grad_norm": 2.6871120929718018, + "learning_rate": 3.4644948755490486e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.6945629119873047, + "num_tokens": 49147176.0, + "step": 1894 + }, + { + "epoch": 0.2081045464528882, + "grad_norm": 1.9682918787002563, + "learning_rate": 3.4663250366032212e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6804206371307373, + "num_tokens": 49179616.0, + "step": 1895 + }, + { + "epoch": 0.20821436415550187, + "grad_norm": 2.3245339393615723, + "learning_rate": 3.4681551976573942e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.710638165473938, + "num_tokens": 49204260.0, + "step": 1896 + }, + { + "epoch": 0.20832418185811552, + "grad_norm": 2.191598892211914, + "learning_rate": 3.469985358711567e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6803576946258545, + "num_tokens": 49230460.0, + "step": 1897 + }, + { + "epoch": 0.2084339995607292, + "grad_norm": 2.1041295528411865, + "learning_rate": 3.47181551976574e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6861007213592529, + "num_tokens": 49258883.0, + "step": 1898 + }, + { + "epoch": 0.20854381726334284, + "grad_norm": 2.0069093704223633, + "learning_rate": 3.4736456808199124e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7020266056060791, + "num_tokens": 49287458.0, + "step": 1899 + }, + { + "epoch": 0.2086536349659565, + "grad_norm": 2.2549614906311035, + "learning_rate": 3.4754758418740854e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.714342474937439, + "num_tokens": 49309640.0, + "step": 1900 + }, + { + "epoch": 0.20876345266857016, + "grad_norm": 2.3297455310821533, + "learning_rate": 3.477306002928258e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6831784248352051, + "num_tokens": 49332061.0, + "step": 1901 + }, + { + "epoch": 0.20887327037118383, + "grad_norm": 2.3557662963867188, + "learning_rate": 3.4791361639824306e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.7009462118148804, + "num_tokens": 49354394.0, + "step": 1902 + }, + { + "epoch": 0.2089830880737975, + "grad_norm": 2.0969467163085938, + "learning_rate": 3.480966325036603e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7157325744628906, + "num_tokens": 49380156.0, + "step": 1903 + }, + { + "epoch": 0.20909290577641115, + "grad_norm": 2.4804108142852783, + "learning_rate": 3.4827964860907766e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7147586345672607, + "num_tokens": 49398873.0, + "step": 1904 + }, + { + "epoch": 0.20920272347902483, + "grad_norm": 2.1150689125061035, + "learning_rate": 3.484626647144949e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.6989025473594666, + "num_tokens": 49424821.0, + "step": 1905 + }, + { + "epoch": 0.20931254118163847, + "grad_norm": 2.0810654163360596, + "learning_rate": 3.486456808199122e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7008124589920044, + "num_tokens": 49454902.0, + "step": 1906 + }, + { + "epoch": 0.20942235888425215, + "grad_norm": 2.154834747314453, + "learning_rate": 3.4882869692532944e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7237980365753174, + "num_tokens": 49480365.0, + "step": 1907 + }, + { + "epoch": 0.2095321765868658, + "grad_norm": 2.1893508434295654, + "learning_rate": 3.490117130307467e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7032618522644043, + "num_tokens": 49504957.0, + "step": 1908 + }, + { + "epoch": 0.20964199428947947, + "grad_norm": 2.183757781982422, + "learning_rate": 3.4919472913616404e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.691909909248352, + "num_tokens": 49531233.0, + "step": 1909 + }, + { + "epoch": 0.2097518119920931, + "grad_norm": 2.212819814682007, + "learning_rate": 3.493777452415813e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6841228604316711, + "num_tokens": 49556939.0, + "step": 1910 + }, + { + "epoch": 0.20986162969470679, + "grad_norm": 2.299700975418091, + "learning_rate": 3.4956076134699856e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7021907567977905, + "num_tokens": 49581176.0, + "step": 1911 + }, + { + "epoch": 0.20997144739732046, + "grad_norm": 2.435868740081787, + "learning_rate": 3.497437774524158e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6905363202095032, + "num_tokens": 49603737.0, + "step": 1912 + }, + { + "epoch": 0.2100812650999341, + "grad_norm": 2.203911066055298, + "learning_rate": 3.499267935578331e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7105399370193481, + "num_tokens": 49627660.0, + "step": 1913 + }, + { + "epoch": 0.21019108280254778, + "grad_norm": 2.3587913513183594, + "learning_rate": 3.501098096632504e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7209717035293579, + "num_tokens": 49648119.0, + "step": 1914 + }, + { + "epoch": 0.21030090050516143, + "grad_norm": 2.3230514526367188, + "learning_rate": 3.5029282576866768e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7042571306228638, + "num_tokens": 49671587.0, + "step": 1915 + }, + { + "epoch": 0.2104107182077751, + "grad_norm": 2.1604223251342773, + "learning_rate": 3.5047584187408494e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7091715335845947, + "num_tokens": 49698639.0, + "step": 1916 + }, + { + "epoch": 0.21052053591038875, + "grad_norm": 2.203084945678711, + "learning_rate": 3.5065885797950224e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7102990746498108, + "num_tokens": 49722424.0, + "step": 1917 + }, + { + "epoch": 0.21063035361300242, + "grad_norm": 2.021801233291626, + "learning_rate": 3.508418740849195e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.6780968904495239, + "num_tokens": 49753718.0, + "step": 1918 + }, + { + "epoch": 0.21074017131561606, + "grad_norm": 2.2191953659057617, + "learning_rate": 3.5102489019033675e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6714619398117065, + "num_tokens": 49779949.0, + "step": 1919 + }, + { + "epoch": 0.21084998901822974, + "grad_norm": 2.2062289714813232, + "learning_rate": 3.5120790629575405e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6973613500595093, + "num_tokens": 49804381.0, + "step": 1920 + }, + { + "epoch": 0.2109598067208434, + "grad_norm": 1.9762006998062134, + "learning_rate": 3.5139092240117136e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7001418471336365, + "num_tokens": 49831760.0, + "step": 1921 + }, + { + "epoch": 0.21106962442345706, + "grad_norm": 2.194336414337158, + "learning_rate": 3.515739385065886e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6939456462860107, + "num_tokens": 49856182.0, + "step": 1922 + }, + { + "epoch": 0.21117944212607073, + "grad_norm": 1.9720580577850342, + "learning_rate": 3.5175695461200587e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.6945407390594482, + "num_tokens": 49883659.0, + "step": 1923 + }, + { + "epoch": 0.21128925982868438, + "grad_norm": 2.048187732696533, + "learning_rate": 3.5193997071742313e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7062947750091553, + "num_tokens": 49909552.0, + "step": 1924 + }, + { + "epoch": 0.21139907753129805, + "grad_norm": 2.186525821685791, + "learning_rate": 3.5212298682284047e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7050721645355225, + "num_tokens": 49933449.0, + "step": 1925 + }, + { + "epoch": 0.2115088952339117, + "grad_norm": 2.111860752105713, + "learning_rate": 3.5230600292825773e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7231115102767944, + "num_tokens": 49956654.0, + "step": 1926 + }, + { + "epoch": 0.21161871293652537, + "grad_norm": 2.1436989307403564, + "learning_rate": 3.52489019033675e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6907992362976074, + "num_tokens": 49981425.0, + "step": 1927 + }, + { + "epoch": 0.21172853063913902, + "grad_norm": 2.287501335144043, + "learning_rate": 3.5267203513909225e-06, + "loss": 1.1227, + "mean_token_accuracy": 0.6741077303886414, + "num_tokens": 50006444.0, + "step": 1928 + }, + { + "epoch": 0.2118383483417527, + "grad_norm": 2.21404767036438, + "learning_rate": 3.528550512445095e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6811326742172241, + "num_tokens": 50032990.0, + "step": 1929 + }, + { + "epoch": 0.21194816604436637, + "grad_norm": 2.1723105907440186, + "learning_rate": 3.5303806734992685e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7063585519790649, + "num_tokens": 50058150.0, + "step": 1930 + }, + { + "epoch": 0.21205798374698, + "grad_norm": 2.2267708778381348, + "learning_rate": 3.532210834553441e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6923621892929077, + "num_tokens": 50083168.0, + "step": 1931 + }, + { + "epoch": 0.21216780144959368, + "grad_norm": 2.0061323642730713, + "learning_rate": 3.5340409956076137e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6820881962776184, + "num_tokens": 50113390.0, + "step": 1932 + }, + { + "epoch": 0.21227761915220733, + "grad_norm": 2.4942638874053955, + "learning_rate": 3.5358711566617863e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7039651870727539, + "num_tokens": 50134923.0, + "step": 1933 + }, + { + "epoch": 0.212387436854821, + "grad_norm": 1.9102632999420166, + "learning_rate": 3.5377013177159593e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6986797451972961, + "num_tokens": 50164779.0, + "step": 1934 + }, + { + "epoch": 0.21249725455743465, + "grad_norm": 1.9655429124832153, + "learning_rate": 3.539531478770132e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7110965251922607, + "num_tokens": 50193712.0, + "step": 1935 + }, + { + "epoch": 0.21260707226004832, + "grad_norm": 2.3446290493011475, + "learning_rate": 3.541361639824305e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6938254833221436, + "num_tokens": 50213766.0, + "step": 1936 + }, + { + "epoch": 0.21271688996266197, + "grad_norm": 1.9598536491394043, + "learning_rate": 3.543191800878478e-06, + "loss": 1.0789, + "mean_token_accuracy": 0.675418496131897, + "num_tokens": 50244740.0, + "step": 1937 + }, + { + "epoch": 0.21282670766527564, + "grad_norm": 2.3522818088531494, + "learning_rate": 3.5450219619326505e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7242012619972229, + "num_tokens": 50265254.0, + "step": 1938 + }, + { + "epoch": 0.2129365253678893, + "grad_norm": 2.158972978591919, + "learning_rate": 3.546852122986823e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7142279148101807, + "num_tokens": 50290704.0, + "step": 1939 + }, + { + "epoch": 0.21304634307050296, + "grad_norm": 2.311462879180908, + "learning_rate": 3.5486822840409957e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7018426656723022, + "num_tokens": 50315731.0, + "step": 1940 + }, + { + "epoch": 0.21315616077311664, + "grad_norm": 2.3127024173736572, + "learning_rate": 3.550512445095169e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6923011541366577, + "num_tokens": 50340286.0, + "step": 1941 + }, + { + "epoch": 0.21326597847573028, + "grad_norm": 2.2826485633850098, + "learning_rate": 3.5523426061493417e-06, + "loss": 1.0971, + "mean_token_accuracy": 0.6739360094070435, + "num_tokens": 50371085.0, + "step": 1942 + }, + { + "epoch": 0.21337579617834396, + "grad_norm": 1.920346975326538, + "learning_rate": 3.5541727672035143e-06, + "loss": 1.1174, + "mean_token_accuracy": 0.6626021265983582, + "num_tokens": 50405706.0, + "step": 1943 + }, + { + "epoch": 0.2134856138809576, + "grad_norm": 2.1480681896209717, + "learning_rate": 3.556002928257687e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7088978290557861, + "num_tokens": 50429482.0, + "step": 1944 + }, + { + "epoch": 0.21359543158357128, + "grad_norm": 1.9849023818969727, + "learning_rate": 3.5578330893118594e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6964062452316284, + "num_tokens": 50458260.0, + "step": 1945 + }, + { + "epoch": 0.21370524928618492, + "grad_norm": 2.227992296218872, + "learning_rate": 3.559663250366033e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7024021744728088, + "num_tokens": 50481773.0, + "step": 1946 + }, + { + "epoch": 0.2138150669887986, + "grad_norm": 2.2618770599365234, + "learning_rate": 3.5614934114202055e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7177680134773254, + "num_tokens": 50505612.0, + "step": 1947 + }, + { + "epoch": 0.21392488469141224, + "grad_norm": 2.1437792778015137, + "learning_rate": 3.563323572474378e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7157921195030212, + "num_tokens": 50531590.0, + "step": 1948 + }, + { + "epoch": 0.21403470239402592, + "grad_norm": 2.088308334350586, + "learning_rate": 3.5651537335285506e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7034763693809509, + "num_tokens": 50557241.0, + "step": 1949 + }, + { + "epoch": 0.2141445200966396, + "grad_norm": 2.1421918869018555, + "learning_rate": 3.5669838945827232e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.6845014095306396, + "num_tokens": 50581796.0, + "step": 1950 + }, + { + "epoch": 0.21425433779925324, + "grad_norm": 2.3025197982788086, + "learning_rate": 3.5688140556368962e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7159385085105896, + "num_tokens": 50604611.0, + "step": 1951 + }, + { + "epoch": 0.2143641555018669, + "grad_norm": 2.2974910736083984, + "learning_rate": 3.5706442166910692e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7164912223815918, + "num_tokens": 50625155.0, + "step": 1952 + }, + { + "epoch": 0.21447397320448056, + "grad_norm": 1.885441780090332, + "learning_rate": 3.572474377745242e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.705215573310852, + "num_tokens": 50655543.0, + "step": 1953 + }, + { + "epoch": 0.21458379090709423, + "grad_norm": 2.1391873359680176, + "learning_rate": 3.574304538799415e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6835356950759888, + "num_tokens": 50682322.0, + "step": 1954 + }, + { + "epoch": 0.21469360860970788, + "grad_norm": 2.286939859390259, + "learning_rate": 3.5761346998535874e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6853461265563965, + "num_tokens": 50706369.0, + "step": 1955 + }, + { + "epoch": 0.21480342631232155, + "grad_norm": 2.3030736446380615, + "learning_rate": 3.57796486090776e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6903232336044312, + "num_tokens": 50727011.0, + "step": 1956 + }, + { + "epoch": 0.2149132440149352, + "grad_norm": 2.1100590229034424, + "learning_rate": 3.579795021961933e-06, + "loss": 1.1546, + "mean_token_accuracy": 0.6622185707092285, + "num_tokens": 50756236.0, + "step": 1957 + }, + { + "epoch": 0.21502306171754887, + "grad_norm": 2.2874302864074707, + "learning_rate": 3.581625183016106e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7089895009994507, + "num_tokens": 50778252.0, + "step": 1958 + }, + { + "epoch": 0.21513287942016254, + "grad_norm": 2.142446517944336, + "learning_rate": 3.5834553440702786e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6979979276657104, + "num_tokens": 50803563.0, + "step": 1959 + }, + { + "epoch": 0.2152426971227762, + "grad_norm": 2.056945323944092, + "learning_rate": 3.585285505124451e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6988171339035034, + "num_tokens": 50829445.0, + "step": 1960 + }, + { + "epoch": 0.21535251482538986, + "grad_norm": 2.0663838386535645, + "learning_rate": 3.587115666178624e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7029330730438232, + "num_tokens": 50857630.0, + "step": 1961 + }, + { + "epoch": 0.2154623325280035, + "grad_norm": 2.2193620204925537, + "learning_rate": 3.5889458272327972e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7063212394714355, + "num_tokens": 50880964.0, + "step": 1962 + }, + { + "epoch": 0.21557215023061718, + "grad_norm": 2.064580202102661, + "learning_rate": 3.59077598828697e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6916887760162354, + "num_tokens": 50908999.0, + "step": 1963 + }, + { + "epoch": 0.21568196793323083, + "grad_norm": 2.1133158206939697, + "learning_rate": 3.5926061493411424e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.718522846698761, + "num_tokens": 50932950.0, + "step": 1964 + }, + { + "epoch": 0.2157917856358445, + "grad_norm": 1.9853376150131226, + "learning_rate": 3.594436310395315e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6967220902442932, + "num_tokens": 50962454.0, + "step": 1965 + }, + { + "epoch": 0.21590160333845815, + "grad_norm": 2.362518787384033, + "learning_rate": 3.5962664714494876e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6879388689994812, + "num_tokens": 50984537.0, + "step": 1966 + }, + { + "epoch": 0.21601142104107182, + "grad_norm": 2.5085320472717285, + "learning_rate": 3.59809663250366e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7132706642150879, + "num_tokens": 51004108.0, + "step": 1967 + }, + { + "epoch": 0.2161212387436855, + "grad_norm": 2.0841102600097656, + "learning_rate": 3.5999267935578336e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7096303701400757, + "num_tokens": 51028769.0, + "step": 1968 + }, + { + "epoch": 0.21623105644629914, + "grad_norm": 2.1896889209747314, + "learning_rate": 3.601756954612006e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7071866393089294, + "num_tokens": 51054101.0, + "step": 1969 + }, + { + "epoch": 0.21634087414891282, + "grad_norm": 2.0213708877563477, + "learning_rate": 3.6035871156661788e-06, + "loss": 1.0777, + "mean_token_accuracy": 0.6798460483551025, + "num_tokens": 51082983.0, + "step": 1970 + }, + { + "epoch": 0.21645069185152646, + "grad_norm": 2.121237277984619, + "learning_rate": 3.6054172767203518e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7014777660369873, + "num_tokens": 51109737.0, + "step": 1971 + }, + { + "epoch": 0.21656050955414013, + "grad_norm": 2.0337584018707275, + "learning_rate": 3.6072474377745244e-06, + "loss": 0.912, + "mean_token_accuracy": 0.721881091594696, + "num_tokens": 51137474.0, + "step": 1972 + }, + { + "epoch": 0.21667032725675378, + "grad_norm": 1.9992755651474, + "learning_rate": 3.6090775988286974e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6813603043556213, + "num_tokens": 51168136.0, + "step": 1973 + }, + { + "epoch": 0.21678014495936745, + "grad_norm": 2.102247714996338, + "learning_rate": 3.61090775988287e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7085961103439331, + "num_tokens": 51194249.0, + "step": 1974 + }, + { + "epoch": 0.2168899626619811, + "grad_norm": 2.5046870708465576, + "learning_rate": 3.612737920937043e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7014192342758179, + "num_tokens": 51217522.0, + "step": 1975 + }, + { + "epoch": 0.21699978036459477, + "grad_norm": 2.6500091552734375, + "learning_rate": 3.6145680819912156e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6975357532501221, + "num_tokens": 51236544.0, + "step": 1976 + }, + { + "epoch": 0.21710959806720842, + "grad_norm": 2.3704588413238525, + "learning_rate": 3.616398243045388e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6952745318412781, + "num_tokens": 51261562.0, + "step": 1977 + }, + { + "epoch": 0.2172194157698221, + "grad_norm": 2.4171175956726074, + "learning_rate": 3.618228404099561e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6939736604690552, + "num_tokens": 51285079.0, + "step": 1978 + }, + { + "epoch": 0.21732923347243577, + "grad_norm": 2.223618268966675, + "learning_rate": 3.620058565153734e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6924821138381958, + "num_tokens": 51310387.0, + "step": 1979 + }, + { + "epoch": 0.2174390511750494, + "grad_norm": 2.1785480976104736, + "learning_rate": 3.6218887262079068e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7014564275741577, + "num_tokens": 51339896.0, + "step": 1980 + }, + { + "epoch": 0.2175488688776631, + "grad_norm": 2.583794355392456, + "learning_rate": 3.6237188872620793e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7099122405052185, + "num_tokens": 51360970.0, + "step": 1981 + }, + { + "epoch": 0.21765868658027673, + "grad_norm": 1.9044469594955444, + "learning_rate": 3.625549048316252e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6873672008514404, + "num_tokens": 51392458.0, + "step": 1982 + }, + { + "epoch": 0.2177685042828904, + "grad_norm": 2.1549816131591797, + "learning_rate": 3.6273792093704245e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7042158246040344, + "num_tokens": 51419080.0, + "step": 1983 + }, + { + "epoch": 0.21787832198550405, + "grad_norm": 2.320789337158203, + "learning_rate": 3.629209370424598e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6959770917892456, + "num_tokens": 51442144.0, + "step": 1984 + }, + { + "epoch": 0.21798813968811773, + "grad_norm": 1.9607151746749878, + "learning_rate": 3.6310395314787705e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7324364185333252, + "num_tokens": 51469644.0, + "step": 1985 + }, + { + "epoch": 0.21809795739073137, + "grad_norm": 2.3182530403137207, + "learning_rate": 3.632869692532943e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7125695943832397, + "num_tokens": 51491180.0, + "step": 1986 + }, + { + "epoch": 0.21820777509334505, + "grad_norm": 2.1420702934265137, + "learning_rate": 3.6346998535871157e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.69890296459198, + "num_tokens": 51518792.0, + "step": 1987 + }, + { + "epoch": 0.21831759279595872, + "grad_norm": 2.296290874481201, + "learning_rate": 3.6365300146412887e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7084299325942993, + "num_tokens": 51542504.0, + "step": 1988 + }, + { + "epoch": 0.21842741049857237, + "grad_norm": 2.1186580657958984, + "learning_rate": 3.6383601756954617e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6769082546234131, + "num_tokens": 51568765.0, + "step": 1989 + }, + { + "epoch": 0.21853722820118604, + "grad_norm": 1.800905466079712, + "learning_rate": 3.6401903367496343e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6965872645378113, + "num_tokens": 51602913.0, + "step": 1990 + }, + { + "epoch": 0.21864704590379969, + "grad_norm": 2.1597461700439453, + "learning_rate": 3.642020497803807e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6944828629493713, + "num_tokens": 51629681.0, + "step": 1991 + }, + { + "epoch": 0.21875686360641336, + "grad_norm": 2.425647258758545, + "learning_rate": 3.64385065885798e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6819955110549927, + "num_tokens": 51651369.0, + "step": 1992 + }, + { + "epoch": 0.218866681309027, + "grad_norm": 2.3402047157287598, + "learning_rate": 3.6456808199121525e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6962214708328247, + "num_tokens": 51676441.0, + "step": 1993 + }, + { + "epoch": 0.21897649901164068, + "grad_norm": 2.2743592262268066, + "learning_rate": 3.6475109809663255e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6928092837333679, + "num_tokens": 51700881.0, + "step": 1994 + }, + { + "epoch": 0.21908631671425433, + "grad_norm": 2.0194873809814453, + "learning_rate": 3.649341142020498e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6788594722747803, + "num_tokens": 51730156.0, + "step": 1995 + }, + { + "epoch": 0.219196134416868, + "grad_norm": 2.4879462718963623, + "learning_rate": 3.651171303074671e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7043585777282715, + "num_tokens": 51749533.0, + "step": 1996 + }, + { + "epoch": 0.21930595211948167, + "grad_norm": 2.1192500591278076, + "learning_rate": 3.6530014641288437e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7269688844680786, + "num_tokens": 51773845.0, + "step": 1997 + }, + { + "epoch": 0.21941576982209532, + "grad_norm": 2.2178351879119873, + "learning_rate": 3.6548316251830163e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6877853870391846, + "num_tokens": 51798642.0, + "step": 1998 + }, + { + "epoch": 0.219525587524709, + "grad_norm": 2.1305665969848633, + "learning_rate": 3.656661786237189e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7015572786331177, + "num_tokens": 51823772.0, + "step": 1999 + }, + { + "epoch": 0.21963540522732264, + "grad_norm": 2.197260856628418, + "learning_rate": 3.6584919472913623e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6929807662963867, + "num_tokens": 51847840.0, + "step": 2000 + }, + { + "epoch": 0.2197452229299363, + "grad_norm": 2.171410083770752, + "learning_rate": 3.660322108345535e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.694064736366272, + "num_tokens": 51874361.0, + "step": 2001 + }, + { + "epoch": 0.21985504063254996, + "grad_norm": 2.31347918510437, + "learning_rate": 3.6621522693997075e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7084119319915771, + "num_tokens": 51897075.0, + "step": 2002 + }, + { + "epoch": 0.21996485833516363, + "grad_norm": 2.064373016357422, + "learning_rate": 3.66398243045388e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6883167028427124, + "num_tokens": 51924016.0, + "step": 2003 + }, + { + "epoch": 0.22007467603777728, + "grad_norm": 2.061403274536133, + "learning_rate": 3.6658125915080526e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.705797016620636, + "num_tokens": 51951107.0, + "step": 2004 + }, + { + "epoch": 0.22018449374039095, + "grad_norm": 2.0636420249938965, + "learning_rate": 3.667642752562226e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7112581729888916, + "num_tokens": 51979515.0, + "step": 2005 + }, + { + "epoch": 0.22029431144300463, + "grad_norm": 2.1705780029296875, + "learning_rate": 3.6694729136163987e-06, + "loss": 1.0689, + "mean_token_accuracy": 0.6768532991409302, + "num_tokens": 52008095.0, + "step": 2006 + }, + { + "epoch": 0.22040412914561827, + "grad_norm": 2.142401933670044, + "learning_rate": 3.6713030746705712e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7107589244842529, + "num_tokens": 52033322.0, + "step": 2007 + }, + { + "epoch": 0.22051394684823195, + "grad_norm": 1.9486311674118042, + "learning_rate": 3.673133235724744e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6910701990127563, + "num_tokens": 52063362.0, + "step": 2008 + }, + { + "epoch": 0.2206237645508456, + "grad_norm": 1.981990098953247, + "learning_rate": 3.674963396778917e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.7017021775245667, + "num_tokens": 52094747.0, + "step": 2009 + }, + { + "epoch": 0.22073358225345927, + "grad_norm": 2.1591272354125977, + "learning_rate": 3.67679355783309e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7149780988693237, + "num_tokens": 52120984.0, + "step": 2010 + }, + { + "epoch": 0.2208433999560729, + "grad_norm": 2.220552921295166, + "learning_rate": 3.6786237188872624e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7043863534927368, + "num_tokens": 52145473.0, + "step": 2011 + }, + { + "epoch": 0.22095321765868658, + "grad_norm": 1.9075809717178345, + "learning_rate": 3.680453879941435e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6967297196388245, + "num_tokens": 52176218.0, + "step": 2012 + }, + { + "epoch": 0.22106303536130023, + "grad_norm": 2.0033481121063232, + "learning_rate": 3.682284040995608e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6921376585960388, + "num_tokens": 52205690.0, + "step": 2013 + }, + { + "epoch": 0.2211728530639139, + "grad_norm": 2.0045700073242188, + "learning_rate": 3.6841142020497806e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6778566837310791, + "num_tokens": 52237121.0, + "step": 2014 + }, + { + "epoch": 0.22128267076652755, + "grad_norm": 2.3779211044311523, + "learning_rate": 3.685944363103953e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7067112922668457, + "num_tokens": 52257670.0, + "step": 2015 + }, + { + "epoch": 0.22139248846914122, + "grad_norm": 2.1938135623931885, + "learning_rate": 3.6877745241581262e-06, + "loss": 1.078, + "mean_token_accuracy": 0.6749703884124756, + "num_tokens": 52284344.0, + "step": 2016 + }, + { + "epoch": 0.2215023061717549, + "grad_norm": 2.2442097663879395, + "learning_rate": 3.6896046852122992e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.6838938593864441, + "num_tokens": 52309362.0, + "step": 2017 + }, + { + "epoch": 0.22161212387436854, + "grad_norm": 2.700948715209961, + "learning_rate": 3.691434846266472e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7097166776657104, + "num_tokens": 52325558.0, + "step": 2018 + }, + { + "epoch": 0.22172194157698222, + "grad_norm": 2.345940351486206, + "learning_rate": 3.6932650073206444e-06, + "loss": 1.1366, + "mean_token_accuracy": 0.6690930724143982, + "num_tokens": 52352374.0, + "step": 2019 + }, + { + "epoch": 0.22183175927959586, + "grad_norm": 2.0920498371124268, + "learning_rate": 3.695095168374817e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.6752116680145264, + "num_tokens": 52380195.0, + "step": 2020 + }, + { + "epoch": 0.22194157698220954, + "grad_norm": 2.2599129676818848, + "learning_rate": 3.6969253294289904e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6829687356948853, + "num_tokens": 52402680.0, + "step": 2021 + }, + { + "epoch": 0.22205139468482318, + "grad_norm": 2.194491386413574, + "learning_rate": 3.698755490483163e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6801837682723999, + "num_tokens": 52430255.0, + "step": 2022 + }, + { + "epoch": 0.22216121238743686, + "grad_norm": 2.550469160079956, + "learning_rate": 3.7005856515373356e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7171741127967834, + "num_tokens": 52449713.0, + "step": 2023 + }, + { + "epoch": 0.2222710300900505, + "grad_norm": 2.152384042739868, + "learning_rate": 3.702415812591508e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6963350176811218, + "num_tokens": 52478830.0, + "step": 2024 + }, + { + "epoch": 0.22238084779266418, + "grad_norm": 2.3430604934692383, + "learning_rate": 3.7042459736456808e-06, + "loss": 1.1044, + "mean_token_accuracy": 0.6685386300086975, + "num_tokens": 52502044.0, + "step": 2025 + }, + { + "epoch": 0.22249066549527785, + "grad_norm": 2.163651704788208, + "learning_rate": 3.706076134699854e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7022108435630798, + "num_tokens": 52526988.0, + "step": 2026 + }, + { + "epoch": 0.2226004831978915, + "grad_norm": 2.0117509365081787, + "learning_rate": 3.707906295754027e-06, + "loss": 1.1114, + "mean_token_accuracy": 0.6669081449508667, + "num_tokens": 52558952.0, + "step": 2027 + }, + { + "epoch": 0.22271030090050517, + "grad_norm": 2.177755832672119, + "learning_rate": 3.7097364568081994e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6860579252243042, + "num_tokens": 52583844.0, + "step": 2028 + }, + { + "epoch": 0.22282011860311882, + "grad_norm": 2.10506272315979, + "learning_rate": 3.711566617862372e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6919040679931641, + "num_tokens": 52608669.0, + "step": 2029 + }, + { + "epoch": 0.2229299363057325, + "grad_norm": 2.0887017250061035, + "learning_rate": 3.713396778916545e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7113211154937744, + "num_tokens": 52633373.0, + "step": 2030 + }, + { + "epoch": 0.22303975400834614, + "grad_norm": 2.0543322563171387, + "learning_rate": 3.7152269399707176e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.7003585696220398, + "num_tokens": 52661011.0, + "step": 2031 + }, + { + "epoch": 0.2231495717109598, + "grad_norm": 2.213059902191162, + "learning_rate": 3.7170571010248906e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6924370527267456, + "num_tokens": 52685505.0, + "step": 2032 + }, + { + "epoch": 0.22325938941357346, + "grad_norm": 1.889574408531189, + "learning_rate": 3.718887262079063e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7161211967468262, + "num_tokens": 52716418.0, + "step": 2033 + }, + { + "epoch": 0.22336920711618713, + "grad_norm": 2.1425578594207764, + "learning_rate": 3.720717423133236e-06, + "loss": 1.061, + "mean_token_accuracy": 0.6784042716026306, + "num_tokens": 52742249.0, + "step": 2034 + }, + { + "epoch": 0.2234790248188008, + "grad_norm": 2.0282912254333496, + "learning_rate": 3.7225475841874088e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6922007203102112, + "num_tokens": 52772659.0, + "step": 2035 + }, + { + "epoch": 0.22358884252141445, + "grad_norm": 1.9364720582962036, + "learning_rate": 3.7243777452415813e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6865460872650146, + "num_tokens": 52802380.0, + "step": 2036 + }, + { + "epoch": 0.22369866022402812, + "grad_norm": 2.1031765937805176, + "learning_rate": 3.7262079062957548e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6858805418014526, + "num_tokens": 52828057.0, + "step": 2037 + }, + { + "epoch": 0.22380847792664177, + "grad_norm": 2.2544050216674805, + "learning_rate": 3.7280380673499274e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6990942358970642, + "num_tokens": 52852347.0, + "step": 2038 + }, + { + "epoch": 0.22391829562925544, + "grad_norm": 1.9835110902786255, + "learning_rate": 3.7298682284041e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6923809051513672, + "num_tokens": 52882164.0, + "step": 2039 + }, + { + "epoch": 0.2240281133318691, + "grad_norm": 2.291154623031616, + "learning_rate": 3.7316983894582725e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6928771138191223, + "num_tokens": 52903965.0, + "step": 2040 + }, + { + "epoch": 0.22413793103448276, + "grad_norm": 2.3328232765197754, + "learning_rate": 3.733528550512445e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6985008716583252, + "num_tokens": 52927985.0, + "step": 2041 + }, + { + "epoch": 0.2242477487370964, + "grad_norm": 2.0957581996917725, + "learning_rate": 3.7353587115666186e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7055887579917908, + "num_tokens": 52954305.0, + "step": 2042 + }, + { + "epoch": 0.22435756643971008, + "grad_norm": 2.246331214904785, + "learning_rate": 3.737188872620791e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6793025732040405, + "num_tokens": 52978544.0, + "step": 2043 + }, + { + "epoch": 0.22446738414232376, + "grad_norm": 1.839084506034851, + "learning_rate": 3.7390190336749637e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6914790868759155, + "num_tokens": 53012350.0, + "step": 2044 + }, + { + "epoch": 0.2245772018449374, + "grad_norm": 2.314218521118164, + "learning_rate": 3.7408491947291363e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.707274317741394, + "num_tokens": 53034770.0, + "step": 2045 + }, + { + "epoch": 0.22468701954755108, + "grad_norm": 2.236555576324463, + "learning_rate": 3.742679355783309e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.707627534866333, + "num_tokens": 53056947.0, + "step": 2046 + }, + { + "epoch": 0.22479683725016472, + "grad_norm": 2.239067316055298, + "learning_rate": 3.744509516837482e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6765278577804565, + "num_tokens": 53083785.0, + "step": 2047 + }, + { + "epoch": 0.2249066549527784, + "grad_norm": 2.0363306999206543, + "learning_rate": 3.746339677891655e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7116531729698181, + "num_tokens": 53110059.0, + "step": 2048 + }, + { + "epoch": 0.22501647265539204, + "grad_norm": 2.1377406120300293, + "learning_rate": 3.7481698389458275e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6902177333831787, + "num_tokens": 53136665.0, + "step": 2049 + }, + { + "epoch": 0.22512629035800572, + "grad_norm": 2.305293083190918, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7004418969154358, + "num_tokens": 53161235.0, + "step": 2050 + }, + { + "epoch": 0.22523610806061936, + "grad_norm": 2.230417013168335, + "learning_rate": 3.751830161054173e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6953213214874268, + "num_tokens": 53184040.0, + "step": 2051 + }, + { + "epoch": 0.22534592576323303, + "grad_norm": 2.290503740310669, + "learning_rate": 3.7536603221083457e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.6965163946151733, + "num_tokens": 53208681.0, + "step": 2052 + }, + { + "epoch": 0.22545574346584668, + "grad_norm": 2.196620225906372, + "learning_rate": 3.7554904831625187e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7040702700614929, + "num_tokens": 53231042.0, + "step": 2053 + }, + { + "epoch": 0.22556556116846035, + "grad_norm": 2.3185038566589355, + "learning_rate": 3.7573206442166917e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7118352651596069, + "num_tokens": 53252147.0, + "step": 2054 + }, + { + "epoch": 0.22567537887107403, + "grad_norm": 2.0053606033325195, + "learning_rate": 3.7591508052708643e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6911607980728149, + "num_tokens": 53279795.0, + "step": 2055 + }, + { + "epoch": 0.22578519657368767, + "grad_norm": 2.134758234024048, + "learning_rate": 3.760980966325037e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7166450023651123, + "num_tokens": 53306999.0, + "step": 2056 + }, + { + "epoch": 0.22589501427630135, + "grad_norm": 2.173227071762085, + "learning_rate": 3.7628111273792095e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6954693794250488, + "num_tokens": 53333510.0, + "step": 2057 + }, + { + "epoch": 0.226004831978915, + "grad_norm": 2.194751501083374, + "learning_rate": 3.764641288433382e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7140889167785645, + "num_tokens": 53355731.0, + "step": 2058 + }, + { + "epoch": 0.22611464968152867, + "grad_norm": 2.3283298015594482, + "learning_rate": 3.7664714494875555e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7094901204109192, + "num_tokens": 53380778.0, + "step": 2059 + }, + { + "epoch": 0.2262244673841423, + "grad_norm": 2.4343180656433105, + "learning_rate": 3.768301610541728e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7070930004119873, + "num_tokens": 53401889.0, + "step": 2060 + }, + { + "epoch": 0.226334285086756, + "grad_norm": 2.307727575302124, + "learning_rate": 3.7701317715959007e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6902711987495422, + "num_tokens": 53424967.0, + "step": 2061 + }, + { + "epoch": 0.22644410278936963, + "grad_norm": 2.0747833251953125, + "learning_rate": 3.7719619326500733e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7051599025726318, + "num_tokens": 53452438.0, + "step": 2062 + }, + { + "epoch": 0.2265539204919833, + "grad_norm": 2.1491215229034424, + "learning_rate": 3.773792093704246e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.705859363079071, + "num_tokens": 53479300.0, + "step": 2063 + }, + { + "epoch": 0.22666373819459698, + "grad_norm": 2.283421277999878, + "learning_rate": 3.7756222547584193e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7031297087669373, + "num_tokens": 53502719.0, + "step": 2064 + }, + { + "epoch": 0.22677355589721063, + "grad_norm": 2.284235954284668, + "learning_rate": 3.777452415812592e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6853615045547485, + "num_tokens": 53527325.0, + "step": 2065 + }, + { + "epoch": 0.2268833735998243, + "grad_norm": 2.196974992752075, + "learning_rate": 3.7792825768667644e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6922198534011841, + "num_tokens": 53551027.0, + "step": 2066 + }, + { + "epoch": 0.22699319130243795, + "grad_norm": 2.2875194549560547, + "learning_rate": 3.7811127379209375e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7061270475387573, + "num_tokens": 53574246.0, + "step": 2067 + }, + { + "epoch": 0.22710300900505162, + "grad_norm": 2.3451898097991943, + "learning_rate": 3.78294289897511e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7010305523872375, + "num_tokens": 53598073.0, + "step": 2068 + }, + { + "epoch": 0.22721282670766527, + "grad_norm": 2.1176397800445557, + "learning_rate": 3.784773060029283e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7103560566902161, + "num_tokens": 53621209.0, + "step": 2069 + }, + { + "epoch": 0.22732264441027894, + "grad_norm": 2.111466407775879, + "learning_rate": 3.7866032210834556e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6986396312713623, + "num_tokens": 53648079.0, + "step": 2070 + }, + { + "epoch": 0.22743246211289259, + "grad_norm": 2.0375473499298096, + "learning_rate": 3.7884333821376286e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7103613615036011, + "num_tokens": 53674174.0, + "step": 2071 + }, + { + "epoch": 0.22754227981550626, + "grad_norm": 2.2812588214874268, + "learning_rate": 3.7902635431918012e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6937875151634216, + "num_tokens": 53696582.0, + "step": 2072 + }, + { + "epoch": 0.22765209751811993, + "grad_norm": 2.688403606414795, + "learning_rate": 3.792093704245974e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6784259676933289, + "num_tokens": 53721855.0, + "step": 2073 + }, + { + "epoch": 0.22776191522073358, + "grad_norm": 2.3224689960479736, + "learning_rate": 3.7939238653001464e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7095091342926025, + "num_tokens": 53743142.0, + "step": 2074 + }, + { + "epoch": 0.22787173292334725, + "grad_norm": 2.14920973777771, + "learning_rate": 3.79575402635432e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7008134722709656, + "num_tokens": 53769523.0, + "step": 2075 + }, + { + "epoch": 0.2279815506259609, + "grad_norm": 1.9577972888946533, + "learning_rate": 3.7975841874084924e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7276067733764648, + "num_tokens": 53797278.0, + "step": 2076 + }, + { + "epoch": 0.22809136832857457, + "grad_norm": 2.0698976516723633, + "learning_rate": 3.799414348462665e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7085106372833252, + "num_tokens": 53825973.0, + "step": 2077 + }, + { + "epoch": 0.22820118603118822, + "grad_norm": 1.950808048248291, + "learning_rate": 3.8012445095168376e-06, + "loss": 1.0823, + "mean_token_accuracy": 0.673762321472168, + "num_tokens": 53859850.0, + "step": 2078 + }, + { + "epoch": 0.2283110037338019, + "grad_norm": 2.146409749984741, + "learning_rate": 3.80307467057101e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6893711686134338, + "num_tokens": 53888050.0, + "step": 2079 + }, + { + "epoch": 0.22842082143641554, + "grad_norm": 2.081019639968872, + "learning_rate": 3.8049048316251836e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7107837796211243, + "num_tokens": 53917317.0, + "step": 2080 + }, + { + "epoch": 0.2285306391390292, + "grad_norm": 1.9710052013397217, + "learning_rate": 3.806734992679356e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6833934783935547, + "num_tokens": 53947763.0, + "step": 2081 + }, + { + "epoch": 0.2286404568416429, + "grad_norm": 2.172311544418335, + "learning_rate": 3.808565153733529e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6961056590080261, + "num_tokens": 53971919.0, + "step": 2082 + }, + { + "epoch": 0.22875027454425653, + "grad_norm": 2.228379487991333, + "learning_rate": 3.8103953147877014e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7102752923965454, + "num_tokens": 53995692.0, + "step": 2083 + }, + { + "epoch": 0.2288600922468702, + "grad_norm": 2.141723871231079, + "learning_rate": 3.8122254758418744e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7067821025848389, + "num_tokens": 54023189.0, + "step": 2084 + }, + { + "epoch": 0.22896990994948385, + "grad_norm": 2.1206204891204834, + "learning_rate": 3.8140556368960474e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7049307823181152, + "num_tokens": 54048938.0, + "step": 2085 + }, + { + "epoch": 0.22907972765209753, + "grad_norm": 2.290818214416504, + "learning_rate": 3.81588579795022e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.6972095370292664, + "num_tokens": 54073387.0, + "step": 2086 + }, + { + "epoch": 0.22918954535471117, + "grad_norm": 2.2015669345855713, + "learning_rate": 3.8177159590043926e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7050285339355469, + "num_tokens": 54099075.0, + "step": 2087 + }, + { + "epoch": 0.22929936305732485, + "grad_norm": 2.2090277671813965, + "learning_rate": 3.819546120058566e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6933517456054688, + "num_tokens": 54124565.0, + "step": 2088 + }, + { + "epoch": 0.2294091807599385, + "grad_norm": 2.1603522300720215, + "learning_rate": 3.821376281112738e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.6820146441459656, + "num_tokens": 54152169.0, + "step": 2089 + }, + { + "epoch": 0.22951899846255217, + "grad_norm": 2.130225419998169, + "learning_rate": 3.823206442166911e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7310261726379395, + "num_tokens": 54174494.0, + "step": 2090 + }, + { + "epoch": 0.2296288161651658, + "grad_norm": 2.0694572925567627, + "learning_rate": 3.825036603221084e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6910707950592041, + "num_tokens": 54204330.0, + "step": 2091 + }, + { + "epoch": 0.22973863386777948, + "grad_norm": 2.4565320014953613, + "learning_rate": 3.826866764275257e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6955633163452148, + "num_tokens": 54228327.0, + "step": 2092 + }, + { + "epoch": 0.22984845157039316, + "grad_norm": 2.302551507949829, + "learning_rate": 3.828696925329429e-06, + "loss": 1.105, + "mean_token_accuracy": 0.6792283058166504, + "num_tokens": 54255526.0, + "step": 2093 + }, + { + "epoch": 0.2299582692730068, + "grad_norm": 2.123621702194214, + "learning_rate": 3.830527086383602e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.6800042986869812, + "num_tokens": 54283867.0, + "step": 2094 + }, + { + "epoch": 0.23006808697562048, + "grad_norm": 2.1259396076202393, + "learning_rate": 3.832357247437775e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6933379173278809, + "num_tokens": 54313766.0, + "step": 2095 + }, + { + "epoch": 0.23017790467823412, + "grad_norm": 2.1566545963287354, + "learning_rate": 3.834187408491948e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6928262710571289, + "num_tokens": 54338305.0, + "step": 2096 + }, + { + "epoch": 0.2302877223808478, + "grad_norm": 2.1974940299987793, + "learning_rate": 3.83601756954612e-06, + "loss": 1.08, + "mean_token_accuracy": 0.6732192039489746, + "num_tokens": 54365232.0, + "step": 2097 + }, + { + "epoch": 0.23039754008346144, + "grad_norm": 2.176647186279297, + "learning_rate": 3.837847730600293e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6836323738098145, + "num_tokens": 54390254.0, + "step": 2098 + }, + { + "epoch": 0.23050735778607512, + "grad_norm": 1.971848964691162, + "learning_rate": 3.839677891654466e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.692115306854248, + "num_tokens": 54418996.0, + "step": 2099 + }, + { + "epoch": 0.23061717548868876, + "grad_norm": 2.0318562984466553, + "learning_rate": 3.841508052708638e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7082744836807251, + "num_tokens": 54447277.0, + "step": 2100 + }, + { + "epoch": 0.23072699319130244, + "grad_norm": 1.98750638961792, + "learning_rate": 3.843338213762811e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6952682137489319, + "num_tokens": 54477344.0, + "step": 2101 + }, + { + "epoch": 0.2308368108939161, + "grad_norm": 2.1299192905426025, + "learning_rate": 3.845168374816984e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6968051195144653, + "num_tokens": 54505618.0, + "step": 2102 + }, + { + "epoch": 0.23094662859652976, + "grad_norm": 1.9767109155654907, + "learning_rate": 3.846998535871157e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6816682815551758, + "num_tokens": 54535329.0, + "step": 2103 + }, + { + "epoch": 0.23105644629914343, + "grad_norm": 2.5388858318328857, + "learning_rate": 3.8488286969253295e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7083814144134521, + "num_tokens": 54554000.0, + "step": 2104 + }, + { + "epoch": 0.23116626400175708, + "grad_norm": 2.225557327270508, + "learning_rate": 3.8506588579795025e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7114819288253784, + "num_tokens": 54576796.0, + "step": 2105 + }, + { + "epoch": 0.23127608170437075, + "grad_norm": 2.46437668800354, + "learning_rate": 3.852489019033675e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.6984038352966309, + "num_tokens": 54597576.0, + "step": 2106 + }, + { + "epoch": 0.2313858994069844, + "grad_norm": 2.1256422996520996, + "learning_rate": 3.8543191800878485e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7186647653579712, + "num_tokens": 54625666.0, + "step": 2107 + }, + { + "epoch": 0.23149571710959807, + "grad_norm": 2.239668846130371, + "learning_rate": 3.856149341142021e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6989009380340576, + "num_tokens": 54650558.0, + "step": 2108 + }, + { + "epoch": 0.23160553481221172, + "grad_norm": 2.491036891937256, + "learning_rate": 3.857979502196194e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6949583888053894, + "num_tokens": 54672054.0, + "step": 2109 + }, + { + "epoch": 0.2317153525148254, + "grad_norm": 2.2767789363861084, + "learning_rate": 3.859809663250366e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6890649795532227, + "num_tokens": 54697390.0, + "step": 2110 + }, + { + "epoch": 0.23182517021743906, + "grad_norm": 1.909049153327942, + "learning_rate": 3.861639824304539e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.684011697769165, + "num_tokens": 54730545.0, + "step": 2111 + }, + { + "epoch": 0.2319349879200527, + "grad_norm": 2.293255090713501, + "learning_rate": 3.863469985358712e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7140415906906128, + "num_tokens": 54754376.0, + "step": 2112 + }, + { + "epoch": 0.23204480562266638, + "grad_norm": 2.005253314971924, + "learning_rate": 3.865300146412885e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7002489566802979, + "num_tokens": 54783304.0, + "step": 2113 + }, + { + "epoch": 0.23215462332528003, + "grad_norm": 2.1131906509399414, + "learning_rate": 3.867130307467057e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.689260721206665, + "num_tokens": 54811775.0, + "step": 2114 + }, + { + "epoch": 0.2322644410278937, + "grad_norm": 1.9623833894729614, + "learning_rate": 3.86896046852123e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.6717305183410645, + "num_tokens": 54843797.0, + "step": 2115 + }, + { + "epoch": 0.23237425873050735, + "grad_norm": 2.4292960166931152, + "learning_rate": 3.870790629575403e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.698290228843689, + "num_tokens": 54864100.0, + "step": 2116 + }, + { + "epoch": 0.23248407643312102, + "grad_norm": 2.3755524158477783, + "learning_rate": 3.872620790629576e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6956309080123901, + "num_tokens": 54887081.0, + "step": 2117 + }, + { + "epoch": 0.23259389413573467, + "grad_norm": 2.0475988388061523, + "learning_rate": 3.874450951683748e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6846317648887634, + "num_tokens": 54919044.0, + "step": 2118 + }, + { + "epoch": 0.23270371183834834, + "grad_norm": 1.8088899850845337, + "learning_rate": 3.876281112737921e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6912100315093994, + "num_tokens": 54952453.0, + "step": 2119 + }, + { + "epoch": 0.23281352954096202, + "grad_norm": 1.9908030033111572, + "learning_rate": 3.878111273792094e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7161983251571655, + "num_tokens": 54981134.0, + "step": 2120 + }, + { + "epoch": 0.23292334724357566, + "grad_norm": 2.648451566696167, + "learning_rate": 3.8799414348462664e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6968844532966614, + "num_tokens": 54998636.0, + "step": 2121 + }, + { + "epoch": 0.23303316494618934, + "grad_norm": 2.4454452991485596, + "learning_rate": 3.8817715959004395e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6841943264007568, + "num_tokens": 55020317.0, + "step": 2122 + }, + { + "epoch": 0.23314298264880298, + "grad_norm": 2.3127658367156982, + "learning_rate": 3.8836017569546125e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6977212429046631, + "num_tokens": 55044327.0, + "step": 2123 + }, + { + "epoch": 0.23325280035141666, + "grad_norm": 2.21136736869812, + "learning_rate": 3.8854319180087855e-06, + "loss": 1.1439, + "mean_token_accuracy": 0.6651854515075684, + "num_tokens": 55068413.0, + "step": 2124 + }, + { + "epoch": 0.2333626180540303, + "grad_norm": 2.194323778152466, + "learning_rate": 3.887262079062958e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6911355257034302, + "num_tokens": 55093595.0, + "step": 2125 + }, + { + "epoch": 0.23347243575664398, + "grad_norm": 2.115098237991333, + "learning_rate": 3.889092240117131e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6995149850845337, + "num_tokens": 55123942.0, + "step": 2126 + }, + { + "epoch": 0.23358225345925762, + "grad_norm": 2.3688948154449463, + "learning_rate": 3.890922401171303e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6905845403671265, + "num_tokens": 55148395.0, + "step": 2127 + }, + { + "epoch": 0.2336920711618713, + "grad_norm": 1.9595839977264404, + "learning_rate": 3.892752562225477e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6815561056137085, + "num_tokens": 55177668.0, + "step": 2128 + }, + { + "epoch": 0.23380188886448494, + "grad_norm": 2.076507806777954, + "learning_rate": 3.894582723279649e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6812573075294495, + "num_tokens": 55203995.0, + "step": 2129 + }, + { + "epoch": 0.23391170656709862, + "grad_norm": 1.9108835458755493, + "learning_rate": 3.896412884333822e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7094050049781799, + "num_tokens": 55232930.0, + "step": 2130 + }, + { + "epoch": 0.2340215242697123, + "grad_norm": 2.3908705711364746, + "learning_rate": 3.898243045387994e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7204698324203491, + "num_tokens": 55256088.0, + "step": 2131 + }, + { + "epoch": 0.23413134197232593, + "grad_norm": 2.334228038787842, + "learning_rate": 3.900073206442167e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7172691822052002, + "num_tokens": 55278366.0, + "step": 2132 + }, + { + "epoch": 0.2342411596749396, + "grad_norm": 2.31619930267334, + "learning_rate": 3.90190336749634e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6950448155403137, + "num_tokens": 55303188.0, + "step": 2133 + }, + { + "epoch": 0.23435097737755325, + "grad_norm": 2.3097712993621826, + "learning_rate": 3.903733528550513e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7066102027893066, + "num_tokens": 55326128.0, + "step": 2134 + }, + { + "epoch": 0.23446079508016693, + "grad_norm": 2.146620273590088, + "learning_rate": 3.905563689604685e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7049587965011597, + "num_tokens": 55352488.0, + "step": 2135 + }, + { + "epoch": 0.23457061278278057, + "grad_norm": 2.1861324310302734, + "learning_rate": 3.907393850658858e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6981000900268555, + "num_tokens": 55377068.0, + "step": 2136 + }, + { + "epoch": 0.23468043048539425, + "grad_norm": 2.163891553878784, + "learning_rate": 3.909224011713031e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7107859253883362, + "num_tokens": 55400021.0, + "step": 2137 + }, + { + "epoch": 0.2347902481880079, + "grad_norm": 2.1253275871276855, + "learning_rate": 3.911054172767203e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.6868206858634949, + "num_tokens": 55425526.0, + "step": 2138 + }, + { + "epoch": 0.23490006589062157, + "grad_norm": 1.9766643047332764, + "learning_rate": 3.912884333821376e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.6782291531562805, + "num_tokens": 55454631.0, + "step": 2139 + }, + { + "epoch": 0.23500988359323524, + "grad_norm": 2.1324880123138428, + "learning_rate": 3.914714494875549e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.691935658454895, + "num_tokens": 55480020.0, + "step": 2140 + }, + { + "epoch": 0.2351197012958489, + "grad_norm": 2.052243709564209, + "learning_rate": 3.916544655929722e-06, + "loss": 1.036, + "mean_token_accuracy": 0.688992977142334, + "num_tokens": 55507039.0, + "step": 2141 + }, + { + "epoch": 0.23522951899846256, + "grad_norm": 2.2161238193511963, + "learning_rate": 3.918374816983895e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6967448592185974, + "num_tokens": 55530041.0, + "step": 2142 + }, + { + "epoch": 0.2353393367010762, + "grad_norm": 2.186863660812378, + "learning_rate": 3.920204978038068e-06, + "loss": 1.1128, + "mean_token_accuracy": 0.668769359588623, + "num_tokens": 55557788.0, + "step": 2143 + }, + { + "epoch": 0.23544915440368988, + "grad_norm": 2.0927460193634033, + "learning_rate": 3.922035139092241e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6915116906166077, + "num_tokens": 55588413.0, + "step": 2144 + }, + { + "epoch": 0.23555897210630353, + "grad_norm": 1.9702472686767578, + "learning_rate": 3.923865300146414e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7048547267913818, + "num_tokens": 55616805.0, + "step": 2145 + }, + { + "epoch": 0.2356687898089172, + "grad_norm": 2.2474937438964844, + "learning_rate": 3.925695461200586e-06, + "loss": 0.989, + "mean_token_accuracy": 0.6998417377471924, + "num_tokens": 55642716.0, + "step": 2146 + }, + { + "epoch": 0.23577860751153085, + "grad_norm": 1.944922924041748, + "learning_rate": 3.927525622254759e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.69391930103302, + "num_tokens": 55674241.0, + "step": 2147 + }, + { + "epoch": 0.23588842521414452, + "grad_norm": 2.176210403442383, + "learning_rate": 3.929355783308931e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6936383247375488, + "num_tokens": 55701286.0, + "step": 2148 + }, + { + "epoch": 0.2359982429167582, + "grad_norm": 2.2756874561309814, + "learning_rate": 3.931185944363105e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.698795735836029, + "num_tokens": 55724455.0, + "step": 2149 + }, + { + "epoch": 0.23610806061937184, + "grad_norm": 2.128307580947876, + "learning_rate": 3.933016105417277e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6970449090003967, + "num_tokens": 55748626.0, + "step": 2150 + }, + { + "epoch": 0.2362178783219855, + "grad_norm": 2.086904525756836, + "learning_rate": 3.93484626647145e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7200641632080078, + "num_tokens": 55774495.0, + "step": 2151 + }, + { + "epoch": 0.23632769602459916, + "grad_norm": 2.21901273727417, + "learning_rate": 3.936676427525622e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6901136636734009, + "num_tokens": 55799697.0, + "step": 2152 + }, + { + "epoch": 0.23643751372721283, + "grad_norm": 1.927117109298706, + "learning_rate": 3.938506588579795e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.687796950340271, + "num_tokens": 55831797.0, + "step": 2153 + }, + { + "epoch": 0.23654733142982648, + "grad_norm": 2.152663230895996, + "learning_rate": 3.940336749633968e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7126407623291016, + "num_tokens": 55857019.0, + "step": 2154 + }, + { + "epoch": 0.23665714913244015, + "grad_norm": 2.110880136489868, + "learning_rate": 3.942166910688141e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.689947247505188, + "num_tokens": 55884875.0, + "step": 2155 + }, + { + "epoch": 0.2367669668350538, + "grad_norm": 2.354595184326172, + "learning_rate": 3.943997071742313e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7115057706832886, + "num_tokens": 55906231.0, + "step": 2156 + }, + { + "epoch": 0.23687678453766747, + "grad_norm": 2.118668556213379, + "learning_rate": 3.945827232796486e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6888183355331421, + "num_tokens": 55933987.0, + "step": 2157 + }, + { + "epoch": 0.23698660224028115, + "grad_norm": 2.3495492935180664, + "learning_rate": 3.947657393850659e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7143534421920776, + "num_tokens": 55954611.0, + "step": 2158 + }, + { + "epoch": 0.2370964199428948, + "grad_norm": 1.8754544258117676, + "learning_rate": 3.9494875549048315e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6948108673095703, + "num_tokens": 55982878.0, + "step": 2159 + }, + { + "epoch": 0.23720623764550847, + "grad_norm": 2.545301914215088, + "learning_rate": 3.9513177159590045e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7072409391403198, + "num_tokens": 56002661.0, + "step": 2160 + }, + { + "epoch": 0.2373160553481221, + "grad_norm": 2.07299542427063, + "learning_rate": 3.9531478770131775e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6943687200546265, + "num_tokens": 56030275.0, + "step": 2161 + }, + { + "epoch": 0.2374258730507358, + "grad_norm": 2.108272075653076, + "learning_rate": 3.9549780380673505e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6943680047988892, + "num_tokens": 56055517.0, + "step": 2162 + }, + { + "epoch": 0.23753569075334943, + "grad_norm": 2.1094393730163574, + "learning_rate": 3.956808199121523e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7144309282302856, + "num_tokens": 56081280.0, + "step": 2163 + }, + { + "epoch": 0.2376455084559631, + "grad_norm": 2.3139660358428955, + "learning_rate": 3.958638360175696e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.694218635559082, + "num_tokens": 56103016.0, + "step": 2164 + }, + { + "epoch": 0.23775532615857675, + "grad_norm": 2.0091381072998047, + "learning_rate": 3.960468521229869e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7051429152488708, + "num_tokens": 56130657.0, + "step": 2165 + }, + { + "epoch": 0.23786514386119043, + "grad_norm": 1.9692599773406982, + "learning_rate": 3.962298682284042e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6967153549194336, + "num_tokens": 56158931.0, + "step": 2166 + }, + { + "epoch": 0.23797496156380407, + "grad_norm": 2.509476661682129, + "learning_rate": 3.964128843338214e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7007867693901062, + "num_tokens": 56179815.0, + "step": 2167 + }, + { + "epoch": 0.23808477926641775, + "grad_norm": 2.324052572250366, + "learning_rate": 3.965959004392387e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7108813524246216, + "num_tokens": 56200875.0, + "step": 2168 + }, + { + "epoch": 0.23819459696903142, + "grad_norm": 2.070258617401123, + "learning_rate": 3.967789165446559e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6973171830177307, + "num_tokens": 56225901.0, + "step": 2169 + }, + { + "epoch": 0.23830441467164507, + "grad_norm": 2.385042428970337, + "learning_rate": 3.969619326500732e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6985504627227783, + "num_tokens": 56249316.0, + "step": 2170 + }, + { + "epoch": 0.23841423237425874, + "grad_norm": 2.0414834022521973, + "learning_rate": 3.971449487554905e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7132260799407959, + "num_tokens": 56275848.0, + "step": 2171 + }, + { + "epoch": 0.23852405007687238, + "grad_norm": 2.3066749572753906, + "learning_rate": 3.973279648609078e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7149648666381836, + "num_tokens": 56297877.0, + "step": 2172 + }, + { + "epoch": 0.23863386777948606, + "grad_norm": 2.304131031036377, + "learning_rate": 3.97510980966325e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6989845037460327, + "num_tokens": 56322814.0, + "step": 2173 + }, + { + "epoch": 0.2387436854820997, + "grad_norm": 2.029400587081909, + "learning_rate": 3.976939970717423e-06, + "loss": 1.1027, + "mean_token_accuracy": 0.6697636246681213, + "num_tokens": 56352631.0, + "step": 2174 + }, + { + "epoch": 0.23885350318471338, + "grad_norm": 2.1259772777557373, + "learning_rate": 3.978770131771596e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7199119329452515, + "num_tokens": 56377769.0, + "step": 2175 + }, + { + "epoch": 0.23896332088732702, + "grad_norm": 2.26544451713562, + "learning_rate": 3.980600292825769e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7032766938209534, + "num_tokens": 56399978.0, + "step": 2176 + }, + { + "epoch": 0.2390731385899407, + "grad_norm": 2.1976985931396484, + "learning_rate": 3.9824304538799415e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7090914249420166, + "num_tokens": 56425758.0, + "step": 2177 + }, + { + "epoch": 0.23918295629255437, + "grad_norm": 2.1554079055786133, + "learning_rate": 3.9842606149341145e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.714906632900238, + "num_tokens": 56450652.0, + "step": 2178 + }, + { + "epoch": 0.23929277399516802, + "grad_norm": 2.2745845317840576, + "learning_rate": 3.9860907759882875e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7069060802459717, + "num_tokens": 56473586.0, + "step": 2179 + }, + { + "epoch": 0.2394025916977817, + "grad_norm": 2.0702202320098877, + "learning_rate": 3.98792093704246e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6948245763778687, + "num_tokens": 56503308.0, + "step": 2180 + }, + { + "epoch": 0.23951240940039534, + "grad_norm": 2.201359748840332, + "learning_rate": 3.989751098096633e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6746801137924194, + "num_tokens": 56528617.0, + "step": 2181 + }, + { + "epoch": 0.239622227103009, + "grad_norm": 1.9486209154129028, + "learning_rate": 3.991581259150806e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.700748860836029, + "num_tokens": 56557396.0, + "step": 2182 + }, + { + "epoch": 0.23973204480562266, + "grad_norm": 2.063616991043091, + "learning_rate": 3.993411420204979e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7113109827041626, + "num_tokens": 56585119.0, + "step": 2183 + }, + { + "epoch": 0.23984186250823633, + "grad_norm": 2.00408935546875, + "learning_rate": 3.995241581259151e-06, + "loss": 0.9, + "mean_token_accuracy": 0.722685694694519, + "num_tokens": 56610155.0, + "step": 2184 + }, + { + "epoch": 0.23995168021084998, + "grad_norm": 2.4112260341644287, + "learning_rate": 3.997071742313324e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6944625973701477, + "num_tokens": 56630138.0, + "step": 2185 + }, + { + "epoch": 0.24006149791346365, + "grad_norm": 2.0450198650360107, + "learning_rate": 3.998901903367496e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7055473923683167, + "num_tokens": 56658635.0, + "step": 2186 + }, + { + "epoch": 0.24017131561607732, + "grad_norm": 2.399503469467163, + "learning_rate": 4.00073206442167e-06, + "loss": 1.0568, + "mean_token_accuracy": 0.6933497190475464, + "num_tokens": 56679426.0, + "step": 2187 + }, + { + "epoch": 0.24028113331869097, + "grad_norm": 2.226248264312744, + "learning_rate": 4.002562225475842e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.7049052715301514, + "num_tokens": 56705495.0, + "step": 2188 + }, + { + "epoch": 0.24039095102130464, + "grad_norm": 2.1982991695404053, + "learning_rate": 4.004392386530015e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7053216099739075, + "num_tokens": 56729615.0, + "step": 2189 + }, + { + "epoch": 0.2405007687239183, + "grad_norm": 2.2277328968048096, + "learning_rate": 4.006222547584187e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7112113237380981, + "num_tokens": 56755717.0, + "step": 2190 + }, + { + "epoch": 0.24061058642653196, + "grad_norm": 2.17999005317688, + "learning_rate": 4.00805270863836e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6897125244140625, + "num_tokens": 56781731.0, + "step": 2191 + }, + { + "epoch": 0.2407204041291456, + "grad_norm": 2.3231043815612793, + "learning_rate": 4.009882869692533e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6871398687362671, + "num_tokens": 56805057.0, + "step": 2192 + }, + { + "epoch": 0.24083022183175928, + "grad_norm": 2.137451171875, + "learning_rate": 4.011713030746706e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6944519281387329, + "num_tokens": 56831065.0, + "step": 2193 + }, + { + "epoch": 0.24094003953437293, + "grad_norm": 2.1043550968170166, + "learning_rate": 4.013543191800878e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7065581679344177, + "num_tokens": 56855473.0, + "step": 2194 + }, + { + "epoch": 0.2410498572369866, + "grad_norm": 2.1367104053497314, + "learning_rate": 4.015373352855051e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7067184448242188, + "num_tokens": 56883955.0, + "step": 2195 + }, + { + "epoch": 0.24115967493960028, + "grad_norm": 2.2057952880859375, + "learning_rate": 4.017203513909224e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.693402886390686, + "num_tokens": 56907862.0, + "step": 2196 + }, + { + "epoch": 0.24126949264221392, + "grad_norm": 2.0226969718933105, + "learning_rate": 4.0190336749633974e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6961044073104858, + "num_tokens": 56938145.0, + "step": 2197 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 2.074618101119995, + "learning_rate": 4.0208638360175704e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7079652547836304, + "num_tokens": 56965599.0, + "step": 2198 + }, + { + "epoch": 0.24148912804744124, + "grad_norm": 2.0260958671569824, + "learning_rate": 4.022693997071743e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6946361660957336, + "num_tokens": 56994723.0, + "step": 2199 + }, + { + "epoch": 0.24159894575005492, + "grad_norm": 2.4897146224975586, + "learning_rate": 4.024524158125916e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6985527276992798, + "num_tokens": 57015573.0, + "step": 2200 + }, + { + "epoch": 0.24170876345266856, + "grad_norm": 2.412109613418579, + "learning_rate": 4.026354319180088e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7017049789428711, + "num_tokens": 57037993.0, + "step": 2201 + }, + { + "epoch": 0.24181858115528224, + "grad_norm": 2.0295584201812744, + "learning_rate": 4.028184480234261e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6927420496940613, + "num_tokens": 57069762.0, + "step": 2202 + }, + { + "epoch": 0.24192839885789588, + "grad_norm": 2.3507468700408936, + "learning_rate": 4.030014641288434e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7046170234680176, + "num_tokens": 57091801.0, + "step": 2203 + }, + { + "epoch": 0.24203821656050956, + "grad_norm": 2.1605794429779053, + "learning_rate": 4.031844802342607e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6822352409362793, + "num_tokens": 57117398.0, + "step": 2204 + }, + { + "epoch": 0.2421480342631232, + "grad_norm": 2.1621150970458984, + "learning_rate": 4.033674963396779e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6873182058334351, + "num_tokens": 57145573.0, + "step": 2205 + }, + { + "epoch": 0.24225785196573688, + "grad_norm": 2.1169261932373047, + "learning_rate": 4.035505124450952e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6885799169540405, + "num_tokens": 57172861.0, + "step": 2206 + }, + { + "epoch": 0.24236766966835055, + "grad_norm": 2.333606719970703, + "learning_rate": 4.037335285505124e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6889019012451172, + "num_tokens": 57195312.0, + "step": 2207 + }, + { + "epoch": 0.2424774873709642, + "grad_norm": 2.064140558242798, + "learning_rate": 4.039165446559298e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6964078545570374, + "num_tokens": 57221548.0, + "step": 2208 + }, + { + "epoch": 0.24258730507357787, + "grad_norm": 2.218332529067993, + "learning_rate": 4.04099560761347e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6970848441123962, + "num_tokens": 57245242.0, + "step": 2209 + }, + { + "epoch": 0.24269712277619152, + "grad_norm": 2.401156187057495, + "learning_rate": 4.042825768667643e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6776368618011475, + "num_tokens": 57268963.0, + "step": 2210 + }, + { + "epoch": 0.2428069404788052, + "grad_norm": 2.306330680847168, + "learning_rate": 4.044655929721815e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6963058710098267, + "num_tokens": 57291171.0, + "step": 2211 + }, + { + "epoch": 0.24291675818141883, + "grad_norm": 2.251354932785034, + "learning_rate": 4.046486090775988e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6994779109954834, + "num_tokens": 57317556.0, + "step": 2212 + }, + { + "epoch": 0.2430265758840325, + "grad_norm": 2.4128060340881348, + "learning_rate": 4.048316251830161e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7004298567771912, + "num_tokens": 57337534.0, + "step": 2213 + }, + { + "epoch": 0.24313639358664615, + "grad_norm": 2.0857155323028564, + "learning_rate": 4.050146412884334e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6984939575195312, + "num_tokens": 57364401.0, + "step": 2214 + }, + { + "epoch": 0.24324621128925983, + "grad_norm": 2.3682682514190674, + "learning_rate": 4.051976573938507e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6943387985229492, + "num_tokens": 57387516.0, + "step": 2215 + }, + { + "epoch": 0.2433560289918735, + "grad_norm": 2.1735470294952393, + "learning_rate": 4.0538067349926795e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6936110854148865, + "num_tokens": 57412782.0, + "step": 2216 + }, + { + "epoch": 0.24346584669448715, + "grad_norm": 2.0409746170043945, + "learning_rate": 4.0556368960468526e-06, + "loss": 1.0935, + "mean_token_accuracy": 0.6724249720573425, + "num_tokens": 57442981.0, + "step": 2217 + }, + { + "epoch": 0.24357566439710082, + "grad_norm": 2.144796371459961, + "learning_rate": 4.057467057101025e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7132107019424438, + "num_tokens": 57466342.0, + "step": 2218 + }, + { + "epoch": 0.24368548209971447, + "grad_norm": 2.3433592319488525, + "learning_rate": 4.0592972181551986e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7140165567398071, + "num_tokens": 57488515.0, + "step": 2219 + }, + { + "epoch": 0.24379529980232814, + "grad_norm": 2.09291672706604, + "learning_rate": 4.061127379209371e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6939198970794678, + "num_tokens": 57517505.0, + "step": 2220 + }, + { + "epoch": 0.2439051175049418, + "grad_norm": 2.360901117324829, + "learning_rate": 4.062957540263544e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7197234034538269, + "num_tokens": 57538002.0, + "step": 2221 + }, + { + "epoch": 0.24401493520755546, + "grad_norm": 2.102224588394165, + "learning_rate": 4.064787701317716e-06, + "loss": 1.133, + "mean_token_accuracy": 0.6618860960006714, + "num_tokens": 57567958.0, + "step": 2222 + }, + { + "epoch": 0.2441247529101691, + "grad_norm": 1.9941717386245728, + "learning_rate": 4.066617862371889e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7076415419578552, + "num_tokens": 57595798.0, + "step": 2223 + }, + { + "epoch": 0.24423457061278278, + "grad_norm": 2.189671277999878, + "learning_rate": 4.068448023426062e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.6992936134338379, + "num_tokens": 57619205.0, + "step": 2224 + }, + { + "epoch": 0.24434438831539645, + "grad_norm": 2.0975842475891113, + "learning_rate": 4.070278184480235e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7198361158370972, + "num_tokens": 57645888.0, + "step": 2225 + }, + { + "epoch": 0.2444542060180101, + "grad_norm": 2.5894775390625, + "learning_rate": 4.072108345534407e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7022230625152588, + "num_tokens": 57665159.0, + "step": 2226 + }, + { + "epoch": 0.24456402372062377, + "grad_norm": 2.3751237392425537, + "learning_rate": 4.07393850658858e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6876435875892639, + "num_tokens": 57690920.0, + "step": 2227 + }, + { + "epoch": 0.24467384142323742, + "grad_norm": 2.2622854709625244, + "learning_rate": 4.075768667642752e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7316443920135498, + "num_tokens": 57712819.0, + "step": 2228 + }, + { + "epoch": 0.2447836591258511, + "grad_norm": 2.114708662033081, + "learning_rate": 4.077598828696926e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7098867893218994, + "num_tokens": 57739517.0, + "step": 2229 + }, + { + "epoch": 0.24489347682846474, + "grad_norm": 2.1160638332366943, + "learning_rate": 4.079428989751098e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6878975033760071, + "num_tokens": 57767022.0, + "step": 2230 + }, + { + "epoch": 0.2450032945310784, + "grad_norm": 2.33954119682312, + "learning_rate": 4.081259150805271e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7036498188972473, + "num_tokens": 57791722.0, + "step": 2231 + }, + { + "epoch": 0.24511311223369206, + "grad_norm": 2.1888866424560547, + "learning_rate": 4.083089311859444e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7031255960464478, + "num_tokens": 57816447.0, + "step": 2232 + }, + { + "epoch": 0.24522292993630573, + "grad_norm": 1.8480201959609985, + "learning_rate": 4.0849194729136165e-06, + "loss": 1.1022, + "mean_token_accuracy": 0.6658292412757874, + "num_tokens": 57848671.0, + "step": 2233 + }, + { + "epoch": 0.2453327476389194, + "grad_norm": 2.197723865509033, + "learning_rate": 4.0867496339677895e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7072080373764038, + "num_tokens": 57873232.0, + "step": 2234 + }, + { + "epoch": 0.24544256534153305, + "grad_norm": 2.528799057006836, + "learning_rate": 4.0885797950219625e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6951351761817932, + "num_tokens": 57897789.0, + "step": 2235 + }, + { + "epoch": 0.24555238304414673, + "grad_norm": 2.206258773803711, + "learning_rate": 4.0904099560761355e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6964821219444275, + "num_tokens": 57924495.0, + "step": 2236 + }, + { + "epoch": 0.24566220074676037, + "grad_norm": 2.1767477989196777, + "learning_rate": 4.092240117130308e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7094863653182983, + "num_tokens": 57950781.0, + "step": 2237 + }, + { + "epoch": 0.24577201844937405, + "grad_norm": 2.2772774696350098, + "learning_rate": 4.094070278184481e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6950737833976746, + "num_tokens": 57972957.0, + "step": 2238 + }, + { + "epoch": 0.2458818361519877, + "grad_norm": 2.3556082248687744, + "learning_rate": 4.095900439238653e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7012699842453003, + "num_tokens": 57996357.0, + "step": 2239 + }, + { + "epoch": 0.24599165385460137, + "grad_norm": 1.9883756637573242, + "learning_rate": 4.097730600292827e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6967004537582397, + "num_tokens": 58029104.0, + "step": 2240 + }, + { + "epoch": 0.246101471557215, + "grad_norm": 2.157850980758667, + "learning_rate": 4.099560761346999e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7043622732162476, + "num_tokens": 58053191.0, + "step": 2241 + }, + { + "epoch": 0.2462112892598287, + "grad_norm": 2.237034559249878, + "learning_rate": 4.101390922401172e-06, + "loss": 1.07, + "mean_token_accuracy": 0.6787869930267334, + "num_tokens": 58077448.0, + "step": 2242 + }, + { + "epoch": 0.24632110696244233, + "grad_norm": 1.8771833181381226, + "learning_rate": 4.103221083455344e-06, + "loss": 1.0816, + "mean_token_accuracy": 0.6774435043334961, + "num_tokens": 58109999.0, + "step": 2243 + }, + { + "epoch": 0.246430924665056, + "grad_norm": 2.052947759628296, + "learning_rate": 4.105051244509517e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6791210174560547, + "num_tokens": 58141322.0, + "step": 2244 + }, + { + "epoch": 0.24654074236766968, + "grad_norm": 2.4137790203094482, + "learning_rate": 4.10688140556369e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.689058780670166, + "num_tokens": 58164008.0, + "step": 2245 + }, + { + "epoch": 0.24665056007028333, + "grad_norm": 2.005048990249634, + "learning_rate": 4.108711566617863e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6960228681564331, + "num_tokens": 58190469.0, + "step": 2246 + }, + { + "epoch": 0.246760377772897, + "grad_norm": 2.0642900466918945, + "learning_rate": 4.110541727672035e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7068517208099365, + "num_tokens": 58216202.0, + "step": 2247 + }, + { + "epoch": 0.24687019547551065, + "grad_norm": 2.0930535793304443, + "learning_rate": 4.112371888726208e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7160125970840454, + "num_tokens": 58240249.0, + "step": 2248 + }, + { + "epoch": 0.24698001317812432, + "grad_norm": 2.1253528594970703, + "learning_rate": 4.114202049780381e-06, + "loss": 1.0946, + "mean_token_accuracy": 0.6695959568023682, + "num_tokens": 58270097.0, + "step": 2249 + }, + { + "epoch": 0.24708983088073797, + "grad_norm": 2.2137277126312256, + "learning_rate": 4.116032210834553e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6856048107147217, + "num_tokens": 58294885.0, + "step": 2250 + }, + { + "epoch": 0.24719964858335164, + "grad_norm": 2.5120341777801514, + "learning_rate": 4.1178623718887264e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6916258335113525, + "num_tokens": 58314415.0, + "step": 2251 + }, + { + "epoch": 0.24730946628596528, + "grad_norm": 2.0625810623168945, + "learning_rate": 4.1196925329428994e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.6725942492485046, + "num_tokens": 58343697.0, + "step": 2252 + }, + { + "epoch": 0.24741928398857896, + "grad_norm": 2.128310203552246, + "learning_rate": 4.1215226939970724e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6833791732788086, + "num_tokens": 58369718.0, + "step": 2253 + }, + { + "epoch": 0.24752910169119263, + "grad_norm": 2.0708651542663574, + "learning_rate": 4.123352855051245e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.719322144985199, + "num_tokens": 58396372.0, + "step": 2254 + }, + { + "epoch": 0.24763891939380628, + "grad_norm": 2.090420722961426, + "learning_rate": 4.125183016105418e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7130087614059448, + "num_tokens": 58422960.0, + "step": 2255 + }, + { + "epoch": 0.24774873709641995, + "grad_norm": 2.2838258743286133, + "learning_rate": 4.127013177159591e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7210384607315063, + "num_tokens": 58444372.0, + "step": 2256 + }, + { + "epoch": 0.2478585547990336, + "grad_norm": 2.0891025066375732, + "learning_rate": 4.128843338213764e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6952673196792603, + "num_tokens": 58470348.0, + "step": 2257 + }, + { + "epoch": 0.24796837250164727, + "grad_norm": 2.3148176670074463, + "learning_rate": 4.130673499267936e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7130753993988037, + "num_tokens": 58490511.0, + "step": 2258 + }, + { + "epoch": 0.24807819020426092, + "grad_norm": 2.1575047969818115, + "learning_rate": 4.132503660322109e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6972298622131348, + "num_tokens": 58514401.0, + "step": 2259 + }, + { + "epoch": 0.2481880079068746, + "grad_norm": 2.152855634689331, + "learning_rate": 4.134333821376281e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.711506724357605, + "num_tokens": 58539369.0, + "step": 2260 + }, + { + "epoch": 0.24829782560948824, + "grad_norm": 2.0688490867614746, + "learning_rate": 4.136163982430455e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6925662159919739, + "num_tokens": 58567212.0, + "step": 2261 + }, + { + "epoch": 0.2484076433121019, + "grad_norm": 2.224217414855957, + "learning_rate": 4.137994143484627e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6930724382400513, + "num_tokens": 58591765.0, + "step": 2262 + }, + { + "epoch": 0.24851746101471558, + "grad_norm": 1.9842098951339722, + "learning_rate": 4.1398243045388e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6796068549156189, + "num_tokens": 58622400.0, + "step": 2263 + }, + { + "epoch": 0.24862727871732923, + "grad_norm": 2.402411699295044, + "learning_rate": 4.141654465592972e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.686815619468689, + "num_tokens": 58643920.0, + "step": 2264 + }, + { + "epoch": 0.2487370964199429, + "grad_norm": 1.9848308563232422, + "learning_rate": 4.143484626647145e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6810168623924255, + "num_tokens": 58674217.0, + "step": 2265 + }, + { + "epoch": 0.24884691412255655, + "grad_norm": 2.079120397567749, + "learning_rate": 4.145314787701318e-06, + "loss": 1.0654, + "mean_token_accuracy": 0.6852896213531494, + "num_tokens": 58701611.0, + "step": 2266 + }, + { + "epoch": 0.24895673182517022, + "grad_norm": 2.0023140907287598, + "learning_rate": 4.147144948755491e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7045924067497253, + "num_tokens": 58728659.0, + "step": 2267 + }, + { + "epoch": 0.24906654952778387, + "grad_norm": 2.102248430252075, + "learning_rate": 4.148975109809663e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6912795901298523, + "num_tokens": 58755394.0, + "step": 2268 + }, + { + "epoch": 0.24917636723039754, + "grad_norm": 1.9485872983932495, + "learning_rate": 4.150805270863836e-06, + "loss": 1.0925, + "mean_token_accuracy": 0.6796934008598328, + "num_tokens": 58784085.0, + "step": 2269 + }, + { + "epoch": 0.2492861849330112, + "grad_norm": 2.3226847648620605, + "learning_rate": 4.152635431918009e-06, + "loss": 1.1056, + "mean_token_accuracy": 0.6750585436820984, + "num_tokens": 58808338.0, + "step": 2270 + }, + { + "epoch": 0.24939600263562486, + "grad_norm": 1.9777439832687378, + "learning_rate": 4.1544655929721815e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7029304504394531, + "num_tokens": 58836074.0, + "step": 2271 + }, + { + "epoch": 0.24950582033823854, + "grad_norm": 2.1521811485290527, + "learning_rate": 4.1562957540263546e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6947702169418335, + "num_tokens": 58859539.0, + "step": 2272 + }, + { + "epoch": 0.24961563804085218, + "grad_norm": 2.2973721027374268, + "learning_rate": 4.1581259150805276e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6893110871315002, + "num_tokens": 58880410.0, + "step": 2273 + }, + { + "epoch": 0.24972545574346586, + "grad_norm": 1.8499683141708374, + "learning_rate": 4.1599560761347006e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6820756196975708, + "num_tokens": 58913684.0, + "step": 2274 + }, + { + "epoch": 0.2498352734460795, + "grad_norm": 2.0082316398620605, + "learning_rate": 4.161786237188873e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6869965195655823, + "num_tokens": 58941437.0, + "step": 2275 + }, + { + "epoch": 0.24994509114869318, + "grad_norm": 2.404736280441284, + "learning_rate": 4.163616398243046e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7103821039199829, + "num_tokens": 58960547.0, + "step": 2276 + }, + { + "epoch": 0.2500549088513068, + "grad_norm": 2.1061532497406006, + "learning_rate": 4.165446559297219e-06, + "loss": 1.1394, + "mean_token_accuracy": 0.6626806259155273, + "num_tokens": 58987953.0, + "step": 2277 + }, + { + "epoch": 0.25016472655392047, + "grad_norm": 2.127122402191162, + "learning_rate": 4.167276720351392e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7192547917366028, + "num_tokens": 59011919.0, + "step": 2278 + }, + { + "epoch": 0.25027454425653417, + "grad_norm": 2.2223117351531982, + "learning_rate": 4.169106881405564e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7084774971008301, + "num_tokens": 59033880.0, + "step": 2279 + }, + { + "epoch": 0.2503843619591478, + "grad_norm": 2.2376132011413574, + "learning_rate": 4.170937042459737e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6928561925888062, + "num_tokens": 59055521.0, + "step": 2280 + }, + { + "epoch": 0.25049417966176146, + "grad_norm": 2.00703501701355, + "learning_rate": 4.172767203513909e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7153581976890564, + "num_tokens": 59081304.0, + "step": 2281 + }, + { + "epoch": 0.25060399736437516, + "grad_norm": 2.3182809352874756, + "learning_rate": 4.174597364568082e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.707443118095398, + "num_tokens": 59103201.0, + "step": 2282 + }, + { + "epoch": 0.2507138150669888, + "grad_norm": 2.013986825942993, + "learning_rate": 4.176427525622255e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6793150901794434, + "num_tokens": 59133686.0, + "step": 2283 + }, + { + "epoch": 0.25082363276960246, + "grad_norm": 2.1366889476776123, + "learning_rate": 4.178257686676428e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6871165037155151, + "num_tokens": 59157314.0, + "step": 2284 + }, + { + "epoch": 0.2509334504722161, + "grad_norm": 1.849265456199646, + "learning_rate": 4.1800878477306e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.6754720211029053, + "num_tokens": 59190437.0, + "step": 2285 + }, + { + "epoch": 0.2510432681748298, + "grad_norm": 1.9922195672988892, + "learning_rate": 4.181918008784773e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6842412948608398, + "num_tokens": 59216585.0, + "step": 2286 + }, + { + "epoch": 0.25115308587744345, + "grad_norm": 2.182142496109009, + "learning_rate": 4.183748169838946e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7043126821517944, + "num_tokens": 59240642.0, + "step": 2287 + }, + { + "epoch": 0.2512629035800571, + "grad_norm": 2.1013617515563965, + "learning_rate": 4.185578330893119e-06, + "loss": 1.0972, + "mean_token_accuracy": 0.6727428436279297, + "num_tokens": 59268101.0, + "step": 2288 + }, + { + "epoch": 0.25137272128267074, + "grad_norm": 1.99204683303833, + "learning_rate": 4.1874084919472915e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7058904767036438, + "num_tokens": 59298070.0, + "step": 2289 + }, + { + "epoch": 0.25148253898528444, + "grad_norm": 2.1032681465148926, + "learning_rate": 4.1892386530014645e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7117304801940918, + "num_tokens": 59325547.0, + "step": 2290 + }, + { + "epoch": 0.2515923566878981, + "grad_norm": 2.2671563625335693, + "learning_rate": 4.1910688140556375e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7358466386795044, + "num_tokens": 59348467.0, + "step": 2291 + }, + { + "epoch": 0.25170217439051173, + "grad_norm": 2.028935194015503, + "learning_rate": 4.19289897510981e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7211266160011292, + "num_tokens": 59375138.0, + "step": 2292 + }, + { + "epoch": 0.25181199209312544, + "grad_norm": 2.261834144592285, + "learning_rate": 4.194729136163983e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7234324216842651, + "num_tokens": 59396876.0, + "step": 2293 + }, + { + "epoch": 0.2519218097957391, + "grad_norm": 1.7911012172698975, + "learning_rate": 4.196559297218156e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6963852643966675, + "num_tokens": 59431562.0, + "step": 2294 + }, + { + "epoch": 0.25203162749835273, + "grad_norm": 2.1193947792053223, + "learning_rate": 4.198389458272329e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.6768815517425537, + "num_tokens": 59458343.0, + "step": 2295 + }, + { + "epoch": 0.2521414452009664, + "grad_norm": 2.0265188217163086, + "learning_rate": 4.200219619326501e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6952103972434998, + "num_tokens": 59485007.0, + "step": 2296 + }, + { + "epoch": 0.2522512629035801, + "grad_norm": 1.8675094842910767, + "learning_rate": 4.202049780380674e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6829603910446167, + "num_tokens": 59516350.0, + "step": 2297 + }, + { + "epoch": 0.2523610806061937, + "grad_norm": 2.0750081539154053, + "learning_rate": 4.203879941434846e-06, + "loss": 1.1369, + "mean_token_accuracy": 0.6659868955612183, + "num_tokens": 59546230.0, + "step": 2298 + }, + { + "epoch": 0.25247089830880737, + "grad_norm": 2.1373555660247803, + "learning_rate": 4.20571010248902e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7060263156890869, + "num_tokens": 59570288.0, + "step": 2299 + }, + { + "epoch": 0.252580716011421, + "grad_norm": 2.242910861968994, + "learning_rate": 4.207540263543192e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7008090615272522, + "num_tokens": 59592929.0, + "step": 2300 + }, + { + "epoch": 0.2526905337140347, + "grad_norm": 1.9589051008224487, + "learning_rate": 4.209370424597365e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.6951314210891724, + "num_tokens": 59622284.0, + "step": 2301 + }, + { + "epoch": 0.25280035141664836, + "grad_norm": 2.012051820755005, + "learning_rate": 4.211200585651537e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6877732872962952, + "num_tokens": 59653212.0, + "step": 2302 + }, + { + "epoch": 0.252910169119262, + "grad_norm": 2.1007423400878906, + "learning_rate": 4.21303074670571e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6977248787879944, + "num_tokens": 59682834.0, + "step": 2303 + }, + { + "epoch": 0.2530199868218757, + "grad_norm": 2.158951759338379, + "learning_rate": 4.214860907759883e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7054728865623474, + "num_tokens": 59708311.0, + "step": 2304 + }, + { + "epoch": 0.25312980452448935, + "grad_norm": 2.18381929397583, + "learning_rate": 4.216691068814056e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7019619941711426, + "num_tokens": 59732916.0, + "step": 2305 + }, + { + "epoch": 0.253239622227103, + "grad_norm": 2.421536922454834, + "learning_rate": 4.2185212298682284e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7028872966766357, + "num_tokens": 59752480.0, + "step": 2306 + }, + { + "epoch": 0.25334943992971665, + "grad_norm": 2.1168124675750732, + "learning_rate": 4.2203513909224014e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.689725935459137, + "num_tokens": 59778235.0, + "step": 2307 + }, + { + "epoch": 0.25345925763233035, + "grad_norm": 2.1843175888061523, + "learning_rate": 4.2221815519765744e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6907585263252258, + "num_tokens": 59805268.0, + "step": 2308 + }, + { + "epoch": 0.253569075334944, + "grad_norm": 1.9268579483032227, + "learning_rate": 4.2240117130307475e-06, + "loss": 1.0888, + "mean_token_accuracy": 0.6823249459266663, + "num_tokens": 59836595.0, + "step": 2309 + }, + { + "epoch": 0.25367889303755764, + "grad_norm": 1.9386717081069946, + "learning_rate": 4.22584187408492e-06, + "loss": 1.0857, + "mean_token_accuracy": 0.6783442497253418, + "num_tokens": 59866823.0, + "step": 2310 + }, + { + "epoch": 0.25378871074017134, + "grad_norm": 1.9901707172393799, + "learning_rate": 4.227672035139093e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7215732932090759, + "num_tokens": 59893686.0, + "step": 2311 + }, + { + "epoch": 0.253898528442785, + "grad_norm": 2.151937961578369, + "learning_rate": 4.229502196193266e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7200161218643188, + "num_tokens": 59916919.0, + "step": 2312 + }, + { + "epoch": 0.25400834614539863, + "grad_norm": 1.9970136880874634, + "learning_rate": 4.231332357247438e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.707335889339447, + "num_tokens": 59945004.0, + "step": 2313 + }, + { + "epoch": 0.2541181638480123, + "grad_norm": 2.2469680309295654, + "learning_rate": 4.233162518301611e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6979697942733765, + "num_tokens": 59971274.0, + "step": 2314 + }, + { + "epoch": 0.254227981550626, + "grad_norm": 1.9603781700134277, + "learning_rate": 4.234992679355784e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7062933444976807, + "num_tokens": 60002461.0, + "step": 2315 + }, + { + "epoch": 0.2543377992532396, + "grad_norm": 1.994802474975586, + "learning_rate": 4.236822840409957e-06, + "loss": 0.988, + "mean_token_accuracy": 0.695576012134552, + "num_tokens": 60029307.0, + "step": 2316 + }, + { + "epoch": 0.2544476169558533, + "grad_norm": 2.0924291610717773, + "learning_rate": 4.238653001464129e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7073675394058228, + "num_tokens": 60055628.0, + "step": 2317 + }, + { + "epoch": 0.2545574346584669, + "grad_norm": 2.1249756813049316, + "learning_rate": 4.240483162518302e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6811947226524353, + "num_tokens": 60082672.0, + "step": 2318 + }, + { + "epoch": 0.2546672523610806, + "grad_norm": 1.9716508388519287, + "learning_rate": 4.242313323572474e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7006199955940247, + "num_tokens": 60110675.0, + "step": 2319 + }, + { + "epoch": 0.25477707006369427, + "grad_norm": 2.046309232711792, + "learning_rate": 4.244143484626648e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7129342555999756, + "num_tokens": 60135806.0, + "step": 2320 + }, + { + "epoch": 0.2548868877663079, + "grad_norm": 1.9581613540649414, + "learning_rate": 4.24597364568082e-06, + "loss": 1.086, + "mean_token_accuracy": 0.6776084899902344, + "num_tokens": 60165670.0, + "step": 2321 + }, + { + "epoch": 0.2549967054689216, + "grad_norm": 2.140655517578125, + "learning_rate": 4.247803806734993e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.693440318107605, + "num_tokens": 60191248.0, + "step": 2322 + }, + { + "epoch": 0.25510652317153526, + "grad_norm": 2.135150671005249, + "learning_rate": 4.249633967789165e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6800218820571899, + "num_tokens": 60217460.0, + "step": 2323 + }, + { + "epoch": 0.2552163408741489, + "grad_norm": 2.351513624191284, + "learning_rate": 4.251464128843338e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6940889954566956, + "num_tokens": 60240345.0, + "step": 2324 + }, + { + "epoch": 0.25532615857676255, + "grad_norm": 2.094175338745117, + "learning_rate": 4.253294289897511e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6734143495559692, + "num_tokens": 60267318.0, + "step": 2325 + }, + { + "epoch": 0.25543597627937625, + "grad_norm": 2.182050943374634, + "learning_rate": 4.255124450951684e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6940149068832397, + "num_tokens": 60292681.0, + "step": 2326 + }, + { + "epoch": 0.2555457939819899, + "grad_norm": 2.2391550540924072, + "learning_rate": 4.2569546120058566e-06, + "loss": 1.0923, + "mean_token_accuracy": 0.6694439053535461, + "num_tokens": 60314941.0, + "step": 2327 + }, + { + "epoch": 0.25565561168460355, + "grad_norm": 1.9896633625030518, + "learning_rate": 4.2587847730600296e-06, + "loss": 1.1094, + "mean_token_accuracy": 0.6628687381744385, + "num_tokens": 60345113.0, + "step": 2328 + }, + { + "epoch": 0.25576542938721725, + "grad_norm": 2.155662775039673, + "learning_rate": 4.260614934114203e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7070128321647644, + "num_tokens": 60369916.0, + "step": 2329 + }, + { + "epoch": 0.2558752470898309, + "grad_norm": 2.257185935974121, + "learning_rate": 4.262445095168375e-06, + "loss": 1.1065, + "mean_token_accuracy": 0.6767371892929077, + "num_tokens": 60396604.0, + "step": 2330 + }, + { + "epoch": 0.25598506479244454, + "grad_norm": 1.9492852687835693, + "learning_rate": 4.264275256222548e-06, + "loss": 1.1014, + "mean_token_accuracy": 0.6760590672492981, + "num_tokens": 60428252.0, + "step": 2331 + }, + { + "epoch": 0.2560948824950582, + "grad_norm": 1.9588537216186523, + "learning_rate": 4.266105417276721e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7172650098800659, + "num_tokens": 60456579.0, + "step": 2332 + }, + { + "epoch": 0.2562047001976719, + "grad_norm": 2.48470401763916, + "learning_rate": 4.267935578330894e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.691482663154602, + "num_tokens": 60479016.0, + "step": 2333 + }, + { + "epoch": 0.25631451790028553, + "grad_norm": 2.4493045806884766, + "learning_rate": 4.269765739385066e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6991885304450989, + "num_tokens": 60500878.0, + "step": 2334 + }, + { + "epoch": 0.2564243356028992, + "grad_norm": 2.128559112548828, + "learning_rate": 4.271595900439239e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6988646984100342, + "num_tokens": 60525370.0, + "step": 2335 + }, + { + "epoch": 0.2565341533055128, + "grad_norm": 2.3899519443511963, + "learning_rate": 4.273426061493412e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7148752808570862, + "num_tokens": 60546162.0, + "step": 2336 + }, + { + "epoch": 0.2566439710081265, + "grad_norm": 1.9750045537948608, + "learning_rate": 4.275256222547585e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7090657949447632, + "num_tokens": 60576931.0, + "step": 2337 + }, + { + "epoch": 0.25675378871074017, + "grad_norm": 2.3516910076141357, + "learning_rate": 4.277086383601757e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6971577405929565, + "num_tokens": 60598113.0, + "step": 2338 + }, + { + "epoch": 0.2568636064133538, + "grad_norm": 1.9300264120101929, + "learning_rate": 4.27891654465593e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6894029378890991, + "num_tokens": 60627128.0, + "step": 2339 + }, + { + "epoch": 0.2569734241159675, + "grad_norm": 1.8384114503860474, + "learning_rate": 4.280746705710102e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.686360239982605, + "num_tokens": 60660417.0, + "step": 2340 + }, + { + "epoch": 0.25708324181858117, + "grad_norm": 2.1920132637023926, + "learning_rate": 4.282576866764276e-06, + "loss": 1.0984, + "mean_token_accuracy": 0.6664249300956726, + "num_tokens": 60686408.0, + "step": 2341 + }, + { + "epoch": 0.2571930595211948, + "grad_norm": 1.937453031539917, + "learning_rate": 4.284407027818448e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7167497277259827, + "num_tokens": 60716495.0, + "step": 2342 + }, + { + "epoch": 0.25730287722380846, + "grad_norm": 2.338350772857666, + "learning_rate": 4.286237188872621e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6925116777420044, + "num_tokens": 60740487.0, + "step": 2343 + }, + { + "epoch": 0.25741269492642216, + "grad_norm": 2.025805950164795, + "learning_rate": 4.2880673499267935e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6943753361701965, + "num_tokens": 60768484.0, + "step": 2344 + }, + { + "epoch": 0.2575225126290358, + "grad_norm": 1.9584648609161377, + "learning_rate": 4.2898975109809665e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6991627216339111, + "num_tokens": 60797373.0, + "step": 2345 + }, + { + "epoch": 0.25763233033164945, + "grad_norm": 2.1156349182128906, + "learning_rate": 4.2917276720351395e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6890716552734375, + "num_tokens": 60822016.0, + "step": 2346 + }, + { + "epoch": 0.2577421480342631, + "grad_norm": 2.1933982372283936, + "learning_rate": 4.2935578330893125e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6986793875694275, + "num_tokens": 60843760.0, + "step": 2347 + }, + { + "epoch": 0.2578519657368768, + "grad_norm": 2.114889144897461, + "learning_rate": 4.295387994143485e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7087690830230713, + "num_tokens": 60866510.0, + "step": 2348 + }, + { + "epoch": 0.25796178343949044, + "grad_norm": 2.0731892585754395, + "learning_rate": 4.297218155197658e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.711483359336853, + "num_tokens": 60889862.0, + "step": 2349 + }, + { + "epoch": 0.2580716011421041, + "grad_norm": 1.9465018510818481, + "learning_rate": 4.299048316251831e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7008166909217834, + "num_tokens": 60918590.0, + "step": 2350 + }, + { + "epoch": 0.2581814188447178, + "grad_norm": 1.7921497821807861, + "learning_rate": 4.300878477306003e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7031756043434143, + "num_tokens": 60952454.0, + "step": 2351 + }, + { + "epoch": 0.25829123654733144, + "grad_norm": 2.114154577255249, + "learning_rate": 4.302708638360176e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7076402902603149, + "num_tokens": 60977521.0, + "step": 2352 + }, + { + "epoch": 0.2584010542499451, + "grad_norm": 2.4137513637542725, + "learning_rate": 4.304538799414349e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6945133805274963, + "num_tokens": 60999757.0, + "step": 2353 + }, + { + "epoch": 0.25851087195255873, + "grad_norm": 1.9992717504501343, + "learning_rate": 4.306368960468522e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6991364359855652, + "num_tokens": 61025535.0, + "step": 2354 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 1.9244478940963745, + "learning_rate": 4.308199121522694e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.691982626914978, + "num_tokens": 61057199.0, + "step": 2355 + }, + { + "epoch": 0.2587305073577861, + "grad_norm": 2.253554582595825, + "learning_rate": 4.310029282576867e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7043365240097046, + "num_tokens": 61079753.0, + "step": 2356 + }, + { + "epoch": 0.2588403250603997, + "grad_norm": 2.447730541229248, + "learning_rate": 4.31185944363104e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7119008898735046, + "num_tokens": 61100190.0, + "step": 2357 + }, + { + "epoch": 0.2589501427630134, + "grad_norm": 2.4901154041290283, + "learning_rate": 4.313689604685213e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7003103494644165, + "num_tokens": 61119310.0, + "step": 2358 + }, + { + "epoch": 0.25905996046562707, + "grad_norm": 2.3497869968414307, + "learning_rate": 4.315519765739385e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7171807289123535, + "num_tokens": 61139974.0, + "step": 2359 + }, + { + "epoch": 0.2591697781682407, + "grad_norm": 2.163853883743286, + "learning_rate": 4.317349926793558e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6961051225662231, + "num_tokens": 61165738.0, + "step": 2360 + }, + { + "epoch": 0.25927959587085436, + "grad_norm": 2.074801206588745, + "learning_rate": 4.3191800878477304e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7098199129104614, + "num_tokens": 61191496.0, + "step": 2361 + }, + { + "epoch": 0.25938941357346806, + "grad_norm": 1.9317058324813843, + "learning_rate": 4.3210102489019034e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7173408269882202, + "num_tokens": 61219653.0, + "step": 2362 + }, + { + "epoch": 0.2594992312760817, + "grad_norm": 1.984092354774475, + "learning_rate": 4.3228404099560765e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6850926876068115, + "num_tokens": 61248989.0, + "step": 2363 + }, + { + "epoch": 0.25960904897869536, + "grad_norm": 2.0381815433502197, + "learning_rate": 4.3246705710102495e-06, + "loss": 1.034, + "mean_token_accuracy": 0.686854362487793, + "num_tokens": 61278608.0, + "step": 2364 + }, + { + "epoch": 0.259718866681309, + "grad_norm": 1.9754788875579834, + "learning_rate": 4.326500732064422e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.686011552810669, + "num_tokens": 61307139.0, + "step": 2365 + }, + { + "epoch": 0.2598286843839227, + "grad_norm": 2.2545552253723145, + "learning_rate": 4.328330893118595e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7067151665687561, + "num_tokens": 61330022.0, + "step": 2366 + }, + { + "epoch": 0.25993850208653635, + "grad_norm": 2.3558428287506104, + "learning_rate": 4.330161054172768e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6967617273330688, + "num_tokens": 61350480.0, + "step": 2367 + }, + { + "epoch": 0.26004831978915, + "grad_norm": 2.3018364906311035, + "learning_rate": 4.331991215226941e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6888870000839233, + "num_tokens": 61372998.0, + "step": 2368 + }, + { + "epoch": 0.2601581374917637, + "grad_norm": 2.241023302078247, + "learning_rate": 4.333821376281113e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.6874969601631165, + "num_tokens": 61396892.0, + "step": 2369 + }, + { + "epoch": 0.26026795519437734, + "grad_norm": 1.997959017753601, + "learning_rate": 4.335651537335286e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6817855834960938, + "num_tokens": 61425348.0, + "step": 2370 + }, + { + "epoch": 0.260377772896991, + "grad_norm": 2.0305802822113037, + "learning_rate": 4.337481698389459e-06, + "loss": 1.0677, + "mean_token_accuracy": 0.6857413649559021, + "num_tokens": 61455316.0, + "step": 2371 + }, + { + "epoch": 0.26048759059960463, + "grad_norm": 2.6193275451660156, + "learning_rate": 4.339311859443631e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7031186819076538, + "num_tokens": 61475786.0, + "step": 2372 + }, + { + "epoch": 0.26059740830221834, + "grad_norm": 2.231746196746826, + "learning_rate": 4.341142020497804e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6978198289871216, + "num_tokens": 61497523.0, + "step": 2373 + }, + { + "epoch": 0.260707226004832, + "grad_norm": 2.0618836879730225, + "learning_rate": 4.342972181551977e-06, + "loss": 1.1034, + "mean_token_accuracy": 0.6677507162094116, + "num_tokens": 61524390.0, + "step": 2374 + }, + { + "epoch": 0.26081704370744563, + "grad_norm": 2.280094623565674, + "learning_rate": 4.34480234260615e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.693571925163269, + "num_tokens": 61545834.0, + "step": 2375 + }, + { + "epoch": 0.2609268614100593, + "grad_norm": 2.0355045795440674, + "learning_rate": 4.346632503660322e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6939377784729004, + "num_tokens": 61571372.0, + "step": 2376 + }, + { + "epoch": 0.261036679112673, + "grad_norm": 2.0271308422088623, + "learning_rate": 4.348462664714495e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6954503059387207, + "num_tokens": 61598181.0, + "step": 2377 + }, + { + "epoch": 0.2611464968152866, + "grad_norm": 2.2731220722198486, + "learning_rate": 4.350292825768667e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7039927244186401, + "num_tokens": 61621608.0, + "step": 2378 + }, + { + "epoch": 0.26125631451790027, + "grad_norm": 2.3218116760253906, + "learning_rate": 4.352122986822841e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7105076313018799, + "num_tokens": 61644684.0, + "step": 2379 + }, + { + "epoch": 0.26136613222051397, + "grad_norm": 2.3476059436798096, + "learning_rate": 4.353953147877013e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7170002460479736, + "num_tokens": 61665778.0, + "step": 2380 + }, + { + "epoch": 0.2614759499231276, + "grad_norm": 2.0299201011657715, + "learning_rate": 4.355783308931186e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6920366287231445, + "num_tokens": 61693569.0, + "step": 2381 + }, + { + "epoch": 0.26158576762574126, + "grad_norm": 2.2104318141937256, + "learning_rate": 4.3576134699853586e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6905559301376343, + "num_tokens": 61719180.0, + "step": 2382 + }, + { + "epoch": 0.2616955853283549, + "grad_norm": 1.984513282775879, + "learning_rate": 4.3594436310395316e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7094560265541077, + "num_tokens": 61745538.0, + "step": 2383 + }, + { + "epoch": 0.2618054030309686, + "grad_norm": 1.6803512573242188, + "learning_rate": 4.361273792093705e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6772675514221191, + "num_tokens": 61781320.0, + "step": 2384 + }, + { + "epoch": 0.26191522073358225, + "grad_norm": 2.010737180709839, + "learning_rate": 4.363103953147878e-06, + "loss": 1.004, + "mean_token_accuracy": 0.695859432220459, + "num_tokens": 61809863.0, + "step": 2385 + }, + { + "epoch": 0.2620250384361959, + "grad_norm": 1.9783345460891724, + "learning_rate": 4.36493411420205e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6912809014320374, + "num_tokens": 61837201.0, + "step": 2386 + }, + { + "epoch": 0.2621348561388096, + "grad_norm": 1.891943335533142, + "learning_rate": 4.366764275256223e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6901829242706299, + "num_tokens": 61865872.0, + "step": 2387 + }, + { + "epoch": 0.26224467384142325, + "grad_norm": 2.3830575942993164, + "learning_rate": 4.368594436310396e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6980398297309875, + "num_tokens": 61888455.0, + "step": 2388 + }, + { + "epoch": 0.2623544915440369, + "grad_norm": 2.1559715270996094, + "learning_rate": 4.370424597364569e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7041763067245483, + "num_tokens": 61915021.0, + "step": 2389 + }, + { + "epoch": 0.26246430924665054, + "grad_norm": 2.274522542953491, + "learning_rate": 4.372254758418741e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6835631728172302, + "num_tokens": 61937514.0, + "step": 2390 + }, + { + "epoch": 0.26257412694926424, + "grad_norm": 1.8387731313705444, + "learning_rate": 4.374084919472914e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7050626277923584, + "num_tokens": 61968926.0, + "step": 2391 + }, + { + "epoch": 0.2626839446518779, + "grad_norm": 2.0613715648651123, + "learning_rate": 4.375915080527087e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6851986050605774, + "num_tokens": 61995043.0, + "step": 2392 + }, + { + "epoch": 0.26279376235449153, + "grad_norm": 1.8265780210494995, + "learning_rate": 4.377745241581259e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6872812509536743, + "num_tokens": 62028629.0, + "step": 2393 + }, + { + "epoch": 0.2629035800571052, + "grad_norm": 2.0499565601348877, + "learning_rate": 4.379575402635432e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6996210217475891, + "num_tokens": 62052907.0, + "step": 2394 + }, + { + "epoch": 0.2630133977597189, + "grad_norm": 1.973528504371643, + "learning_rate": 4.381405563689605e-06, + "loss": 1.1401, + "mean_token_accuracy": 0.6666717529296875, + "num_tokens": 62082628.0, + "step": 2395 + }, + { + "epoch": 0.2631232154623325, + "grad_norm": 2.057708263397217, + "learning_rate": 4.383235724743778e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6936626434326172, + "num_tokens": 62106670.0, + "step": 2396 + }, + { + "epoch": 0.2632330331649462, + "grad_norm": 2.2060651779174805, + "learning_rate": 4.38506588579795e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7239157557487488, + "num_tokens": 62130412.0, + "step": 2397 + }, + { + "epoch": 0.2633428508675599, + "grad_norm": 2.2579410076141357, + "learning_rate": 4.386896046852123e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7058799266815186, + "num_tokens": 62152342.0, + "step": 2398 + }, + { + "epoch": 0.2634526685701735, + "grad_norm": 2.1440277099609375, + "learning_rate": 4.3887262079062955e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6906789541244507, + "num_tokens": 62177622.0, + "step": 2399 + }, + { + "epoch": 0.26356248627278717, + "grad_norm": 2.2474544048309326, + "learning_rate": 4.390556368960469e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.718641996383667, + "num_tokens": 62199535.0, + "step": 2400 + }, + { + "epoch": 0.2636723039754008, + "grad_norm": 2.222177743911743, + "learning_rate": 4.3923865300146415e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7097842693328857, + "num_tokens": 62223148.0, + "step": 2401 + }, + { + "epoch": 0.2637821216780145, + "grad_norm": 2.1812050342559814, + "learning_rate": 4.3942166910688145e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6954712271690369, + "num_tokens": 62251017.0, + "step": 2402 + }, + { + "epoch": 0.26389193938062816, + "grad_norm": 2.386855363845825, + "learning_rate": 4.396046852122987e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6895512342453003, + "num_tokens": 62273163.0, + "step": 2403 + }, + { + "epoch": 0.2640017570832418, + "grad_norm": 1.979761004447937, + "learning_rate": 4.39787701317716e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6925981044769287, + "num_tokens": 62305159.0, + "step": 2404 + }, + { + "epoch": 0.2641115747858555, + "grad_norm": 2.0182571411132812, + "learning_rate": 4.399707174231333e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6934648156166077, + "num_tokens": 62331393.0, + "step": 2405 + }, + { + "epoch": 0.26422139248846915, + "grad_norm": 2.4965155124664307, + "learning_rate": 4.401537335285506e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7068223357200623, + "num_tokens": 62350906.0, + "step": 2406 + }, + { + "epoch": 0.2643312101910828, + "grad_norm": 2.0888824462890625, + "learning_rate": 4.403367496339678e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6801801323890686, + "num_tokens": 62376191.0, + "step": 2407 + }, + { + "epoch": 0.26444102789369645, + "grad_norm": 1.9338972568511963, + "learning_rate": 4.405197657393851e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7063876986503601, + "num_tokens": 62404735.0, + "step": 2408 + }, + { + "epoch": 0.26455084559631015, + "grad_norm": 1.9037314653396606, + "learning_rate": 4.407027818448024e-06, + "loss": 1.063, + "mean_token_accuracy": 0.679568886756897, + "num_tokens": 62439815.0, + "step": 2409 + }, + { + "epoch": 0.2646606632989238, + "grad_norm": 1.9703127145767212, + "learning_rate": 4.408857979502196e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6999363303184509, + "num_tokens": 62470426.0, + "step": 2410 + }, + { + "epoch": 0.26477048100153744, + "grad_norm": 2.051107168197632, + "learning_rate": 4.41068814055637e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6843231916427612, + "num_tokens": 62497446.0, + "step": 2411 + }, + { + "epoch": 0.2648802987041511, + "grad_norm": 2.1516613960266113, + "learning_rate": 4.412518301610542e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7048792839050293, + "num_tokens": 62523314.0, + "step": 2412 + }, + { + "epoch": 0.2649901164067648, + "grad_norm": 2.0040857791900635, + "learning_rate": 4.414348462664715e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7197313904762268, + "num_tokens": 62550116.0, + "step": 2413 + }, + { + "epoch": 0.26509993410937843, + "grad_norm": 2.1842446327209473, + "learning_rate": 4.416178623718887e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.695734441280365, + "num_tokens": 62575382.0, + "step": 2414 + }, + { + "epoch": 0.2652097518119921, + "grad_norm": 2.0731804370880127, + "learning_rate": 4.41800878477306e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6894079446792603, + "num_tokens": 62601971.0, + "step": 2415 + }, + { + "epoch": 0.2653195695146058, + "grad_norm": 1.862960934638977, + "learning_rate": 4.419838945827233e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6920018196105957, + "num_tokens": 62632976.0, + "step": 2416 + }, + { + "epoch": 0.2654293872172194, + "grad_norm": 2.9174540042877197, + "learning_rate": 4.421669106881406e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7064081430435181, + "num_tokens": 62651573.0, + "step": 2417 + }, + { + "epoch": 0.26553920491983307, + "grad_norm": 2.2908241748809814, + "learning_rate": 4.4234992679355785e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7086131572723389, + "num_tokens": 62673525.0, + "step": 2418 + }, + { + "epoch": 0.2656490226224467, + "grad_norm": 2.261732339859009, + "learning_rate": 4.4253294289897515e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6924316883087158, + "num_tokens": 62695475.0, + "step": 2419 + }, + { + "epoch": 0.2657588403250604, + "grad_norm": 1.8703690767288208, + "learning_rate": 4.427159590043924e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7092763185501099, + "num_tokens": 62725520.0, + "step": 2420 + }, + { + "epoch": 0.26586865802767407, + "grad_norm": 2.450408935546875, + "learning_rate": 4.4289897510980975e-06, + "loss": 0.937, + "mean_token_accuracy": 0.710754930973053, + "num_tokens": 62745594.0, + "step": 2421 + }, + { + "epoch": 0.2659784757302877, + "grad_norm": 2.0922417640686035, + "learning_rate": 4.43081991215227e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6987792253494263, + "num_tokens": 62771892.0, + "step": 2422 + }, + { + "epoch": 0.26608829343290136, + "grad_norm": 2.1543033123016357, + "learning_rate": 4.432650073206443e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7074695825576782, + "num_tokens": 62797078.0, + "step": 2423 + }, + { + "epoch": 0.26619811113551506, + "grad_norm": 2.000946044921875, + "learning_rate": 4.434480234260616e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6859947443008423, + "num_tokens": 62829404.0, + "step": 2424 + }, + { + "epoch": 0.2663079288381287, + "grad_norm": 2.2320728302001953, + "learning_rate": 4.436310395314788e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7200648784637451, + "num_tokens": 62854126.0, + "step": 2425 + }, + { + "epoch": 0.26641774654074235, + "grad_norm": 2.3231801986694336, + "learning_rate": 4.438140556368961e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7126001119613647, + "num_tokens": 62880448.0, + "step": 2426 + }, + { + "epoch": 0.26652756424335605, + "grad_norm": 2.1074321269989014, + "learning_rate": 4.439970717423134e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7026818990707397, + "num_tokens": 62906302.0, + "step": 2427 + }, + { + "epoch": 0.2666373819459697, + "grad_norm": 1.9687398672103882, + "learning_rate": 4.441800878477307e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7244165539741516, + "num_tokens": 62930818.0, + "step": 2428 + }, + { + "epoch": 0.26674719964858334, + "grad_norm": 1.920535922050476, + "learning_rate": 4.443631039531479e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.685354471206665, + "num_tokens": 62961798.0, + "step": 2429 + }, + { + "epoch": 0.266857017351197, + "grad_norm": 2.037299156188965, + "learning_rate": 4.445461200585652e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6979146003723145, + "num_tokens": 62988951.0, + "step": 2430 + }, + { + "epoch": 0.2669668350538107, + "grad_norm": 2.2023673057556152, + "learning_rate": 4.447291361639824e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6948037147521973, + "num_tokens": 63014532.0, + "step": 2431 + }, + { + "epoch": 0.26707665275642434, + "grad_norm": 2.1599929332733154, + "learning_rate": 4.449121522693998e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6873373985290527, + "num_tokens": 63043041.0, + "step": 2432 + }, + { + "epoch": 0.267186470459038, + "grad_norm": 2.0033693313598633, + "learning_rate": 4.45095168374817e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6776832342147827, + "num_tokens": 63073830.0, + "step": 2433 + }, + { + "epoch": 0.2672962881616517, + "grad_norm": 2.221425771713257, + "learning_rate": 4.452781844802343e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7063202261924744, + "num_tokens": 63096423.0, + "step": 2434 + }, + { + "epoch": 0.26740610586426533, + "grad_norm": 2.0615406036376953, + "learning_rate": 4.454612005856515e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6962604522705078, + "num_tokens": 63123494.0, + "step": 2435 + }, + { + "epoch": 0.267515923566879, + "grad_norm": 2.161560535430908, + "learning_rate": 4.456442166910688e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7160660028457642, + "num_tokens": 63148542.0, + "step": 2436 + }, + { + "epoch": 0.2676257412694926, + "grad_norm": 2.007892608642578, + "learning_rate": 4.458272327964861e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6750773191452026, + "num_tokens": 63177439.0, + "step": 2437 + }, + { + "epoch": 0.2677355589721063, + "grad_norm": 1.8655644655227661, + "learning_rate": 4.460102489019034e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7039718627929688, + "num_tokens": 63206798.0, + "step": 2438 + }, + { + "epoch": 0.26784537667471997, + "grad_norm": 2.1659560203552246, + "learning_rate": 4.461932650073207e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7054250240325928, + "num_tokens": 63229714.0, + "step": 2439 + }, + { + "epoch": 0.2679551943773336, + "grad_norm": 2.0719149112701416, + "learning_rate": 4.46376281112738e-06, + "loss": 1.021, + "mean_token_accuracy": 0.684267520904541, + "num_tokens": 63257462.0, + "step": 2440 + }, + { + "epoch": 0.26806501207994726, + "grad_norm": 2.379384994506836, + "learning_rate": 4.465592972181553e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7038697004318237, + "num_tokens": 63279123.0, + "step": 2441 + }, + { + "epoch": 0.26817482978256096, + "grad_norm": 2.1609408855438232, + "learning_rate": 4.467423133235725e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6872495412826538, + "num_tokens": 63305141.0, + "step": 2442 + }, + { + "epoch": 0.2682846474851746, + "grad_norm": 2.250209093093872, + "learning_rate": 4.469253294289898e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.710843026638031, + "num_tokens": 63326908.0, + "step": 2443 + }, + { + "epoch": 0.26839446518778826, + "grad_norm": 2.396803140640259, + "learning_rate": 4.471083455344071e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6855758428573608, + "num_tokens": 63350176.0, + "step": 2444 + }, + { + "epoch": 0.26850428289040196, + "grad_norm": 2.1368255615234375, + "learning_rate": 4.472913616398244e-06, + "loss": 1.0848, + "mean_token_accuracy": 0.6700749397277832, + "num_tokens": 63378141.0, + "step": 2445 + }, + { + "epoch": 0.2686141005930156, + "grad_norm": 2.153733253479004, + "learning_rate": 4.474743777452416e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7061638832092285, + "num_tokens": 63400751.0, + "step": 2446 + }, + { + "epoch": 0.26872391829562925, + "grad_norm": 2.0823919773101807, + "learning_rate": 4.476573938506589e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.690619707107544, + "num_tokens": 63428797.0, + "step": 2447 + }, + { + "epoch": 0.2688337359982429, + "grad_norm": 1.9439853429794312, + "learning_rate": 4.478404099560762e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7440657615661621, + "num_tokens": 63453330.0, + "step": 2448 + }, + { + "epoch": 0.2689435537008566, + "grad_norm": 2.266913652420044, + "learning_rate": 4.480234260614935e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6777344942092896, + "num_tokens": 63478687.0, + "step": 2449 + }, + { + "epoch": 0.26905337140347024, + "grad_norm": 2.192305564880371, + "learning_rate": 4.482064421669107e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.695319414138794, + "num_tokens": 63503908.0, + "step": 2450 + }, + { + "epoch": 0.2691631891060839, + "grad_norm": 2.0284013748168945, + "learning_rate": 4.48389458272328e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6974003911018372, + "num_tokens": 63529297.0, + "step": 2451 + }, + { + "epoch": 0.26927300680869753, + "grad_norm": 2.0308289527893066, + "learning_rate": 4.485724743777452e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.686566948890686, + "num_tokens": 63558287.0, + "step": 2452 + }, + { + "epoch": 0.26938282451131124, + "grad_norm": 2.3991971015930176, + "learning_rate": 4.487554904831626e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6850779056549072, + "num_tokens": 63584034.0, + "step": 2453 + }, + { + "epoch": 0.2694926422139249, + "grad_norm": 1.8584182262420654, + "learning_rate": 4.489385065885798e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6911495327949524, + "num_tokens": 63615667.0, + "step": 2454 + }, + { + "epoch": 0.26960245991653853, + "grad_norm": 1.7877006530761719, + "learning_rate": 4.491215226939971e-06, + "loss": 1.0818, + "mean_token_accuracy": 0.6769566535949707, + "num_tokens": 63651006.0, + "step": 2455 + }, + { + "epoch": 0.26971227761915223, + "grad_norm": 2.242936134338379, + "learning_rate": 4.4930453879941435e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6960058212280273, + "num_tokens": 63674574.0, + "step": 2456 + }, + { + "epoch": 0.2698220953217659, + "grad_norm": 2.462722063064575, + "learning_rate": 4.4948755490483165e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6976680755615234, + "num_tokens": 63693544.0, + "step": 2457 + }, + { + "epoch": 0.2699319130243795, + "grad_norm": 2.5352699756622314, + "learning_rate": 4.4967057101024895e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.701644778251648, + "num_tokens": 63710259.0, + "step": 2458 + }, + { + "epoch": 0.27004173072699317, + "grad_norm": 2.0789601802825928, + "learning_rate": 4.4985358711566626e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6851025819778442, + "num_tokens": 63737529.0, + "step": 2459 + }, + { + "epoch": 0.27015154842960687, + "grad_norm": 2.113112688064575, + "learning_rate": 4.500366032210835e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6909881830215454, + "num_tokens": 63764001.0, + "step": 2460 + }, + { + "epoch": 0.2702613661322205, + "grad_norm": 2.0545971393585205, + "learning_rate": 4.502196193265008e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7137178778648376, + "num_tokens": 63788901.0, + "step": 2461 + }, + { + "epoch": 0.27037118383483416, + "grad_norm": 2.1358842849731445, + "learning_rate": 4.504026354319181e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6927952766418457, + "num_tokens": 63814461.0, + "step": 2462 + }, + { + "epoch": 0.27048100153744786, + "grad_norm": 2.26668381690979, + "learning_rate": 4.505856515373353e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7071847915649414, + "num_tokens": 63837508.0, + "step": 2463 + }, + { + "epoch": 0.2705908192400615, + "grad_norm": 2.1727042198181152, + "learning_rate": 4.507686676427526e-06, + "loss": 1.073, + "mean_token_accuracy": 0.6838399171829224, + "num_tokens": 63864814.0, + "step": 2464 + }, + { + "epoch": 0.27070063694267515, + "grad_norm": 2.1018545627593994, + "learning_rate": 4.509516837481699e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6958774328231812, + "num_tokens": 63890786.0, + "step": 2465 + }, + { + "epoch": 0.2708104546452888, + "grad_norm": 2.0611987113952637, + "learning_rate": 4.511346998535872e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7052291631698608, + "num_tokens": 63915264.0, + "step": 2466 + }, + { + "epoch": 0.2709202723479025, + "grad_norm": 2.2572169303894043, + "learning_rate": 4.513177159590044e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7099853754043579, + "num_tokens": 63939382.0, + "step": 2467 + }, + { + "epoch": 0.27103009005051615, + "grad_norm": 2.299274206161499, + "learning_rate": 4.515007320644217e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7131972312927246, + "num_tokens": 63960538.0, + "step": 2468 + }, + { + "epoch": 0.2711399077531298, + "grad_norm": 1.925458312034607, + "learning_rate": 4.51683748169839e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7123374938964844, + "num_tokens": 63987783.0, + "step": 2469 + }, + { + "epoch": 0.27124972545574344, + "grad_norm": 2.1487553119659424, + "learning_rate": 4.518667642752563e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7015852928161621, + "num_tokens": 64016202.0, + "step": 2470 + }, + { + "epoch": 0.27135954315835714, + "grad_norm": 2.0856354236602783, + "learning_rate": 4.520497803806735e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.6982991695404053, + "num_tokens": 64041562.0, + "step": 2471 + }, + { + "epoch": 0.2714693608609708, + "grad_norm": 2.0233356952667236, + "learning_rate": 4.522327964860908e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6943461298942566, + "num_tokens": 64071198.0, + "step": 2472 + }, + { + "epoch": 0.27157917856358443, + "grad_norm": 1.9049665927886963, + "learning_rate": 4.5241581259150805e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7118100523948669, + "num_tokens": 64100689.0, + "step": 2473 + }, + { + "epoch": 0.27168899626619814, + "grad_norm": 2.011183977127075, + "learning_rate": 4.5259882869692535e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.674505352973938, + "num_tokens": 64128584.0, + "step": 2474 + }, + { + "epoch": 0.2717988139688118, + "grad_norm": 2.0015571117401123, + "learning_rate": 4.5278184480234265e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6988822221755981, + "num_tokens": 64155202.0, + "step": 2475 + }, + { + "epoch": 0.2719086316714254, + "grad_norm": 2.1764111518859863, + "learning_rate": 4.5296486090775995e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6837883591651917, + "num_tokens": 64181760.0, + "step": 2476 + }, + { + "epoch": 0.2720184493740391, + "grad_norm": 2.0545918941497803, + "learning_rate": 4.531478770131772e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6926778554916382, + "num_tokens": 64208042.0, + "step": 2477 + }, + { + "epoch": 0.2721282670766528, + "grad_norm": 1.9005001783370972, + "learning_rate": 4.533308931185945e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6909327507019043, + "num_tokens": 64236674.0, + "step": 2478 + }, + { + "epoch": 0.2722380847792664, + "grad_norm": 1.9597591161727905, + "learning_rate": 4.535139092240118e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7102184295654297, + "num_tokens": 64262096.0, + "step": 2479 + }, + { + "epoch": 0.27234790248188007, + "grad_norm": 2.3987607955932617, + "learning_rate": 4.536969253294291e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7094370126724243, + "num_tokens": 64280965.0, + "step": 2480 + }, + { + "epoch": 0.27245772018449377, + "grad_norm": 2.2707481384277344, + "learning_rate": 4.538799414348463e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7179946899414062, + "num_tokens": 64302818.0, + "step": 2481 + }, + { + "epoch": 0.2725675378871074, + "grad_norm": 2.4350314140319824, + "learning_rate": 4.540629575402636e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7096526622772217, + "num_tokens": 64323632.0, + "step": 2482 + }, + { + "epoch": 0.27267735558972106, + "grad_norm": 2.060011386871338, + "learning_rate": 4.542459736456809e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7061327695846558, + "num_tokens": 64348944.0, + "step": 2483 + }, + { + "epoch": 0.2727871732923347, + "grad_norm": 2.120103359222412, + "learning_rate": 4.544289897510981e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6824682354927063, + "num_tokens": 64376055.0, + "step": 2484 + }, + { + "epoch": 0.2728969909949484, + "grad_norm": 2.399139165878296, + "learning_rate": 4.546120058565154e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6949009299278259, + "num_tokens": 64398760.0, + "step": 2485 + }, + { + "epoch": 0.27300680869756205, + "grad_norm": 2.078274965286255, + "learning_rate": 4.547950219619327e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7059495449066162, + "num_tokens": 64422800.0, + "step": 2486 + }, + { + "epoch": 0.2731166264001757, + "grad_norm": 2.405219793319702, + "learning_rate": 4.5497803806735e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7053537964820862, + "num_tokens": 64442954.0, + "step": 2487 + }, + { + "epoch": 0.27322644410278935, + "grad_norm": 2.1885428428649902, + "learning_rate": 4.551610541727672e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7004684209823608, + "num_tokens": 64467050.0, + "step": 2488 + }, + { + "epoch": 0.27333626180540305, + "grad_norm": 2.322455883026123, + "learning_rate": 4.553440702781845e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7070469856262207, + "num_tokens": 64489187.0, + "step": 2489 + }, + { + "epoch": 0.2734460795080167, + "grad_norm": 2.309199571609497, + "learning_rate": 4.555270863836017e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.684624433517456, + "num_tokens": 64512956.0, + "step": 2490 + }, + { + "epoch": 0.27355589721063034, + "grad_norm": 2.3451406955718994, + "learning_rate": 4.557101024890191e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6904850602149963, + "num_tokens": 64533344.0, + "step": 2491 + }, + { + "epoch": 0.27366571491324404, + "grad_norm": 2.0298402309417725, + "learning_rate": 4.558931185944363e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6871875524520874, + "num_tokens": 64559786.0, + "step": 2492 + }, + { + "epoch": 0.2737755326158577, + "grad_norm": 2.2813351154327393, + "learning_rate": 4.5607613469985364e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6989480257034302, + "num_tokens": 64586090.0, + "step": 2493 + }, + { + "epoch": 0.27388535031847133, + "grad_norm": 1.840522289276123, + "learning_rate": 4.562591508052709e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6918062567710876, + "num_tokens": 64618419.0, + "step": 2494 + }, + { + "epoch": 0.273995168021085, + "grad_norm": 2.004307985305786, + "learning_rate": 4.564421669106882e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6928608417510986, + "num_tokens": 64647899.0, + "step": 2495 + }, + { + "epoch": 0.2741049857236987, + "grad_norm": 2.194002389907837, + "learning_rate": 4.566251830161055e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.7044205069541931, + "num_tokens": 64672526.0, + "step": 2496 + }, + { + "epoch": 0.2742148034263123, + "grad_norm": 2.2418556213378906, + "learning_rate": 4.568081991215228e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7155624628067017, + "num_tokens": 64695441.0, + "step": 2497 + }, + { + "epoch": 0.27432462112892597, + "grad_norm": 1.8138113021850586, + "learning_rate": 4.5699121522694e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7380223274230957, + "num_tokens": 64725214.0, + "step": 2498 + }, + { + "epoch": 0.2744344388315396, + "grad_norm": 1.8775968551635742, + "learning_rate": 4.571742313323573e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7049331665039062, + "num_tokens": 64753200.0, + "step": 2499 + }, + { + "epoch": 0.2745442565341533, + "grad_norm": 1.890720248222351, + "learning_rate": 4.573572474377746e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7081353664398193, + "num_tokens": 64783099.0, + "step": 2500 + }, + { + "epoch": 0.27465407423676697, + "grad_norm": 2.2274608612060547, + "learning_rate": 4.575402635431919e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7187472581863403, + "num_tokens": 64806208.0, + "step": 2501 + }, + { + "epoch": 0.2747638919393806, + "grad_norm": 2.106058359146118, + "learning_rate": 4.577232796486091e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.708258867263794, + "num_tokens": 64831326.0, + "step": 2502 + }, + { + "epoch": 0.2748737096419943, + "grad_norm": 2.283313274383545, + "learning_rate": 4.579062957540264e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7185039520263672, + "num_tokens": 64851291.0, + "step": 2503 + }, + { + "epoch": 0.27498352734460796, + "grad_norm": 2.3888909816741943, + "learning_rate": 4.580893118594437e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.679443895816803, + "num_tokens": 64872886.0, + "step": 2504 + }, + { + "epoch": 0.2750933450472216, + "grad_norm": 2.022505044937134, + "learning_rate": 4.582723279648609e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7065584063529968, + "num_tokens": 64899427.0, + "step": 2505 + }, + { + "epoch": 0.27520316274983525, + "grad_norm": 2.116412878036499, + "learning_rate": 4.584553440702782e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7006794214248657, + "num_tokens": 64924024.0, + "step": 2506 + }, + { + "epoch": 0.27531298045244895, + "grad_norm": 2.1801764965057373, + "learning_rate": 4.586383601756955e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.715414822101593, + "num_tokens": 64946993.0, + "step": 2507 + }, + { + "epoch": 0.2754227981550626, + "grad_norm": 2.346209764480591, + "learning_rate": 4.588213762811128e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6958814859390259, + "num_tokens": 64969292.0, + "step": 2508 + }, + { + "epoch": 0.27553261585767624, + "grad_norm": 2.1568541526794434, + "learning_rate": 4.5900439238653e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.684537410736084, + "num_tokens": 64993326.0, + "step": 2509 + }, + { + "epoch": 0.27564243356028995, + "grad_norm": 2.325993776321411, + "learning_rate": 4.591874084919473e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6830116510391235, + "num_tokens": 65018750.0, + "step": 2510 + }, + { + "epoch": 0.2757522512629036, + "grad_norm": 2.109477996826172, + "learning_rate": 4.5937042459736455e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6995616555213928, + "num_tokens": 65046175.0, + "step": 2511 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 1.9176032543182373, + "learning_rate": 4.595534407027819e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6860017776489258, + "num_tokens": 65077313.0, + "step": 2512 + }, + { + "epoch": 0.2759718866681309, + "grad_norm": 2.119011878967285, + "learning_rate": 4.5973645680819915e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7142341136932373, + "num_tokens": 65104059.0, + "step": 2513 + }, + { + "epoch": 0.2760817043707446, + "grad_norm": 2.2983920574188232, + "learning_rate": 4.5991947291361646e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6997590661048889, + "num_tokens": 65127031.0, + "step": 2514 + }, + { + "epoch": 0.27619152207335823, + "grad_norm": 2.110337734222412, + "learning_rate": 4.601024890190337e-06, + "loss": 1.1145, + "mean_token_accuracy": 0.6716879606246948, + "num_tokens": 65156444.0, + "step": 2515 + }, + { + "epoch": 0.2763013397759719, + "grad_norm": 1.8991196155548096, + "learning_rate": 4.60285505124451e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.706794798374176, + "num_tokens": 65185438.0, + "step": 2516 + }, + { + "epoch": 0.2764111574785855, + "grad_norm": 2.0995774269104004, + "learning_rate": 4.604685212298683e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7166801691055298, + "num_tokens": 65211880.0, + "step": 2517 + }, + { + "epoch": 0.2765209751811992, + "grad_norm": 2.1173956394195557, + "learning_rate": 4.606515373352856e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7010728716850281, + "num_tokens": 65235949.0, + "step": 2518 + }, + { + "epoch": 0.27663079288381287, + "grad_norm": 2.316687822341919, + "learning_rate": 4.608345534407028e-06, + "loss": 1.0987, + "mean_token_accuracy": 0.6817036867141724, + "num_tokens": 65259536.0, + "step": 2519 + }, + { + "epoch": 0.2767406105864265, + "grad_norm": 1.9704972505569458, + "learning_rate": 4.610175695461201e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6867773532867432, + "num_tokens": 65289166.0, + "step": 2520 + }, + { + "epoch": 0.2768504282890402, + "grad_norm": 1.9613537788391113, + "learning_rate": 4.612005856515374e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7166305780410767, + "num_tokens": 65316158.0, + "step": 2521 + }, + { + "epoch": 0.27696024599165386, + "grad_norm": 2.1313929557800293, + "learning_rate": 4.613836017569546e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6926034688949585, + "num_tokens": 65340542.0, + "step": 2522 + }, + { + "epoch": 0.2770700636942675, + "grad_norm": 1.9230732917785645, + "learning_rate": 4.615666178623719e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7097132205963135, + "num_tokens": 65366850.0, + "step": 2523 + }, + { + "epoch": 0.27717988139688116, + "grad_norm": 2.1400058269500732, + "learning_rate": 4.617496339677892e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6910035610198975, + "num_tokens": 65390729.0, + "step": 2524 + }, + { + "epoch": 0.27728969909949486, + "grad_norm": 2.1250357627868652, + "learning_rate": 4.619326500732065e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7040133476257324, + "num_tokens": 65413448.0, + "step": 2525 + }, + { + "epoch": 0.2773995168021085, + "grad_norm": 2.037353277206421, + "learning_rate": 4.621156661786237e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6829133033752441, + "num_tokens": 65440873.0, + "step": 2526 + }, + { + "epoch": 0.27750933450472215, + "grad_norm": 1.9918925762176514, + "learning_rate": 4.62298682284041e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7068382501602173, + "num_tokens": 65466649.0, + "step": 2527 + }, + { + "epoch": 0.2776191522073358, + "grad_norm": 2.2787630558013916, + "learning_rate": 4.624816983894583e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7034802436828613, + "num_tokens": 65487692.0, + "step": 2528 + }, + { + "epoch": 0.2777289699099495, + "grad_norm": 2.093393564224243, + "learning_rate": 4.626647144948756e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7204922437667847, + "num_tokens": 65512012.0, + "step": 2529 + }, + { + "epoch": 0.27783878761256314, + "grad_norm": 2.143822193145752, + "learning_rate": 4.6284773060029285e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.6764899492263794, + "num_tokens": 65538844.0, + "step": 2530 + }, + { + "epoch": 0.2779486053151768, + "grad_norm": 1.8705652952194214, + "learning_rate": 4.6303074670571015e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6915006637573242, + "num_tokens": 65571582.0, + "step": 2531 + }, + { + "epoch": 0.2780584230177905, + "grad_norm": 2.3601255416870117, + "learning_rate": 4.632137628111274e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6924954056739807, + "num_tokens": 65591637.0, + "step": 2532 + }, + { + "epoch": 0.27816824072040414, + "grad_norm": 1.9997522830963135, + "learning_rate": 4.6339677891654475e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6842719316482544, + "num_tokens": 65623951.0, + "step": 2533 + }, + { + "epoch": 0.2782780584230178, + "grad_norm": 2.3007895946502686, + "learning_rate": 4.63579795021962e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7159653902053833, + "num_tokens": 65647033.0, + "step": 2534 + }, + { + "epoch": 0.27838787612563143, + "grad_norm": 2.2222492694854736, + "learning_rate": 4.637628111273793e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.717376708984375, + "num_tokens": 65669615.0, + "step": 2535 + }, + { + "epoch": 0.27849769382824513, + "grad_norm": 2.5059945583343506, + "learning_rate": 4.639458272327965e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6955761909484863, + "num_tokens": 65689315.0, + "step": 2536 + }, + { + "epoch": 0.2786075115308588, + "grad_norm": 2.1013174057006836, + "learning_rate": 4.641288433382138e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7002711296081543, + "num_tokens": 65715589.0, + "step": 2537 + }, + { + "epoch": 0.2787173292334724, + "grad_norm": 1.897029995918274, + "learning_rate": 4.643118594436311e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6965913772583008, + "num_tokens": 65745920.0, + "step": 2538 + }, + { + "epoch": 0.2788271469360861, + "grad_norm": 2.046941041946411, + "learning_rate": 4.644948755490484e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.702103316783905, + "num_tokens": 65771497.0, + "step": 2539 + }, + { + "epoch": 0.27893696463869977, + "grad_norm": 2.1897318363189697, + "learning_rate": 4.646778916544656e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7115197777748108, + "num_tokens": 65795621.0, + "step": 2540 + }, + { + "epoch": 0.2790467823413134, + "grad_norm": 2.32732892036438, + "learning_rate": 4.648609077598829e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.6990722417831421, + "num_tokens": 65819847.0, + "step": 2541 + }, + { + "epoch": 0.27915660004392706, + "grad_norm": 2.0492639541625977, + "learning_rate": 4.650439238653002e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6822269558906555, + "num_tokens": 65848988.0, + "step": 2542 + }, + { + "epoch": 0.27926641774654076, + "grad_norm": 2.0835516452789307, + "learning_rate": 4.652269399707174e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.704534113407135, + "num_tokens": 65875170.0, + "step": 2543 + }, + { + "epoch": 0.2793762354491544, + "grad_norm": 1.8508309125900269, + "learning_rate": 4.654099560761347e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7300553321838379, + "num_tokens": 65905410.0, + "step": 2544 + }, + { + "epoch": 0.27948605315176805, + "grad_norm": 1.9541809558868408, + "learning_rate": 4.65592972181552e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6882347464561462, + "num_tokens": 65932815.0, + "step": 2545 + }, + { + "epoch": 0.2795958708543817, + "grad_norm": 2.0801963806152344, + "learning_rate": 4.657759882869693e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6982381343841553, + "num_tokens": 65957804.0, + "step": 2546 + }, + { + "epoch": 0.2797056885569954, + "grad_norm": 2.23991060256958, + "learning_rate": 4.659590043923865e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7056970596313477, + "num_tokens": 65980164.0, + "step": 2547 + }, + { + "epoch": 0.27981550625960905, + "grad_norm": 2.0092601776123047, + "learning_rate": 4.6614202049780384e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6898197531700134, + "num_tokens": 66009495.0, + "step": 2548 + }, + { + "epoch": 0.2799253239622227, + "grad_norm": 2.152879238128662, + "learning_rate": 4.6632503660322114e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7114110589027405, + "num_tokens": 66031139.0, + "step": 2549 + }, + { + "epoch": 0.2800351416648364, + "grad_norm": 2.046149730682373, + "learning_rate": 4.6650805270863845e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6984061002731323, + "num_tokens": 66059934.0, + "step": 2550 + }, + { + "epoch": 0.28014495936745004, + "grad_norm": 2.3216123580932617, + "learning_rate": 4.666910688140557e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7012450695037842, + "num_tokens": 66080541.0, + "step": 2551 + }, + { + "epoch": 0.2802547770700637, + "grad_norm": 1.8380740880966187, + "learning_rate": 4.66874084919473e-06, + "loss": 1.1249, + "mean_token_accuracy": 0.6746821403503418, + "num_tokens": 66115673.0, + "step": 2552 + }, + { + "epoch": 0.28036459477267733, + "grad_norm": 2.101923942565918, + "learning_rate": 4.670571010248902e-06, + "loss": 1.1398, + "mean_token_accuracy": 0.6615222692489624, + "num_tokens": 66143948.0, + "step": 2553 + }, + { + "epoch": 0.28047441247529104, + "grad_norm": 2.102435350418091, + "learning_rate": 4.672401171303075e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7162708640098572, + "num_tokens": 66168850.0, + "step": 2554 + }, + { + "epoch": 0.2805842301779047, + "grad_norm": 2.4047138690948486, + "learning_rate": 4.674231332357248e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6876274347305298, + "num_tokens": 66190705.0, + "step": 2555 + }, + { + "epoch": 0.2806940478805183, + "grad_norm": 2.4056053161621094, + "learning_rate": 4.676061493411421e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6924906969070435, + "num_tokens": 66213190.0, + "step": 2556 + }, + { + "epoch": 0.28080386558313203, + "grad_norm": 2.270953416824341, + "learning_rate": 4.677891654465593e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.6653496623039246, + "num_tokens": 66241382.0, + "step": 2557 + }, + { + "epoch": 0.2809136832857457, + "grad_norm": 2.211677074432373, + "learning_rate": 4.679721815519766e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7118111848831177, + "num_tokens": 66263899.0, + "step": 2558 + }, + { + "epoch": 0.2810235009883593, + "grad_norm": 2.343200445175171, + "learning_rate": 4.681551976573939e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7031552791595459, + "num_tokens": 66286691.0, + "step": 2559 + }, + { + "epoch": 0.28113331869097297, + "grad_norm": 2.102945327758789, + "learning_rate": 4.683382137628112e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6856309175491333, + "num_tokens": 66310475.0, + "step": 2560 + }, + { + "epoch": 0.28124313639358667, + "grad_norm": 1.9467484951019287, + "learning_rate": 4.685212298682284e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7035559415817261, + "num_tokens": 66337392.0, + "step": 2561 + }, + { + "epoch": 0.2813529540962003, + "grad_norm": 2.1548194885253906, + "learning_rate": 4.687042459736457e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6918625831604004, + "num_tokens": 66364531.0, + "step": 2562 + }, + { + "epoch": 0.28146277179881396, + "grad_norm": 2.299121856689453, + "learning_rate": 4.68887262079063e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7004269361495972, + "num_tokens": 66384779.0, + "step": 2563 + }, + { + "epoch": 0.2815725895014276, + "grad_norm": 2.207817554473877, + "learning_rate": 4.690702781844802e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6870481371879578, + "num_tokens": 66409282.0, + "step": 2564 + }, + { + "epoch": 0.2816824072040413, + "grad_norm": 2.082357883453369, + "learning_rate": 4.692532942898975e-06, + "loss": 1.0631, + "mean_token_accuracy": 0.6813851594924927, + "num_tokens": 66439729.0, + "step": 2565 + }, + { + "epoch": 0.28179222490665495, + "grad_norm": 2.3100945949554443, + "learning_rate": 4.694363103953148e-06, + "loss": 1.0983, + "mean_token_accuracy": 0.6806092858314514, + "num_tokens": 66461533.0, + "step": 2566 + }, + { + "epoch": 0.2819020426092686, + "grad_norm": 2.230752944946289, + "learning_rate": 4.696193265007321e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6986104846000671, + "num_tokens": 66485025.0, + "step": 2567 + }, + { + "epoch": 0.2820118603118823, + "grad_norm": 1.8748443126678467, + "learning_rate": 4.6980234260614936e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6992377042770386, + "num_tokens": 66513718.0, + "step": 2568 + }, + { + "epoch": 0.28212167801449595, + "grad_norm": 2.1732802391052246, + "learning_rate": 4.6998535871156666e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.699094295501709, + "num_tokens": 66537250.0, + "step": 2569 + }, + { + "epoch": 0.2822314957171096, + "grad_norm": 1.9361273050308228, + "learning_rate": 4.701683748169839e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6960968971252441, + "num_tokens": 66567217.0, + "step": 2570 + }, + { + "epoch": 0.28234131341972324, + "grad_norm": 2.0232112407684326, + "learning_rate": 4.703513909224013e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7092455625534058, + "num_tokens": 66595151.0, + "step": 2571 + }, + { + "epoch": 0.28245113112233694, + "grad_norm": 1.954464316368103, + "learning_rate": 4.705344070278185e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7080469131469727, + "num_tokens": 66622995.0, + "step": 2572 + }, + { + "epoch": 0.2825609488249506, + "grad_norm": 1.9833605289459229, + "learning_rate": 4.707174231332358e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6868140697479248, + "num_tokens": 66653455.0, + "step": 2573 + }, + { + "epoch": 0.28267076652756423, + "grad_norm": 1.9822574853897095, + "learning_rate": 4.70900439238653e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7104458808898926, + "num_tokens": 66679514.0, + "step": 2574 + }, + { + "epoch": 0.2827805842301779, + "grad_norm": 2.1236748695373535, + "learning_rate": 4.710834553440703e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6771429777145386, + "num_tokens": 66705791.0, + "step": 2575 + }, + { + "epoch": 0.2828904019327916, + "grad_norm": 2.389981269836426, + "learning_rate": 4.712664714494876e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7017021179199219, + "num_tokens": 66727367.0, + "step": 2576 + }, + { + "epoch": 0.2830002196354052, + "grad_norm": 2.0825142860412598, + "learning_rate": 4.714494875549049e-06, + "loss": 0.974, + "mean_token_accuracy": 0.702732503414154, + "num_tokens": 66753596.0, + "step": 2577 + }, + { + "epoch": 0.28311003733801887, + "grad_norm": 1.9799699783325195, + "learning_rate": 4.716325036603221e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.732501745223999, + "num_tokens": 66780654.0, + "step": 2578 + }, + { + "epoch": 0.2832198550406326, + "grad_norm": 2.1907715797424316, + "learning_rate": 4.718155197657394e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.6968410611152649, + "num_tokens": 66805579.0, + "step": 2579 + }, + { + "epoch": 0.2833296727432462, + "grad_norm": 2.2870471477508545, + "learning_rate": 4.719985358711567e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6890597343444824, + "num_tokens": 66829422.0, + "step": 2580 + }, + { + "epoch": 0.28343949044585987, + "grad_norm": 2.209843873977661, + "learning_rate": 4.72181551976574e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7024572491645813, + "num_tokens": 66853046.0, + "step": 2581 + }, + { + "epoch": 0.2835493081484735, + "grad_norm": 2.1744725704193115, + "learning_rate": 4.723645680819912e-06, + "loss": 1.1411, + "mean_token_accuracy": 0.6646579504013062, + "num_tokens": 66879442.0, + "step": 2582 + }, + { + "epoch": 0.2836591258510872, + "grad_norm": 2.2081398963928223, + "learning_rate": 4.725475841874085e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6910350322723389, + "num_tokens": 66905289.0, + "step": 2583 + }, + { + "epoch": 0.28376894355370086, + "grad_norm": 2.258596420288086, + "learning_rate": 4.727306002928258e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6947125196456909, + "num_tokens": 66927819.0, + "step": 2584 + }, + { + "epoch": 0.2838787612563145, + "grad_norm": 1.8644002676010132, + "learning_rate": 4.7291361639824305e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7002725005149841, + "num_tokens": 66960510.0, + "step": 2585 + }, + { + "epoch": 0.2839885789589282, + "grad_norm": 1.9351412057876587, + "learning_rate": 4.7309663250366035e-06, + "loss": 1.1225, + "mean_token_accuracy": 0.667647123336792, + "num_tokens": 66989373.0, + "step": 2586 + }, + { + "epoch": 0.28409839666154185, + "grad_norm": 1.9638351202011108, + "learning_rate": 4.7327964860907765e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6889716386795044, + "num_tokens": 67018294.0, + "step": 2587 + }, + { + "epoch": 0.2842082143641555, + "grad_norm": 2.00278902053833, + "learning_rate": 4.7346266471449495e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6960293650627136, + "num_tokens": 67047245.0, + "step": 2588 + }, + { + "epoch": 0.28431803206676914, + "grad_norm": 2.071209669113159, + "learning_rate": 4.736456808199122e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6952037811279297, + "num_tokens": 67075048.0, + "step": 2589 + }, + { + "epoch": 0.28442784976938285, + "grad_norm": 1.998957872390747, + "learning_rate": 4.738286969253295e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6988250017166138, + "num_tokens": 67102353.0, + "step": 2590 + }, + { + "epoch": 0.2845376674719965, + "grad_norm": 2.5072827339172363, + "learning_rate": 4.740117130307467e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7152363061904907, + "num_tokens": 67119504.0, + "step": 2591 + }, + { + "epoch": 0.28464748517461014, + "grad_norm": 1.9661756753921509, + "learning_rate": 4.741947291361641e-06, + "loss": 1.1457, + "mean_token_accuracy": 0.6630808115005493, + "num_tokens": 67154750.0, + "step": 2592 + }, + { + "epoch": 0.2847573028772238, + "grad_norm": 2.175278663635254, + "learning_rate": 4.743777452415813e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6995800137519836, + "num_tokens": 67180818.0, + "step": 2593 + }, + { + "epoch": 0.2848671205798375, + "grad_norm": 2.269852876663208, + "learning_rate": 4.745607613469986e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6943732500076294, + "num_tokens": 67204631.0, + "step": 2594 + }, + { + "epoch": 0.28497693828245113, + "grad_norm": 2.0240349769592285, + "learning_rate": 4.747437774524158e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6799598932266235, + "num_tokens": 67232071.0, + "step": 2595 + }, + { + "epoch": 0.2850867559850648, + "grad_norm": 2.0117995738983154, + "learning_rate": 4.749267935578331e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6901733875274658, + "num_tokens": 67259514.0, + "step": 2596 + }, + { + "epoch": 0.2851965736876785, + "grad_norm": 1.8810231685638428, + "learning_rate": 4.751098096632504e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6974653005599976, + "num_tokens": 67290460.0, + "step": 2597 + }, + { + "epoch": 0.2853063913902921, + "grad_norm": 2.2251479625701904, + "learning_rate": 4.752928257686677e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6983516216278076, + "num_tokens": 67311632.0, + "step": 2598 + }, + { + "epoch": 0.28541620909290577, + "grad_norm": 2.2532901763916016, + "learning_rate": 4.754758418740849e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7029955983161926, + "num_tokens": 67333780.0, + "step": 2599 + }, + { + "epoch": 0.2855260267955194, + "grad_norm": 1.9583076238632202, + "learning_rate": 4.756588579795022e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7229572534561157, + "num_tokens": 67359884.0, + "step": 2600 + }, + { + "epoch": 0.2856358444981331, + "grad_norm": 2.0804672241210938, + "learning_rate": 4.758418740849195e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7323154807090759, + "num_tokens": 67382776.0, + "step": 2601 + }, + { + "epoch": 0.28574566220074676, + "grad_norm": 2.0059101581573486, + "learning_rate": 4.7602489019033674e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6878331899642944, + "num_tokens": 67411921.0, + "step": 2602 + }, + { + "epoch": 0.2858554799033604, + "grad_norm": 2.109692096710205, + "learning_rate": 4.7620790629575404e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7177788615226746, + "num_tokens": 67436515.0, + "step": 2603 + }, + { + "epoch": 0.28596529760597406, + "grad_norm": 2.02590274810791, + "learning_rate": 4.7639092240117134e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6833999156951904, + "num_tokens": 67467790.0, + "step": 2604 + }, + { + "epoch": 0.28607511530858776, + "grad_norm": 2.079669237136841, + "learning_rate": 4.7657393850658865e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7019878029823303, + "num_tokens": 67493937.0, + "step": 2605 + }, + { + "epoch": 0.2861849330112014, + "grad_norm": 2.3957622051239014, + "learning_rate": 4.767569546120059e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7212693095207214, + "num_tokens": 67512818.0, + "step": 2606 + }, + { + "epoch": 0.28629475071381505, + "grad_norm": 2.0135552883148193, + "learning_rate": 4.769399707174232e-06, + "loss": 1.1085, + "mean_token_accuracy": 0.6701594591140747, + "num_tokens": 67544697.0, + "step": 2607 + }, + { + "epoch": 0.28640456841642875, + "grad_norm": 1.9673875570297241, + "learning_rate": 4.771229868228405e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6898941993713379, + "num_tokens": 67571403.0, + "step": 2608 + }, + { + "epoch": 0.2865143861190424, + "grad_norm": 1.9515126943588257, + "learning_rate": 4.773060029282578e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6840593814849854, + "num_tokens": 67601354.0, + "step": 2609 + }, + { + "epoch": 0.28662420382165604, + "grad_norm": 2.3201351165771484, + "learning_rate": 4.77489019033675e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7061194181442261, + "num_tokens": 67622530.0, + "step": 2610 + }, + { + "epoch": 0.2867340215242697, + "grad_norm": 2.238157272338867, + "learning_rate": 4.776720351390923e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6853299140930176, + "num_tokens": 67644974.0, + "step": 2611 + }, + { + "epoch": 0.2868438392268834, + "grad_norm": 2.111133098602295, + "learning_rate": 4.778550512445095e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7092322111129761, + "num_tokens": 67669093.0, + "step": 2612 + }, + { + "epoch": 0.28695365692949704, + "grad_norm": 2.364443778991699, + "learning_rate": 4.780380673499269e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7000338435173035, + "num_tokens": 67689800.0, + "step": 2613 + }, + { + "epoch": 0.2870634746321107, + "grad_norm": 1.9567610025405884, + "learning_rate": 4.782210834553441e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7142312526702881, + "num_tokens": 67717602.0, + "step": 2614 + }, + { + "epoch": 0.2871732923347244, + "grad_norm": 2.0595180988311768, + "learning_rate": 4.784040995607614e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7132064700126648, + "num_tokens": 67742603.0, + "step": 2615 + }, + { + "epoch": 0.28728311003733803, + "grad_norm": 2.5070109367370605, + "learning_rate": 4.785871156661786e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7115304470062256, + "num_tokens": 67762100.0, + "step": 2616 + }, + { + "epoch": 0.2873929277399517, + "grad_norm": 2.2693538665771484, + "learning_rate": 4.787701317715959e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7024117112159729, + "num_tokens": 67784918.0, + "step": 2617 + }, + { + "epoch": 0.2875027454425653, + "grad_norm": 2.197474479675293, + "learning_rate": 4.789531478770132e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7068597078323364, + "num_tokens": 67808253.0, + "step": 2618 + }, + { + "epoch": 0.287612563145179, + "grad_norm": 2.170623302459717, + "learning_rate": 4.791361639824305e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.700437068939209, + "num_tokens": 67833726.0, + "step": 2619 + }, + { + "epoch": 0.28772238084779267, + "grad_norm": 2.3527567386627197, + "learning_rate": 4.793191800878478e-06, + "loss": 0.964, + "mean_token_accuracy": 0.6989771127700806, + "num_tokens": 67856840.0, + "step": 2620 + }, + { + "epoch": 0.2878321985504063, + "grad_norm": 2.5319769382476807, + "learning_rate": 4.79502196193265e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7052626013755798, + "num_tokens": 67876848.0, + "step": 2621 + }, + { + "epoch": 0.28794201625301996, + "grad_norm": 2.1096644401550293, + "learning_rate": 4.796852122986823e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6788049340248108, + "num_tokens": 67903328.0, + "step": 2622 + }, + { + "epoch": 0.28805183395563366, + "grad_norm": 2.525885581970215, + "learning_rate": 4.7986822840409956e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7257137298583984, + "num_tokens": 67920760.0, + "step": 2623 + }, + { + "epoch": 0.2881616516582473, + "grad_norm": 2.302537441253662, + "learning_rate": 4.800512445095169e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6839118599891663, + "num_tokens": 67947644.0, + "step": 2624 + }, + { + "epoch": 0.28827146936086095, + "grad_norm": 2.3122198581695557, + "learning_rate": 4.8023426061493416e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7102838158607483, + "num_tokens": 67970626.0, + "step": 2625 + }, + { + "epoch": 0.28838128706347466, + "grad_norm": 2.070342779159546, + "learning_rate": 4.804172767203515e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6976621150970459, + "num_tokens": 67998351.0, + "step": 2626 + }, + { + "epoch": 0.2884911047660883, + "grad_norm": 2.106929063796997, + "learning_rate": 4.806002928257687e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6885931491851807, + "num_tokens": 68022665.0, + "step": 2627 + }, + { + "epoch": 0.28860092246870195, + "grad_norm": 2.2038402557373047, + "learning_rate": 4.80783308931186e-06, + "loss": 1.096, + "mean_token_accuracy": 0.672214150428772, + "num_tokens": 68048752.0, + "step": 2628 + }, + { + "epoch": 0.2887107401713156, + "grad_norm": 2.9575464725494385, + "learning_rate": 4.809663250366033e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7053194642066956, + "num_tokens": 68074077.0, + "step": 2629 + }, + { + "epoch": 0.2888205578739293, + "grad_norm": 2.340214967727661, + "learning_rate": 4.811493411420206e-06, + "loss": 1.0963, + "mean_token_accuracy": 0.6752364039421082, + "num_tokens": 68099605.0, + "step": 2630 + }, + { + "epoch": 0.28893037557654294, + "grad_norm": 2.1447694301605225, + "learning_rate": 4.813323572474378e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.698872447013855, + "num_tokens": 68126552.0, + "step": 2631 + }, + { + "epoch": 0.2890401932791566, + "grad_norm": 2.0173165798187256, + "learning_rate": 4.815153733528551e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6937627196311951, + "num_tokens": 68154765.0, + "step": 2632 + }, + { + "epoch": 0.2891500109817703, + "grad_norm": 2.1467437744140625, + "learning_rate": 4.816983894582723e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6730949878692627, + "num_tokens": 68181587.0, + "step": 2633 + }, + { + "epoch": 0.28925982868438394, + "grad_norm": 2.242307424545288, + "learning_rate": 4.818814055636896e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6904112100601196, + "num_tokens": 68204203.0, + "step": 2634 + }, + { + "epoch": 0.2893696463869976, + "grad_norm": 1.9166439771652222, + "learning_rate": 4.820644216691069e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7145786881446838, + "num_tokens": 68232131.0, + "step": 2635 + }, + { + "epoch": 0.2894794640896112, + "grad_norm": 2.1115801334381104, + "learning_rate": 4.822474377745242e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7259215712547302, + "num_tokens": 68255623.0, + "step": 2636 + }, + { + "epoch": 0.28958928179222493, + "grad_norm": 2.091181993484497, + "learning_rate": 4.824304538799415e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6941792964935303, + "num_tokens": 68281099.0, + "step": 2637 + }, + { + "epoch": 0.2896990994948386, + "grad_norm": 1.9649910926818848, + "learning_rate": 4.826134699853587e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7071987390518188, + "num_tokens": 68310144.0, + "step": 2638 + }, + { + "epoch": 0.2898089171974522, + "grad_norm": 2.0659384727478027, + "learning_rate": 4.82796486090776e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6976100206375122, + "num_tokens": 68336932.0, + "step": 2639 + }, + { + "epoch": 0.28991873490006587, + "grad_norm": 1.992100715637207, + "learning_rate": 4.829795021961933e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.704375147819519, + "num_tokens": 68362596.0, + "step": 2640 + }, + { + "epoch": 0.29002855260267957, + "grad_norm": 2.082274913787842, + "learning_rate": 4.831625183016106e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6845259666442871, + "num_tokens": 68388625.0, + "step": 2641 + }, + { + "epoch": 0.2901383703052932, + "grad_norm": 2.149841070175171, + "learning_rate": 4.8334553440702785e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6975100636482239, + "num_tokens": 68412514.0, + "step": 2642 + }, + { + "epoch": 0.29024818800790686, + "grad_norm": 2.309117555618286, + "learning_rate": 4.8352855051244515e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.722176194190979, + "num_tokens": 68433861.0, + "step": 2643 + }, + { + "epoch": 0.29035800571052056, + "grad_norm": 2.1857151985168457, + "learning_rate": 4.837115666178624e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6898493766784668, + "num_tokens": 68459636.0, + "step": 2644 + }, + { + "epoch": 0.2904678234131342, + "grad_norm": 2.072990894317627, + "learning_rate": 4.8389458272327975e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6899614334106445, + "num_tokens": 68485791.0, + "step": 2645 + }, + { + "epoch": 0.29057764111574785, + "grad_norm": 2.146996259689331, + "learning_rate": 4.84077598828697e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7079395651817322, + "num_tokens": 68511178.0, + "step": 2646 + }, + { + "epoch": 0.2906874588183615, + "grad_norm": 2.100750207901001, + "learning_rate": 4.842606149341143e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6909639835357666, + "num_tokens": 68539082.0, + "step": 2647 + }, + { + "epoch": 0.2907972765209752, + "grad_norm": 1.9906636476516724, + "learning_rate": 4.844436310395315e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7119476199150085, + "num_tokens": 68565689.0, + "step": 2648 + }, + { + "epoch": 0.29090709422358885, + "grad_norm": 1.8868037462234497, + "learning_rate": 4.846266471449488e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6977620124816895, + "num_tokens": 68595931.0, + "step": 2649 + }, + { + "epoch": 0.2910169119262025, + "grad_norm": 2.1449639797210693, + "learning_rate": 4.84809663250366e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7144747972488403, + "num_tokens": 68620779.0, + "step": 2650 + }, + { + "epoch": 0.29112672962881614, + "grad_norm": 2.2193381786346436, + "learning_rate": 4.849926793557834e-06, + "loss": 1.02, + "mean_token_accuracy": 0.7146804928779602, + "num_tokens": 68643565.0, + "step": 2651 + }, + { + "epoch": 0.29123654733142984, + "grad_norm": 2.0954654216766357, + "learning_rate": 4.851756954612006e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7081906795501709, + "num_tokens": 68670626.0, + "step": 2652 + }, + { + "epoch": 0.2913463650340435, + "grad_norm": 2.307279109954834, + "learning_rate": 4.853587115666179e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7095229625701904, + "num_tokens": 68690377.0, + "step": 2653 + }, + { + "epoch": 0.29145618273665713, + "grad_norm": 2.2741334438323975, + "learning_rate": 4.855417276720352e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7017602920532227, + "num_tokens": 68713418.0, + "step": 2654 + }, + { + "epoch": 0.29156600043927083, + "grad_norm": 2.1774203777313232, + "learning_rate": 4.857247437774524e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.7006773948669434, + "num_tokens": 68738897.0, + "step": 2655 + }, + { + "epoch": 0.2916758181418845, + "grad_norm": 1.9992913007736206, + "learning_rate": 4.859077598828697e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7080428600311279, + "num_tokens": 68768455.0, + "step": 2656 + }, + { + "epoch": 0.2917856358444981, + "grad_norm": 2.214062452316284, + "learning_rate": 4.86090775988287e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7045968174934387, + "num_tokens": 68793203.0, + "step": 2657 + }, + { + "epoch": 0.29189545354711177, + "grad_norm": 2.4822168350219727, + "learning_rate": 4.862737920937043e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.721549391746521, + "num_tokens": 68811679.0, + "step": 2658 + }, + { + "epoch": 0.2920052712497255, + "grad_norm": 1.8623446226119995, + "learning_rate": 4.8645680819912154e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6938192248344421, + "num_tokens": 68842285.0, + "step": 2659 + }, + { + "epoch": 0.2921150889523391, + "grad_norm": 1.9536867141723633, + "learning_rate": 4.8663982430453885e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6922545433044434, + "num_tokens": 68872099.0, + "step": 2660 + }, + { + "epoch": 0.29222490665495277, + "grad_norm": 1.9826271533966064, + "learning_rate": 4.8682284040995615e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7040801644325256, + "num_tokens": 68898476.0, + "step": 2661 + }, + { + "epoch": 0.29233472435756647, + "grad_norm": 2.086592674255371, + "learning_rate": 4.8700585651537345e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6893085241317749, + "num_tokens": 68923398.0, + "step": 2662 + }, + { + "epoch": 0.2924445420601801, + "grad_norm": 1.8487118482589722, + "learning_rate": 4.871888726207907e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6952567100524902, + "num_tokens": 68955963.0, + "step": 2663 + }, + { + "epoch": 0.29255435976279376, + "grad_norm": 2.423539638519287, + "learning_rate": 4.87371888726208e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7160803079605103, + "num_tokens": 68978141.0, + "step": 2664 + }, + { + "epoch": 0.2926641774654074, + "grad_norm": 1.955578088760376, + "learning_rate": 4.875549048316252e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.697651743888855, + "num_tokens": 69010976.0, + "step": 2665 + }, + { + "epoch": 0.2927739951680211, + "grad_norm": 2.0694594383239746, + "learning_rate": 4.877379209370425e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6893723011016846, + "num_tokens": 69039728.0, + "step": 2666 + }, + { + "epoch": 0.29288381287063475, + "grad_norm": 2.255424976348877, + "learning_rate": 4.879209370424598e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6970800161361694, + "num_tokens": 69061270.0, + "step": 2667 + }, + { + "epoch": 0.2929936305732484, + "grad_norm": 2.0334925651550293, + "learning_rate": 4.881039531478771e-06, + "loss": 1.0946, + "mean_token_accuracy": 0.6746362447738647, + "num_tokens": 69087995.0, + "step": 2668 + }, + { + "epoch": 0.29310344827586204, + "grad_norm": 2.0397257804870605, + "learning_rate": 4.882869692532943e-06, + "loss": 1.07, + "mean_token_accuracy": 0.679755449295044, + "num_tokens": 69115528.0, + "step": 2669 + }, + { + "epoch": 0.29321326597847575, + "grad_norm": 2.41890025138855, + "learning_rate": 4.884699853587116e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.692540168762207, + "num_tokens": 69138505.0, + "step": 2670 + }, + { + "epoch": 0.2933230836810894, + "grad_norm": 2.487152576446533, + "learning_rate": 4.886530014641289e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.6887307167053223, + "num_tokens": 69160683.0, + "step": 2671 + }, + { + "epoch": 0.29343290138370304, + "grad_norm": 2.320434093475342, + "learning_rate": 4.888360175695462e-06, + "loss": 0.98, + "mean_token_accuracy": 0.6988824009895325, + "num_tokens": 69182971.0, + "step": 2672 + }, + { + "epoch": 0.29354271908631674, + "grad_norm": 1.990928053855896, + "learning_rate": 4.890190336749634e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.715144693851471, + "num_tokens": 69210278.0, + "step": 2673 + }, + { + "epoch": 0.2936525367889304, + "grad_norm": 2.0351901054382324, + "learning_rate": 4.892020497803807e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.707476019859314, + "num_tokens": 69236355.0, + "step": 2674 + }, + { + "epoch": 0.29376235449154403, + "grad_norm": 1.8601354360580444, + "learning_rate": 4.89385065885798e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7107107639312744, + "num_tokens": 69266689.0, + "step": 2675 + }, + { + "epoch": 0.2938721721941577, + "grad_norm": 1.8467518091201782, + "learning_rate": 4.895680819912152e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7039451599121094, + "num_tokens": 69299064.0, + "step": 2676 + }, + { + "epoch": 0.2939819898967714, + "grad_norm": 2.057124137878418, + "learning_rate": 4.897510980966325e-06, + "loss": 1.0863, + "mean_token_accuracy": 0.6776862144470215, + "num_tokens": 69327757.0, + "step": 2677 + }, + { + "epoch": 0.294091807599385, + "grad_norm": 2.264960289001465, + "learning_rate": 4.899341142020498e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6969651579856873, + "num_tokens": 69350082.0, + "step": 2678 + }, + { + "epoch": 0.29420162530199867, + "grad_norm": 1.9592489004135132, + "learning_rate": 4.901171303074671e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6928355693817139, + "num_tokens": 69377228.0, + "step": 2679 + }, + { + "epoch": 0.2943114430046123, + "grad_norm": 1.9764196872711182, + "learning_rate": 4.903001464128844e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7131205797195435, + "num_tokens": 69401521.0, + "step": 2680 + }, + { + "epoch": 0.294421260707226, + "grad_norm": 1.928614616394043, + "learning_rate": 4.904831625183017e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7093040943145752, + "num_tokens": 69427873.0, + "step": 2681 + }, + { + "epoch": 0.29453107840983966, + "grad_norm": 1.9784094095230103, + "learning_rate": 4.906661786237189e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.682657778263092, + "num_tokens": 69460632.0, + "step": 2682 + }, + { + "epoch": 0.2946408961124533, + "grad_norm": 2.2891855239868164, + "learning_rate": 4.908491947291363e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.690203070640564, + "num_tokens": 69483534.0, + "step": 2683 + }, + { + "epoch": 0.294750713815067, + "grad_norm": 2.1554694175720215, + "learning_rate": 4.910322108345535e-06, + "loss": 1.0805, + "mean_token_accuracy": 0.6749656796455383, + "num_tokens": 69510793.0, + "step": 2684 + }, + { + "epoch": 0.29486053151768066, + "grad_norm": 2.1692490577697754, + "learning_rate": 4.912152269399708e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6898360252380371, + "num_tokens": 69539554.0, + "step": 2685 + }, + { + "epoch": 0.2949703492202943, + "grad_norm": 1.848903775215149, + "learning_rate": 4.91398243045388e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7081212997436523, + "num_tokens": 69568431.0, + "step": 2686 + }, + { + "epoch": 0.29508016692290795, + "grad_norm": 1.9168730974197388, + "learning_rate": 4.915812591508053e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6942021250724792, + "num_tokens": 69599547.0, + "step": 2687 + }, + { + "epoch": 0.29518998462552165, + "grad_norm": 1.7830699682235718, + "learning_rate": 4.917642752562226e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.683629035949707, + "num_tokens": 69633484.0, + "step": 2688 + }, + { + "epoch": 0.2952998023281353, + "grad_norm": 2.205845832824707, + "learning_rate": 4.919472913616399e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7042465209960938, + "num_tokens": 69656584.0, + "step": 2689 + }, + { + "epoch": 0.29540962003074894, + "grad_norm": 2.358649492263794, + "learning_rate": 4.921303074670571e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7021023631095886, + "num_tokens": 69678206.0, + "step": 2690 + }, + { + "epoch": 0.29551943773336264, + "grad_norm": 2.1758651733398438, + "learning_rate": 4.923133235724744e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6820043325424194, + "num_tokens": 69704210.0, + "step": 2691 + }, + { + "epoch": 0.2956292554359763, + "grad_norm": 2.134956121444702, + "learning_rate": 4.924963396778917e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6861929893493652, + "num_tokens": 69729498.0, + "step": 2692 + }, + { + "epoch": 0.29573907313858994, + "grad_norm": 1.8821161985397339, + "learning_rate": 4.92679355783309e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6783047914505005, + "num_tokens": 69759029.0, + "step": 2693 + }, + { + "epoch": 0.2958488908412036, + "grad_norm": 2.103518009185791, + "learning_rate": 4.928623718887262e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6782001852989197, + "num_tokens": 69784830.0, + "step": 2694 + }, + { + "epoch": 0.2959587085438173, + "grad_norm": 2.0854432582855225, + "learning_rate": 4.930453879941435e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7199294567108154, + "num_tokens": 69809312.0, + "step": 2695 + }, + { + "epoch": 0.29606852624643093, + "grad_norm": 2.018158197402954, + "learning_rate": 4.932284040995608e-06, + "loss": 1.0848, + "mean_token_accuracy": 0.6876476407051086, + "num_tokens": 69839111.0, + "step": 2696 + }, + { + "epoch": 0.2961783439490446, + "grad_norm": 1.6943544149398804, + "learning_rate": 4.9341142020497805e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6954768896102905, + "num_tokens": 69875505.0, + "step": 2697 + }, + { + "epoch": 0.2962881616516582, + "grad_norm": 1.9200645685195923, + "learning_rate": 4.9359443631039535e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6912564039230347, + "num_tokens": 69906359.0, + "step": 2698 + }, + { + "epoch": 0.2963979793542719, + "grad_norm": 2.141848564147949, + "learning_rate": 4.9377745241581265e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7099350094795227, + "num_tokens": 69932659.0, + "step": 2699 + }, + { + "epoch": 0.29650779705688557, + "grad_norm": 2.0079855918884277, + "learning_rate": 4.9396046852122995e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7005214095115662, + "num_tokens": 69958413.0, + "step": 2700 + }, + { + "epoch": 0.2966176147594992, + "grad_norm": 1.977754831314087, + "learning_rate": 4.941434846266472e-06, + "loss": 1.0796, + "mean_token_accuracy": 0.6769822835922241, + "num_tokens": 69987173.0, + "step": 2701 + }, + { + "epoch": 0.2967274324621129, + "grad_norm": 2.045529365539551, + "learning_rate": 4.943265007320645e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.6995010375976562, + "num_tokens": 70013631.0, + "step": 2702 + }, + { + "epoch": 0.29683725016472656, + "grad_norm": 2.0590453147888184, + "learning_rate": 4.945095168374817e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6804415583610535, + "num_tokens": 70038927.0, + "step": 2703 + }, + { + "epoch": 0.2969470678673402, + "grad_norm": 1.8991847038269043, + "learning_rate": 4.946925329428991e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6799393892288208, + "num_tokens": 70069352.0, + "step": 2704 + }, + { + "epoch": 0.29705688556995385, + "grad_norm": 2.134006977081299, + "learning_rate": 4.948755490483163e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6974890828132629, + "num_tokens": 70092854.0, + "step": 2705 + }, + { + "epoch": 0.29716670327256756, + "grad_norm": 1.8260996341705322, + "learning_rate": 4.950585651537336e-06, + "loss": 1.0871, + "mean_token_accuracy": 0.6744384765625, + "num_tokens": 70126256.0, + "step": 2706 + }, + { + "epoch": 0.2972765209751812, + "grad_norm": 2.2739310264587402, + "learning_rate": 4.952415812591508e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7155838012695312, + "num_tokens": 70148212.0, + "step": 2707 + }, + { + "epoch": 0.29738633867779485, + "grad_norm": 2.1657874584198, + "learning_rate": 4.954245973645681e-06, + "loss": 1.069, + "mean_token_accuracy": 0.6871070265769958, + "num_tokens": 70172457.0, + "step": 2708 + }, + { + "epoch": 0.29749615638040855, + "grad_norm": 1.9076184034347534, + "learning_rate": 4.956076134699854e-06, + "loss": 1.0762, + "mean_token_accuracy": 0.6780685782432556, + "num_tokens": 70201135.0, + "step": 2709 + }, + { + "epoch": 0.2976059740830222, + "grad_norm": 1.8837289810180664, + "learning_rate": 4.957906295754027e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6978353261947632, + "num_tokens": 70230854.0, + "step": 2710 + }, + { + "epoch": 0.29771579178563584, + "grad_norm": 1.9760366678237915, + "learning_rate": 4.959736456808199e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6806375980377197, + "num_tokens": 70255840.0, + "step": 2711 + }, + { + "epoch": 0.2978256094882495, + "grad_norm": 2.186692714691162, + "learning_rate": 4.961566617862372e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.709776759147644, + "num_tokens": 70276652.0, + "step": 2712 + }, + { + "epoch": 0.2979354271908632, + "grad_norm": 1.9453716278076172, + "learning_rate": 4.963396778916545e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7034599781036377, + "num_tokens": 70304221.0, + "step": 2713 + }, + { + "epoch": 0.29804524489347684, + "grad_norm": 2.247288703918457, + "learning_rate": 4.9652269399707175e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7031819820404053, + "num_tokens": 70327732.0, + "step": 2714 + }, + { + "epoch": 0.2981550625960905, + "grad_norm": 1.998866081237793, + "learning_rate": 4.9670571010248905e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6976858973503113, + "num_tokens": 70353888.0, + "step": 2715 + }, + { + "epoch": 0.2982648802987041, + "grad_norm": 2.036289930343628, + "learning_rate": 4.9688872620790635e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7072877883911133, + "num_tokens": 70380426.0, + "step": 2716 + }, + { + "epoch": 0.29837469800131783, + "grad_norm": 2.1395392417907715, + "learning_rate": 4.9707174231332365e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7172852754592896, + "num_tokens": 70402633.0, + "step": 2717 + }, + { + "epoch": 0.2984845157039315, + "grad_norm": 2.0043177604675293, + "learning_rate": 4.972547584187409e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.715121865272522, + "num_tokens": 70431322.0, + "step": 2718 + }, + { + "epoch": 0.2985943334065451, + "grad_norm": 2.1274986267089844, + "learning_rate": 4.974377745241582e-06, + "loss": 0.97, + "mean_token_accuracy": 0.705299973487854, + "num_tokens": 70455190.0, + "step": 2719 + }, + { + "epoch": 0.2987041511091588, + "grad_norm": 2.150383234024048, + "learning_rate": 4.976207906295755e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7069064974784851, + "num_tokens": 70478616.0, + "step": 2720 + }, + { + "epoch": 0.29881396881177247, + "grad_norm": 2.0269386768341064, + "learning_rate": 4.978038067349928e-06, + "loss": 1.1128, + "mean_token_accuracy": 0.6675028800964355, + "num_tokens": 70507599.0, + "step": 2721 + }, + { + "epoch": 0.2989237865143861, + "grad_norm": 2.0951130390167236, + "learning_rate": 4.9798682284041e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7063184976577759, + "num_tokens": 70533611.0, + "step": 2722 + }, + { + "epoch": 0.29903360421699976, + "grad_norm": 1.9738030433654785, + "learning_rate": 4.981698389458273e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6926183700561523, + "num_tokens": 70560242.0, + "step": 2723 + }, + { + "epoch": 0.29914342191961346, + "grad_norm": 2.0300934314727783, + "learning_rate": 4.983528550512445e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.711946427822113, + "num_tokens": 70586355.0, + "step": 2724 + }, + { + "epoch": 0.2992532396222271, + "grad_norm": 2.240138530731201, + "learning_rate": 4.985358711566619e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7056006789207458, + "num_tokens": 70608769.0, + "step": 2725 + }, + { + "epoch": 0.29936305732484075, + "grad_norm": 1.9788440465927124, + "learning_rate": 4.987188872620791e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6831364035606384, + "num_tokens": 70636039.0, + "step": 2726 + }, + { + "epoch": 0.2994728750274544, + "grad_norm": 2.1232192516326904, + "learning_rate": 4.989019033674964e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.7038111686706543, + "num_tokens": 70661430.0, + "step": 2727 + }, + { + "epoch": 0.2995826927300681, + "grad_norm": 1.8305267095565796, + "learning_rate": 4.990849194729136e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6822882890701294, + "num_tokens": 70693031.0, + "step": 2728 + }, + { + "epoch": 0.29969251043268175, + "grad_norm": 1.9430650472640991, + "learning_rate": 4.992679355783309e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6881463527679443, + "num_tokens": 70722507.0, + "step": 2729 + }, + { + "epoch": 0.2998023281352954, + "grad_norm": 1.7632657289505005, + "learning_rate": 4.994509516837482e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6974742412567139, + "num_tokens": 70754283.0, + "step": 2730 + }, + { + "epoch": 0.2999121458379091, + "grad_norm": 2.3511180877685547, + "learning_rate": 4.996339677891655e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6986281871795654, + "num_tokens": 70776558.0, + "step": 2731 + }, + { + "epoch": 0.30002196354052274, + "grad_norm": 1.935287594795227, + "learning_rate": 4.998169838945827e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6891862750053406, + "num_tokens": 70806010.0, + "step": 2732 + }, + { + "epoch": 0.3001317812431364, + "grad_norm": 2.1058647632598877, + "learning_rate": 5e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6832999587059021, + "num_tokens": 70833826.0, + "step": 2733 + }, + { + "epoch": 0.30024159894575003, + "grad_norm": 2.1185898780822754, + "learning_rate": 5e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6942952871322632, + "num_tokens": 70859277.0, + "step": 2734 + }, + { + "epoch": 0.30035141664836373, + "grad_norm": 2.4362001419067383, + "learning_rate": 5e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7167171239852905, + "num_tokens": 70877971.0, + "step": 2735 + }, + { + "epoch": 0.3004612343509774, + "grad_norm": 2.0449349880218506, + "learning_rate": 5e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6956244111061096, + "num_tokens": 70905952.0, + "step": 2736 + }, + { + "epoch": 0.300571052053591, + "grad_norm": 2.035430431365967, + "learning_rate": 5e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7215102910995483, + "num_tokens": 70931589.0, + "step": 2737 + }, + { + "epoch": 0.3006808697562047, + "grad_norm": 2.079406261444092, + "learning_rate": 5e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7025189399719238, + "num_tokens": 70958784.0, + "step": 2738 + }, + { + "epoch": 0.3007906874588184, + "grad_norm": 2.2620720863342285, + "learning_rate": 5e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.679097056388855, + "num_tokens": 70982681.0, + "step": 2739 + }, + { + "epoch": 0.300900505161432, + "grad_norm": 2.101489543914795, + "learning_rate": 5e-06, + "loss": 1.078, + "mean_token_accuracy": 0.6785386800765991, + "num_tokens": 71009402.0, + "step": 2740 + }, + { + "epoch": 0.30101032286404567, + "grad_norm": 2.2936904430389404, + "learning_rate": 5e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7149989008903503, + "num_tokens": 71032359.0, + "step": 2741 + }, + { + "epoch": 0.30112014056665937, + "grad_norm": 2.394784688949585, + "learning_rate": 5e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7053502202033997, + "num_tokens": 71051996.0, + "step": 2742 + }, + { + "epoch": 0.301229958269273, + "grad_norm": 2.3540656566619873, + "learning_rate": 5e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6931328773498535, + "num_tokens": 71072357.0, + "step": 2743 + }, + { + "epoch": 0.30133977597188666, + "grad_norm": 2.059934616088867, + "learning_rate": 5e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.685674250125885, + "num_tokens": 71098895.0, + "step": 2744 + }, + { + "epoch": 0.3014495936745003, + "grad_norm": 2.1310763359069824, + "learning_rate": 5e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7146044373512268, + "num_tokens": 71120594.0, + "step": 2745 + }, + { + "epoch": 0.301559411377114, + "grad_norm": 1.9549014568328857, + "learning_rate": 5e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6888156533241272, + "num_tokens": 71149013.0, + "step": 2746 + }, + { + "epoch": 0.30166922907972765, + "grad_norm": 2.1211540699005127, + "learning_rate": 5e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7313282489776611, + "num_tokens": 71171203.0, + "step": 2747 + }, + { + "epoch": 0.3017790467823413, + "grad_norm": 1.9577362537384033, + "learning_rate": 5e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6927353143692017, + "num_tokens": 71202582.0, + "step": 2748 + }, + { + "epoch": 0.301888864484955, + "grad_norm": 1.9750524759292603, + "learning_rate": 5e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6927375793457031, + "num_tokens": 71230549.0, + "step": 2749 + }, + { + "epoch": 0.30199868218756865, + "grad_norm": 2.256997585296631, + "learning_rate": 5e-06, + "loss": 1.0653, + "mean_token_accuracy": 0.6753997206687927, + "num_tokens": 71257485.0, + "step": 2750 + }, + { + "epoch": 0.3021084998901823, + "grad_norm": 2.0685276985168457, + "learning_rate": 5e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.679703414440155, + "num_tokens": 71284800.0, + "step": 2751 + }, + { + "epoch": 0.30221831759279594, + "grad_norm": 1.8049525022506714, + "learning_rate": 5e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7083991765975952, + "num_tokens": 71317263.0, + "step": 2752 + }, + { + "epoch": 0.30232813529540964, + "grad_norm": 2.150331735610962, + "learning_rate": 5e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7301743030548096, + "num_tokens": 71340233.0, + "step": 2753 + }, + { + "epoch": 0.3024379529980233, + "grad_norm": 2.102522134780884, + "learning_rate": 5e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7239875197410583, + "num_tokens": 71362296.0, + "step": 2754 + }, + { + "epoch": 0.30254777070063693, + "grad_norm": 2.2254462242126465, + "learning_rate": 5e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6900263428688049, + "num_tokens": 71384922.0, + "step": 2755 + }, + { + "epoch": 0.3026575884032506, + "grad_norm": 1.904090166091919, + "learning_rate": 5e-06, + "loss": 1.0618, + "mean_token_accuracy": 0.689591109752655, + "num_tokens": 71414528.0, + "step": 2756 + }, + { + "epoch": 0.3027674061058643, + "grad_norm": 2.0330231189727783, + "learning_rate": 5e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.6936138868331909, + "num_tokens": 71440034.0, + "step": 2757 + }, + { + "epoch": 0.3028772238084779, + "grad_norm": 1.9048703908920288, + "learning_rate": 5e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6815423965454102, + "num_tokens": 71473515.0, + "step": 2758 + }, + { + "epoch": 0.30298704151109157, + "grad_norm": 2.0468547344207764, + "learning_rate": 5e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6943652629852295, + "num_tokens": 71498456.0, + "step": 2759 + }, + { + "epoch": 0.30309685921370527, + "grad_norm": 2.3834421634674072, + "learning_rate": 5e-06, + "loss": 1.0945, + "mean_token_accuracy": 0.681896984577179, + "num_tokens": 71522516.0, + "step": 2760 + }, + { + "epoch": 0.3032066769163189, + "grad_norm": 1.987179160118103, + "learning_rate": 5e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7100856304168701, + "num_tokens": 71553675.0, + "step": 2761 + }, + { + "epoch": 0.30331649461893256, + "grad_norm": 2.0310475826263428, + "learning_rate": 5e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6879595518112183, + "num_tokens": 71582219.0, + "step": 2762 + }, + { + "epoch": 0.3034263123215462, + "grad_norm": 1.959519863128662, + "learning_rate": 5e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7097116708755493, + "num_tokens": 71609037.0, + "step": 2763 + }, + { + "epoch": 0.3035361300241599, + "grad_norm": 2.0534474849700928, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7062102556228638, + "num_tokens": 71637377.0, + "step": 2764 + }, + { + "epoch": 0.30364594772677356, + "grad_norm": 2.2104568481445312, + "learning_rate": 5e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6971543431282043, + "num_tokens": 71664626.0, + "step": 2765 + }, + { + "epoch": 0.3037557654293872, + "grad_norm": 1.7497963905334473, + "learning_rate": 5e-06, + "loss": 1.1172, + "mean_token_accuracy": 0.66651850938797, + "num_tokens": 71701551.0, + "step": 2766 + }, + { + "epoch": 0.3038655831320009, + "grad_norm": 1.8143925666809082, + "learning_rate": 5e-06, + "loss": 1.0904, + "mean_token_accuracy": 0.6792320609092712, + "num_tokens": 71735958.0, + "step": 2767 + }, + { + "epoch": 0.30397540083461455, + "grad_norm": 1.945694923400879, + "learning_rate": 5e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.710464596748352, + "num_tokens": 71764261.0, + "step": 2768 + }, + { + "epoch": 0.3040852185372282, + "grad_norm": 2.396001100540161, + "learning_rate": 5e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6990534067153931, + "num_tokens": 71784413.0, + "step": 2769 + }, + { + "epoch": 0.30419503623984184, + "grad_norm": 2.052231550216675, + "learning_rate": 5e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7241253852844238, + "num_tokens": 71806818.0, + "step": 2770 + }, + { + "epoch": 0.30430485394245554, + "grad_norm": 1.918864130973816, + "learning_rate": 5e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6901812553405762, + "num_tokens": 71834870.0, + "step": 2771 + }, + { + "epoch": 0.3044146716450692, + "grad_norm": 2.0096707344055176, + "learning_rate": 5e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.7066841125488281, + "num_tokens": 71859850.0, + "step": 2772 + }, + { + "epoch": 0.30452448934768284, + "grad_norm": 1.9894115924835205, + "learning_rate": 5e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.700837254524231, + "num_tokens": 71884424.0, + "step": 2773 + }, + { + "epoch": 0.3046343070502965, + "grad_norm": 1.9868003129959106, + "learning_rate": 5e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7003192901611328, + "num_tokens": 71912804.0, + "step": 2774 + }, + { + "epoch": 0.3047441247529102, + "grad_norm": 1.8716719150543213, + "learning_rate": 5e-06, + "loss": 1.1228, + "mean_token_accuracy": 0.6616541743278503, + "num_tokens": 71943140.0, + "step": 2775 + }, + { + "epoch": 0.30485394245552383, + "grad_norm": 2.1447415351867676, + "learning_rate": 5e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.697218656539917, + "num_tokens": 71966449.0, + "step": 2776 + }, + { + "epoch": 0.3049637601581375, + "grad_norm": 2.2802419662475586, + "learning_rate": 5e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7016375064849854, + "num_tokens": 71986595.0, + "step": 2777 + }, + { + "epoch": 0.3050735778607512, + "grad_norm": 1.9633671045303345, + "learning_rate": 5e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6977115869522095, + "num_tokens": 72015756.0, + "step": 2778 + }, + { + "epoch": 0.3051833955633648, + "grad_norm": 1.958578109741211, + "learning_rate": 5e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.694561243057251, + "num_tokens": 72043444.0, + "step": 2779 + }, + { + "epoch": 0.30529321326597847, + "grad_norm": 1.81065833568573, + "learning_rate": 5e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6776508092880249, + "num_tokens": 72075749.0, + "step": 2780 + }, + { + "epoch": 0.3054030309685921, + "grad_norm": 2.1416313648223877, + "learning_rate": 5e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6826788783073425, + "num_tokens": 72100056.0, + "step": 2781 + }, + { + "epoch": 0.3055128486712058, + "grad_norm": 1.9466609954833984, + "learning_rate": 5e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6914317011833191, + "num_tokens": 72129161.0, + "step": 2782 + }, + { + "epoch": 0.30562266637381946, + "grad_norm": 2.2002358436584473, + "learning_rate": 5e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6936157941818237, + "num_tokens": 72150579.0, + "step": 2783 + }, + { + "epoch": 0.3057324840764331, + "grad_norm": 1.9380711317062378, + "learning_rate": 5e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7025799751281738, + "num_tokens": 72179518.0, + "step": 2784 + }, + { + "epoch": 0.30584230177904675, + "grad_norm": 1.8500251770019531, + "learning_rate": 5e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7003178000450134, + "num_tokens": 72209472.0, + "step": 2785 + }, + { + "epoch": 0.30595211948166046, + "grad_norm": 2.0953686237335205, + "learning_rate": 5e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6917339563369751, + "num_tokens": 72234350.0, + "step": 2786 + }, + { + "epoch": 0.3060619371842741, + "grad_norm": 2.3262603282928467, + "learning_rate": 5e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6878478527069092, + "num_tokens": 72257066.0, + "step": 2787 + }, + { + "epoch": 0.30617175488688775, + "grad_norm": 2.0547728538513184, + "learning_rate": 5e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7074709534645081, + "num_tokens": 72281119.0, + "step": 2788 + }, + { + "epoch": 0.30628157258950145, + "grad_norm": 1.9880118370056152, + "learning_rate": 5e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6910403966903687, + "num_tokens": 72309591.0, + "step": 2789 + }, + { + "epoch": 0.3063913902921151, + "grad_norm": 2.0688631534576416, + "learning_rate": 5e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7038941383361816, + "num_tokens": 72333318.0, + "step": 2790 + }, + { + "epoch": 0.30650120799472874, + "grad_norm": 2.0720739364624023, + "learning_rate": 5e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6922214031219482, + "num_tokens": 72359734.0, + "step": 2791 + }, + { + "epoch": 0.3066110256973424, + "grad_norm": 2.1505396366119385, + "learning_rate": 5e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7113939523696899, + "num_tokens": 72382563.0, + "step": 2792 + }, + { + "epoch": 0.3067208433999561, + "grad_norm": 2.2424166202545166, + "learning_rate": 5e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6956346035003662, + "num_tokens": 72408870.0, + "step": 2793 + }, + { + "epoch": 0.30683066110256974, + "grad_norm": 1.8521347045898438, + "learning_rate": 5e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7076789140701294, + "num_tokens": 72439511.0, + "step": 2794 + }, + { + "epoch": 0.3069404788051834, + "grad_norm": 1.9188029766082764, + "learning_rate": 5e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7008505463600159, + "num_tokens": 72466418.0, + "step": 2795 + }, + { + "epoch": 0.3070502965077971, + "grad_norm": 2.0539772510528564, + "learning_rate": 5e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7062762975692749, + "num_tokens": 72490359.0, + "step": 2796 + }, + { + "epoch": 0.30716011421041073, + "grad_norm": 2.170905590057373, + "learning_rate": 5e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6855249404907227, + "num_tokens": 72515273.0, + "step": 2797 + }, + { + "epoch": 0.3072699319130244, + "grad_norm": 2.04933500289917, + "learning_rate": 5e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6881478428840637, + "num_tokens": 72542627.0, + "step": 2798 + }, + { + "epoch": 0.307379749615638, + "grad_norm": 2.2330336570739746, + "learning_rate": 5e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7006233930587769, + "num_tokens": 72564520.0, + "step": 2799 + }, + { + "epoch": 0.3074895673182517, + "grad_norm": 1.7585853338241577, + "learning_rate": 5e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7159029245376587, + "num_tokens": 72594932.0, + "step": 2800 + }, + { + "epoch": 0.30759938502086537, + "grad_norm": 2.039738178253174, + "learning_rate": 5e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6757740378379822, + "num_tokens": 72621108.0, + "step": 2801 + }, + { + "epoch": 0.307709202723479, + "grad_norm": 1.924028992652893, + "learning_rate": 5e-06, + "loss": 1.0882, + "mean_token_accuracy": 0.6699891686439514, + "num_tokens": 72652083.0, + "step": 2802 + }, + { + "epoch": 0.30781902042609266, + "grad_norm": 1.9589835405349731, + "learning_rate": 5e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7289462089538574, + "num_tokens": 72678660.0, + "step": 2803 + }, + { + "epoch": 0.30792883812870636, + "grad_norm": 2.0057179927825928, + "learning_rate": 5e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7144694328308105, + "num_tokens": 72707856.0, + "step": 2804 + }, + { + "epoch": 0.30803865583132, + "grad_norm": 1.8861377239227295, + "learning_rate": 5e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7065314054489136, + "num_tokens": 72737767.0, + "step": 2805 + }, + { + "epoch": 0.30814847353393365, + "grad_norm": 2.259582042694092, + "learning_rate": 5e-06, + "loss": 0.936, + "mean_token_accuracy": 0.714312732219696, + "num_tokens": 72756754.0, + "step": 2806 + }, + { + "epoch": 0.30825829123654735, + "grad_norm": 2.222099781036377, + "learning_rate": 5e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.698020339012146, + "num_tokens": 72779095.0, + "step": 2807 + }, + { + "epoch": 0.308368108939161, + "grad_norm": 2.113189697265625, + "learning_rate": 5e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7229810953140259, + "num_tokens": 72801120.0, + "step": 2808 + }, + { + "epoch": 0.30847792664177465, + "grad_norm": 1.9751299619674683, + "learning_rate": 5e-06, + "loss": 1.1109, + "mean_token_accuracy": 0.6716301441192627, + "num_tokens": 72832638.0, + "step": 2809 + }, + { + "epoch": 0.3085877443443883, + "grad_norm": 2.0456318855285645, + "learning_rate": 5e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7108715176582336, + "num_tokens": 72857556.0, + "step": 2810 + }, + { + "epoch": 0.308697562047002, + "grad_norm": 2.0712428092956543, + "learning_rate": 5e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7001170516014099, + "num_tokens": 72880383.0, + "step": 2811 + }, + { + "epoch": 0.30880737974961564, + "grad_norm": 2.0408577919006348, + "learning_rate": 5e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.701277494430542, + "num_tokens": 72907579.0, + "step": 2812 + }, + { + "epoch": 0.3089171974522293, + "grad_norm": 2.2633936405181885, + "learning_rate": 5e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6968995928764343, + "num_tokens": 72929897.0, + "step": 2813 + }, + { + "epoch": 0.309027015154843, + "grad_norm": 2.0876305103302, + "learning_rate": 5e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6898032426834106, + "num_tokens": 72955494.0, + "step": 2814 + }, + { + "epoch": 0.30913683285745663, + "grad_norm": 2.2196290493011475, + "learning_rate": 5e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7198885679244995, + "num_tokens": 72977080.0, + "step": 2815 + }, + { + "epoch": 0.3092466505600703, + "grad_norm": 2.158952236175537, + "learning_rate": 5e-06, + "loss": 1.1342, + "mean_token_accuracy": 0.6594858169555664, + "num_tokens": 73005473.0, + "step": 2816 + }, + { + "epoch": 0.3093564682626839, + "grad_norm": 2.157987356185913, + "learning_rate": 5e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.711182713508606, + "num_tokens": 73028427.0, + "step": 2817 + }, + { + "epoch": 0.3094662859652976, + "grad_norm": 2.171410322189331, + "learning_rate": 5e-06, + "loss": 1.097, + "mean_token_accuracy": 0.6681855916976929, + "num_tokens": 73054689.0, + "step": 2818 + }, + { + "epoch": 0.3095761036679113, + "grad_norm": 2.244263172149658, + "learning_rate": 5e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7087308168411255, + "num_tokens": 73079016.0, + "step": 2819 + }, + { + "epoch": 0.3096859213705249, + "grad_norm": 1.942435622215271, + "learning_rate": 5e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7002013921737671, + "num_tokens": 73106021.0, + "step": 2820 + }, + { + "epoch": 0.30979573907313857, + "grad_norm": 1.8754518032073975, + "learning_rate": 5e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6896622180938721, + "num_tokens": 73133981.0, + "step": 2821 + }, + { + "epoch": 0.30990555677575227, + "grad_norm": 2.0981290340423584, + "learning_rate": 5e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6911373138427734, + "num_tokens": 73159759.0, + "step": 2822 + }, + { + "epoch": 0.3100153744783659, + "grad_norm": 2.309598207473755, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7242339849472046, + "num_tokens": 73179656.0, + "step": 2823 + }, + { + "epoch": 0.31012519218097956, + "grad_norm": 2.058396100997925, + "learning_rate": 5e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6957234144210815, + "num_tokens": 73206517.0, + "step": 2824 + }, + { + "epoch": 0.31023500988359326, + "grad_norm": 1.9410170316696167, + "learning_rate": 5e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6815359592437744, + "num_tokens": 73235763.0, + "step": 2825 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 1.9744082689285278, + "learning_rate": 5e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7029067873954773, + "num_tokens": 73260052.0, + "step": 2826 + }, + { + "epoch": 0.31045464528882055, + "grad_norm": 1.9791669845581055, + "learning_rate": 5e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7063237428665161, + "num_tokens": 73286040.0, + "step": 2827 + }, + { + "epoch": 0.3105644629914342, + "grad_norm": 2.312011957168579, + "learning_rate": 5e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6896952390670776, + "num_tokens": 73306261.0, + "step": 2828 + }, + { + "epoch": 0.3106742806940479, + "grad_norm": 2.166964292526245, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7041447162628174, + "num_tokens": 73330513.0, + "step": 2829 + }, + { + "epoch": 0.31078409839666155, + "grad_norm": 2.073331117630005, + "learning_rate": 5e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6961231231689453, + "num_tokens": 73355701.0, + "step": 2830 + }, + { + "epoch": 0.3108939160992752, + "grad_norm": 2.0734076499938965, + "learning_rate": 5e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6917868852615356, + "num_tokens": 73378977.0, + "step": 2831 + }, + { + "epoch": 0.31100373380188884, + "grad_norm": 2.428805351257324, + "learning_rate": 5e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6968400478363037, + "num_tokens": 73397415.0, + "step": 2832 + }, + { + "epoch": 0.31111355150450254, + "grad_norm": 2.016138792037964, + "learning_rate": 5e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7055566310882568, + "num_tokens": 73422832.0, + "step": 2833 + }, + { + "epoch": 0.3112233692071162, + "grad_norm": 2.204094886779785, + "learning_rate": 5e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6755256652832031, + "num_tokens": 73447631.0, + "step": 2834 + }, + { + "epoch": 0.31133318690972983, + "grad_norm": 2.0085201263427734, + "learning_rate": 5e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6915276050567627, + "num_tokens": 73475147.0, + "step": 2835 + }, + { + "epoch": 0.31144300461234353, + "grad_norm": 2.0935957431793213, + "learning_rate": 5e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6965683698654175, + "num_tokens": 73501222.0, + "step": 2836 + }, + { + "epoch": 0.3115528223149572, + "grad_norm": 2.0183990001678467, + "learning_rate": 5e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6788187623023987, + "num_tokens": 73529484.0, + "step": 2837 + }, + { + "epoch": 0.3116626400175708, + "grad_norm": 1.872006893157959, + "learning_rate": 5e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.7003532648086548, + "num_tokens": 73558180.0, + "step": 2838 + }, + { + "epoch": 0.31177245772018447, + "grad_norm": 2.431556224822998, + "learning_rate": 5e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6953798532485962, + "num_tokens": 73577719.0, + "step": 2839 + }, + { + "epoch": 0.31188227542279817, + "grad_norm": 2.1140244007110596, + "learning_rate": 5e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6932173371315002, + "num_tokens": 73600515.0, + "step": 2840 + }, + { + "epoch": 0.3119920931254118, + "grad_norm": 2.14898681640625, + "learning_rate": 5e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7072916030883789, + "num_tokens": 73623333.0, + "step": 2841 + }, + { + "epoch": 0.31210191082802546, + "grad_norm": 1.9027540683746338, + "learning_rate": 5e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.684827983379364, + "num_tokens": 73654756.0, + "step": 2842 + }, + { + "epoch": 0.31221172853063917, + "grad_norm": 1.7761989831924438, + "learning_rate": 5e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7010614275932312, + "num_tokens": 73686866.0, + "step": 2843 + }, + { + "epoch": 0.3123215462332528, + "grad_norm": 2.1051108837127686, + "learning_rate": 5e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7099488973617554, + "num_tokens": 73710579.0, + "step": 2844 + }, + { + "epoch": 0.31243136393586646, + "grad_norm": 2.6462230682373047, + "learning_rate": 5e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7170271873474121, + "num_tokens": 73726719.0, + "step": 2845 + }, + { + "epoch": 0.3125411816384801, + "grad_norm": 2.489745855331421, + "learning_rate": 5e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7203226685523987, + "num_tokens": 73745184.0, + "step": 2846 + }, + { + "epoch": 0.3126509993410938, + "grad_norm": 2.119166612625122, + "learning_rate": 5e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6966376304626465, + "num_tokens": 73769762.0, + "step": 2847 + }, + { + "epoch": 0.31276081704370745, + "grad_norm": 2.1454105377197266, + "learning_rate": 5e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7134361267089844, + "num_tokens": 73793964.0, + "step": 2848 + }, + { + "epoch": 0.3128706347463211, + "grad_norm": 1.896321177482605, + "learning_rate": 5e-06, + "loss": 1.096, + "mean_token_accuracy": 0.6694746017456055, + "num_tokens": 73824261.0, + "step": 2849 + }, + { + "epoch": 0.31298045244893474, + "grad_norm": 2.037715435028076, + "learning_rate": 5e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6851771473884583, + "num_tokens": 73849408.0, + "step": 2850 + }, + { + "epoch": 0.31309027015154844, + "grad_norm": 1.9244357347488403, + "learning_rate": 5e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6934579014778137, + "num_tokens": 73879021.0, + "step": 2851 + }, + { + "epoch": 0.3132000878541621, + "grad_norm": 2.1522908210754395, + "learning_rate": 5e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6959471702575684, + "num_tokens": 73902689.0, + "step": 2852 + }, + { + "epoch": 0.31330990555677574, + "grad_norm": 2.1706559658050537, + "learning_rate": 5e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.699211835861206, + "num_tokens": 73924171.0, + "step": 2853 + }, + { + "epoch": 0.31341972325938944, + "grad_norm": 2.0580332279205322, + "learning_rate": 5e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7040759921073914, + "num_tokens": 73949131.0, + "step": 2854 + }, + { + "epoch": 0.3135295409620031, + "grad_norm": 2.1030895709991455, + "learning_rate": 5e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7095263004302979, + "num_tokens": 73971343.0, + "step": 2855 + }, + { + "epoch": 0.31363935866461673, + "grad_norm": 2.190093994140625, + "learning_rate": 5e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7159067392349243, + "num_tokens": 73991586.0, + "step": 2856 + }, + { + "epoch": 0.3137491763672304, + "grad_norm": 2.030055284500122, + "learning_rate": 5e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.732002317905426, + "num_tokens": 74014612.0, + "step": 2857 + }, + { + "epoch": 0.3138589940698441, + "grad_norm": 2.1869325637817383, + "learning_rate": 5e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7030861973762512, + "num_tokens": 74045204.0, + "step": 2858 + }, + { + "epoch": 0.3139688117724577, + "grad_norm": 2.010850191116333, + "learning_rate": 5e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7227336168289185, + "num_tokens": 74073373.0, + "step": 2859 + }, + { + "epoch": 0.31407862947507137, + "grad_norm": 2.1487791538238525, + "learning_rate": 5e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.69822096824646, + "num_tokens": 74095252.0, + "step": 2860 + }, + { + "epoch": 0.314188447177685, + "grad_norm": 1.9972143173217773, + "learning_rate": 5e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6834968328475952, + "num_tokens": 74122525.0, + "step": 2861 + }, + { + "epoch": 0.3142982648802987, + "grad_norm": 2.06339430809021, + "learning_rate": 5e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6865240335464478, + "num_tokens": 74151352.0, + "step": 2862 + }, + { + "epoch": 0.31440808258291236, + "grad_norm": 2.26603364944458, + "learning_rate": 5e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.706237256526947, + "num_tokens": 74172550.0, + "step": 2863 + }, + { + "epoch": 0.314517900285526, + "grad_norm": 2.1610660552978516, + "learning_rate": 5e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7051132917404175, + "num_tokens": 74195270.0, + "step": 2864 + }, + { + "epoch": 0.3146277179881397, + "grad_norm": 2.1303274631500244, + "learning_rate": 5e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6896318197250366, + "num_tokens": 74217611.0, + "step": 2865 + }, + { + "epoch": 0.31473753569075336, + "grad_norm": 1.9785305261611938, + "learning_rate": 5e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6951152086257935, + "num_tokens": 74244050.0, + "step": 2866 + }, + { + "epoch": 0.314847353393367, + "grad_norm": 1.9757318496704102, + "learning_rate": 5e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6879897117614746, + "num_tokens": 74271077.0, + "step": 2867 + }, + { + "epoch": 0.31495717109598065, + "grad_norm": 2.227457284927368, + "learning_rate": 5e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.716903567314148, + "num_tokens": 74293158.0, + "step": 2868 + }, + { + "epoch": 0.31506698879859435, + "grad_norm": 2.274559497833252, + "learning_rate": 5e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7117809653282166, + "num_tokens": 74313878.0, + "step": 2869 + }, + { + "epoch": 0.315176806501208, + "grad_norm": 1.9623165130615234, + "learning_rate": 5e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.686822772026062, + "num_tokens": 74341211.0, + "step": 2870 + }, + { + "epoch": 0.31528662420382164, + "grad_norm": 2.070784091949463, + "learning_rate": 5e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7092235088348389, + "num_tokens": 74366432.0, + "step": 2871 + }, + { + "epoch": 0.31539644190643534, + "grad_norm": 1.740162968635559, + "learning_rate": 5e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7035980224609375, + "num_tokens": 74399023.0, + "step": 2872 + }, + { + "epoch": 0.315506259609049, + "grad_norm": 2.3924977779388428, + "learning_rate": 5e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6953575611114502, + "num_tokens": 74420461.0, + "step": 2873 + }, + { + "epoch": 0.31561607731166264, + "grad_norm": 1.984116554260254, + "learning_rate": 5e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.704474151134491, + "num_tokens": 74445429.0, + "step": 2874 + }, + { + "epoch": 0.3157258950142763, + "grad_norm": 1.9310764074325562, + "learning_rate": 5e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6878618597984314, + "num_tokens": 74474804.0, + "step": 2875 + }, + { + "epoch": 0.31583571271689, + "grad_norm": 1.812150478363037, + "learning_rate": 5e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6975721120834351, + "num_tokens": 74506540.0, + "step": 2876 + }, + { + "epoch": 0.31594553041950363, + "grad_norm": 2.2914814949035645, + "learning_rate": 5e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6959494948387146, + "num_tokens": 74527586.0, + "step": 2877 + }, + { + "epoch": 0.3160553481221173, + "grad_norm": 1.90108323097229, + "learning_rate": 5e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7054773569107056, + "num_tokens": 74555269.0, + "step": 2878 + }, + { + "epoch": 0.3161651658247309, + "grad_norm": 2.1054279804229736, + "learning_rate": 5e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.6730340719223022, + "num_tokens": 74580649.0, + "step": 2879 + }, + { + "epoch": 0.3162749835273446, + "grad_norm": 2.109100103378296, + "learning_rate": 5e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7019761204719543, + "num_tokens": 74603802.0, + "step": 2880 + }, + { + "epoch": 0.31638480122995827, + "grad_norm": 2.036306381225586, + "learning_rate": 5e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7009263038635254, + "num_tokens": 74629339.0, + "step": 2881 + }, + { + "epoch": 0.3164946189325719, + "grad_norm": 2.015637159347534, + "learning_rate": 5e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7005278468132019, + "num_tokens": 74656737.0, + "step": 2882 + }, + { + "epoch": 0.3166044366351856, + "grad_norm": 2.249614953994751, + "learning_rate": 5e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.7032499313354492, + "num_tokens": 74682167.0, + "step": 2883 + }, + { + "epoch": 0.31671425433779926, + "grad_norm": 2.4470651149749756, + "learning_rate": 5e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6918632984161377, + "num_tokens": 74701912.0, + "step": 2884 + }, + { + "epoch": 0.3168240720404129, + "grad_norm": 1.941527247428894, + "learning_rate": 5e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6964839696884155, + "num_tokens": 74730817.0, + "step": 2885 + }, + { + "epoch": 0.31693388974302655, + "grad_norm": 2.0590548515319824, + "learning_rate": 5e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6963874101638794, + "num_tokens": 74757888.0, + "step": 2886 + }, + { + "epoch": 0.31704370744564025, + "grad_norm": 1.9321500062942505, + "learning_rate": 5e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6932460069656372, + "num_tokens": 74786151.0, + "step": 2887 + }, + { + "epoch": 0.3171535251482539, + "grad_norm": 1.9808083772659302, + "learning_rate": 5e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6879719495773315, + "num_tokens": 74812546.0, + "step": 2888 + }, + { + "epoch": 0.31726334285086755, + "grad_norm": 2.1710822582244873, + "learning_rate": 5e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6776825189590454, + "num_tokens": 74835248.0, + "step": 2889 + }, + { + "epoch": 0.31737316055348125, + "grad_norm": 1.9095721244812012, + "learning_rate": 5e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7128461599349976, + "num_tokens": 74862099.0, + "step": 2890 + }, + { + "epoch": 0.3174829782560949, + "grad_norm": 2.0401694774627686, + "learning_rate": 5e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.692007303237915, + "num_tokens": 74890262.0, + "step": 2891 + }, + { + "epoch": 0.31759279595870854, + "grad_norm": 2.0202105045318604, + "learning_rate": 5e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.694432258605957, + "num_tokens": 74917136.0, + "step": 2892 + }, + { + "epoch": 0.3177026136613222, + "grad_norm": 1.9397542476654053, + "learning_rate": 5e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7033051252365112, + "num_tokens": 74947237.0, + "step": 2893 + }, + { + "epoch": 0.3178124313639359, + "grad_norm": 2.187293291091919, + "learning_rate": 5e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6947544813156128, + "num_tokens": 74971751.0, + "step": 2894 + }, + { + "epoch": 0.31792224906654953, + "grad_norm": 2.1170105934143066, + "learning_rate": 5e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6988113522529602, + "num_tokens": 74996514.0, + "step": 2895 + }, + { + "epoch": 0.3180320667691632, + "grad_norm": 1.935502529144287, + "learning_rate": 5e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7088044285774231, + "num_tokens": 75026083.0, + "step": 2896 + }, + { + "epoch": 0.3181418844717768, + "grad_norm": 2.3343849182128906, + "learning_rate": 5e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6931483745574951, + "num_tokens": 75047733.0, + "step": 2897 + }, + { + "epoch": 0.3182517021743905, + "grad_norm": 2.056466579437256, + "learning_rate": 5e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.6969183683395386, + "num_tokens": 75074584.0, + "step": 2898 + }, + { + "epoch": 0.3183615198770042, + "grad_norm": 2.334717035293579, + "learning_rate": 5e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7079635262489319, + "num_tokens": 75093033.0, + "step": 2899 + }, + { + "epoch": 0.3184713375796178, + "grad_norm": 2.137115001678467, + "learning_rate": 5e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6844269037246704, + "num_tokens": 75116007.0, + "step": 2900 + }, + { + "epoch": 0.3185811552822315, + "grad_norm": 2.023392677307129, + "learning_rate": 5e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6990870237350464, + "num_tokens": 75144754.0, + "step": 2901 + }, + { + "epoch": 0.31869097298484517, + "grad_norm": 2.1269500255584717, + "learning_rate": 5e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7081732749938965, + "num_tokens": 75166440.0, + "step": 2902 + }, + { + "epoch": 0.3188007906874588, + "grad_norm": 1.787739634513855, + "learning_rate": 5e-06, + "loss": 1.0774, + "mean_token_accuracy": 0.6833688020706177, + "num_tokens": 75199804.0, + "step": 2903 + }, + { + "epoch": 0.31891060839007246, + "grad_norm": 2.041435480117798, + "learning_rate": 5e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6808979511260986, + "num_tokens": 75226573.0, + "step": 2904 + }, + { + "epoch": 0.31902042609268616, + "grad_norm": 2.0824520587921143, + "learning_rate": 5e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7120540142059326, + "num_tokens": 75250076.0, + "step": 2905 + }, + { + "epoch": 0.3191302437952998, + "grad_norm": 1.9289958477020264, + "learning_rate": 5e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6972765922546387, + "num_tokens": 75278022.0, + "step": 2906 + }, + { + "epoch": 0.31924006149791345, + "grad_norm": 2.0903825759887695, + "learning_rate": 5e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6848423480987549, + "num_tokens": 75304396.0, + "step": 2907 + }, + { + "epoch": 0.3193498792005271, + "grad_norm": 2.1081807613372803, + "learning_rate": 5e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6875344514846802, + "num_tokens": 75328878.0, + "step": 2908 + }, + { + "epoch": 0.3194596969031408, + "grad_norm": 2.0520031452178955, + "learning_rate": 5e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7098870873451233, + "num_tokens": 75355308.0, + "step": 2909 + }, + { + "epoch": 0.31956951460575445, + "grad_norm": 2.0801360607147217, + "learning_rate": 5e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7071989178657532, + "num_tokens": 75377960.0, + "step": 2910 + }, + { + "epoch": 0.3196793323083681, + "grad_norm": 2.0738539695739746, + "learning_rate": 5e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6847747564315796, + "num_tokens": 75405349.0, + "step": 2911 + }, + { + "epoch": 0.3197891500109818, + "grad_norm": 2.3529131412506104, + "learning_rate": 5e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7042815685272217, + "num_tokens": 75426052.0, + "step": 2912 + }, + { + "epoch": 0.31989896771359544, + "grad_norm": 2.0410349369049072, + "learning_rate": 5e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6928407549858093, + "num_tokens": 75453377.0, + "step": 2913 + }, + { + "epoch": 0.3200087854162091, + "grad_norm": 2.042926073074341, + "learning_rate": 5e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.702100396156311, + "num_tokens": 75475055.0, + "step": 2914 + }, + { + "epoch": 0.32011860311882273, + "grad_norm": 2.1209328174591064, + "learning_rate": 5e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6930415630340576, + "num_tokens": 75501136.0, + "step": 2915 + }, + { + "epoch": 0.32022842082143643, + "grad_norm": 2.132328987121582, + "learning_rate": 5e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6862585544586182, + "num_tokens": 75527200.0, + "step": 2916 + }, + { + "epoch": 0.3203382385240501, + "grad_norm": 1.8791131973266602, + "learning_rate": 5e-06, + "loss": 1.0972, + "mean_token_accuracy": 0.6783168911933899, + "num_tokens": 75560704.0, + "step": 2917 + }, + { + "epoch": 0.3204480562266637, + "grad_norm": 1.9283636808395386, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7033461332321167, + "num_tokens": 75586282.0, + "step": 2918 + }, + { + "epoch": 0.3205578739292774, + "grad_norm": 2.1475613117218018, + "learning_rate": 5e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7147846817970276, + "num_tokens": 75608152.0, + "step": 2919 + }, + { + "epoch": 0.32066769163189107, + "grad_norm": 2.24582839012146, + "learning_rate": 5e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6983165144920349, + "num_tokens": 75629472.0, + "step": 2920 + }, + { + "epoch": 0.3207775093345047, + "grad_norm": 2.1374547481536865, + "learning_rate": 5e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7147667407989502, + "num_tokens": 75652133.0, + "step": 2921 + }, + { + "epoch": 0.32088732703711836, + "grad_norm": 2.2259161472320557, + "learning_rate": 5e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7152583599090576, + "num_tokens": 75670986.0, + "step": 2922 + }, + { + "epoch": 0.32099714473973207, + "grad_norm": 2.1172449588775635, + "learning_rate": 5e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.716454267501831, + "num_tokens": 75693394.0, + "step": 2923 + }, + { + "epoch": 0.3211069624423457, + "grad_norm": 1.7160841226577759, + "learning_rate": 5e-06, + "loss": 1.0808, + "mean_token_accuracy": 0.6790403127670288, + "num_tokens": 75727591.0, + "step": 2924 + }, + { + "epoch": 0.32121678014495936, + "grad_norm": 2.054084300994873, + "learning_rate": 5e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6874452829360962, + "num_tokens": 75752324.0, + "step": 2925 + }, + { + "epoch": 0.321326597847573, + "grad_norm": 2.0678200721740723, + "learning_rate": 5e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7250480651855469, + "num_tokens": 75776719.0, + "step": 2926 + }, + { + "epoch": 0.3214364155501867, + "grad_norm": 2.1357412338256836, + "learning_rate": 5e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6865161657333374, + "num_tokens": 75802811.0, + "step": 2927 + }, + { + "epoch": 0.32154623325280035, + "grad_norm": 2.0433499813079834, + "learning_rate": 5e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6877079010009766, + "num_tokens": 75830225.0, + "step": 2928 + }, + { + "epoch": 0.321656050955414, + "grad_norm": 2.230114698410034, + "learning_rate": 5e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7052513957023621, + "num_tokens": 75851625.0, + "step": 2929 + }, + { + "epoch": 0.3217658686580277, + "grad_norm": 1.6623541116714478, + "learning_rate": 5e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6925378441810608, + "num_tokens": 75885634.0, + "step": 2930 + }, + { + "epoch": 0.32187568636064134, + "grad_norm": 1.9454679489135742, + "learning_rate": 5e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6781334280967712, + "num_tokens": 75919172.0, + "step": 2931 + }, + { + "epoch": 0.321985504063255, + "grad_norm": 1.7146642208099365, + "learning_rate": 5e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7198927402496338, + "num_tokens": 75950171.0, + "step": 2932 + }, + { + "epoch": 0.32209532176586864, + "grad_norm": 2.3863742351531982, + "learning_rate": 5e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6857104897499084, + "num_tokens": 75969651.0, + "step": 2933 + }, + { + "epoch": 0.32220513946848234, + "grad_norm": 1.9630722999572754, + "learning_rate": 5e-06, + "loss": 1.1243, + "mean_token_accuracy": 0.6732491850852966, + "num_tokens": 75999311.0, + "step": 2934 + }, + { + "epoch": 0.322314957171096, + "grad_norm": 1.9617996215820312, + "learning_rate": 5e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7093172073364258, + "num_tokens": 76027863.0, + "step": 2935 + }, + { + "epoch": 0.32242477487370963, + "grad_norm": 2.4000253677368164, + "learning_rate": 5e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7090427279472351, + "num_tokens": 76046841.0, + "step": 2936 + }, + { + "epoch": 0.3225345925763233, + "grad_norm": 2.1796927452087402, + "learning_rate": 5e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7098284363746643, + "num_tokens": 76069303.0, + "step": 2937 + }, + { + "epoch": 0.322644410278937, + "grad_norm": 1.8179157972335815, + "learning_rate": 5e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6967235207557678, + "num_tokens": 76100429.0, + "step": 2938 + }, + { + "epoch": 0.3227542279815506, + "grad_norm": 2.214958906173706, + "learning_rate": 5e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6954758167266846, + "num_tokens": 76122008.0, + "step": 2939 + }, + { + "epoch": 0.32286404568416427, + "grad_norm": 2.5566587448120117, + "learning_rate": 5e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6997344493865967, + "num_tokens": 76140149.0, + "step": 2940 + }, + { + "epoch": 0.32297386338677797, + "grad_norm": 2.472378730773926, + "learning_rate": 5e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7040712833404541, + "num_tokens": 76159645.0, + "step": 2941 + }, + { + "epoch": 0.3230836810893916, + "grad_norm": 2.0606188774108887, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.714989960193634, + "num_tokens": 76185239.0, + "step": 2942 + }, + { + "epoch": 0.32319349879200526, + "grad_norm": 1.9278432130813599, + "learning_rate": 5e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6845169067382812, + "num_tokens": 76213225.0, + "step": 2943 + }, + { + "epoch": 0.3233033164946189, + "grad_norm": 1.9682427644729614, + "learning_rate": 5e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.70128333568573, + "num_tokens": 76241940.0, + "step": 2944 + }, + { + "epoch": 0.3234131341972326, + "grad_norm": 2.171071767807007, + "learning_rate": 5e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7011935710906982, + "num_tokens": 76265208.0, + "step": 2945 + }, + { + "epoch": 0.32352295189984626, + "grad_norm": 2.341780424118042, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.722201943397522, + "num_tokens": 76283332.0, + "step": 2946 + }, + { + "epoch": 0.3236327696024599, + "grad_norm": 1.920198917388916, + "learning_rate": 5e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6927410364151001, + "num_tokens": 76313876.0, + "step": 2947 + }, + { + "epoch": 0.3237425873050736, + "grad_norm": 1.9706182479858398, + "learning_rate": 5e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7032658457756042, + "num_tokens": 76342113.0, + "step": 2948 + }, + { + "epoch": 0.32385240500768725, + "grad_norm": 2.137599468231201, + "learning_rate": 5e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6997476816177368, + "num_tokens": 76367164.0, + "step": 2949 + }, + { + "epoch": 0.3239622227103009, + "grad_norm": 2.131079912185669, + "learning_rate": 5e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.6998468637466431, + "num_tokens": 76391837.0, + "step": 2950 + }, + { + "epoch": 0.32407204041291454, + "grad_norm": 2.048462390899658, + "learning_rate": 5e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6976684331893921, + "num_tokens": 76416582.0, + "step": 2951 + }, + { + "epoch": 0.32418185811552824, + "grad_norm": 2.0863254070281982, + "learning_rate": 5e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.6970560550689697, + "num_tokens": 76440255.0, + "step": 2952 + }, + { + "epoch": 0.3242916758181419, + "grad_norm": 1.982649803161621, + "learning_rate": 5e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7041720747947693, + "num_tokens": 76466860.0, + "step": 2953 + }, + { + "epoch": 0.32440149352075554, + "grad_norm": 1.9218432903289795, + "learning_rate": 5e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6872609257698059, + "num_tokens": 76496480.0, + "step": 2954 + }, + { + "epoch": 0.3245113112233692, + "grad_norm": 1.98908269405365, + "learning_rate": 5e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7061330080032349, + "num_tokens": 76524629.0, + "step": 2955 + }, + { + "epoch": 0.3246211289259829, + "grad_norm": 1.9892345666885376, + "learning_rate": 5e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7043246626853943, + "num_tokens": 76550773.0, + "step": 2956 + }, + { + "epoch": 0.32473094662859653, + "grad_norm": 1.8105627298355103, + "learning_rate": 5e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6893526315689087, + "num_tokens": 76583842.0, + "step": 2957 + }, + { + "epoch": 0.3248407643312102, + "grad_norm": 2.0484845638275146, + "learning_rate": 5e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6884180307388306, + "num_tokens": 76610315.0, + "step": 2958 + }, + { + "epoch": 0.3249505820338239, + "grad_norm": 2.0860390663146973, + "learning_rate": 5e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6967348456382751, + "num_tokens": 76633985.0, + "step": 2959 + }, + { + "epoch": 0.3250603997364375, + "grad_norm": 1.9529271125793457, + "learning_rate": 5e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7090611457824707, + "num_tokens": 76663142.0, + "step": 2960 + }, + { + "epoch": 0.32517021743905117, + "grad_norm": 1.9348875284194946, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7017127275466919, + "num_tokens": 76690727.0, + "step": 2961 + }, + { + "epoch": 0.3252800351416648, + "grad_norm": 1.8617136478424072, + "learning_rate": 5e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6990179419517517, + "num_tokens": 76720340.0, + "step": 2962 + }, + { + "epoch": 0.3253898528442785, + "grad_norm": 1.9781140089035034, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7208679914474487, + "num_tokens": 76744443.0, + "step": 2963 + }, + { + "epoch": 0.32549967054689216, + "grad_norm": 1.860992431640625, + "learning_rate": 5e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7011570334434509, + "num_tokens": 76774234.0, + "step": 2964 + }, + { + "epoch": 0.3256094882495058, + "grad_norm": 2.0921664237976074, + "learning_rate": 5e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6966054439544678, + "num_tokens": 76801142.0, + "step": 2965 + }, + { + "epoch": 0.3257193059521195, + "grad_norm": 1.993442416191101, + "learning_rate": 5e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7215927243232727, + "num_tokens": 76826016.0, + "step": 2966 + }, + { + "epoch": 0.32582912365473315, + "grad_norm": 2.106027603149414, + "learning_rate": 5e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.705083966255188, + "num_tokens": 76848548.0, + "step": 2967 + }, + { + "epoch": 0.3259389413573468, + "grad_norm": 2.1717050075531006, + "learning_rate": 5e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6961698532104492, + "num_tokens": 76870506.0, + "step": 2968 + }, + { + "epoch": 0.32604875905996045, + "grad_norm": 1.853515625, + "learning_rate": 5e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7171566486358643, + "num_tokens": 76899009.0, + "step": 2969 + }, + { + "epoch": 0.32615857676257415, + "grad_norm": 1.9275990724563599, + "learning_rate": 5e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6859673261642456, + "num_tokens": 76930063.0, + "step": 2970 + }, + { + "epoch": 0.3262683944651878, + "grad_norm": 2.0202198028564453, + "learning_rate": 5e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7024638056755066, + "num_tokens": 76956461.0, + "step": 2971 + }, + { + "epoch": 0.32637821216780144, + "grad_norm": 2.003563165664673, + "learning_rate": 5e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6881749629974365, + "num_tokens": 76987926.0, + "step": 2972 + }, + { + "epoch": 0.3264880298704151, + "grad_norm": 1.875317096710205, + "learning_rate": 5e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7067768573760986, + "num_tokens": 77019560.0, + "step": 2973 + }, + { + "epoch": 0.3265978475730288, + "grad_norm": 2.0206005573272705, + "learning_rate": 5e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7167313098907471, + "num_tokens": 77044825.0, + "step": 2974 + }, + { + "epoch": 0.32670766527564243, + "grad_norm": 2.0168960094451904, + "learning_rate": 5e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.675528883934021, + "num_tokens": 77071494.0, + "step": 2975 + }, + { + "epoch": 0.3268174829782561, + "grad_norm": 1.7754284143447876, + "learning_rate": 5e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6844667196273804, + "num_tokens": 77102129.0, + "step": 2976 + }, + { + "epoch": 0.3269273006808698, + "grad_norm": 1.9710640907287598, + "learning_rate": 5e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7203586101531982, + "num_tokens": 77126725.0, + "step": 2977 + }, + { + "epoch": 0.3270371183834834, + "grad_norm": 1.983881950378418, + "learning_rate": 5e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7198202013969421, + "num_tokens": 77152923.0, + "step": 2978 + }, + { + "epoch": 0.3271469360860971, + "grad_norm": 2.023608446121216, + "learning_rate": 5e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6898611187934875, + "num_tokens": 77177682.0, + "step": 2979 + }, + { + "epoch": 0.3272567537887107, + "grad_norm": 1.8669849634170532, + "learning_rate": 5e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.707970142364502, + "num_tokens": 77210274.0, + "step": 2980 + }, + { + "epoch": 0.3273665714913244, + "grad_norm": 1.9232254028320312, + "learning_rate": 5e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6762694120407104, + "num_tokens": 77239835.0, + "step": 2981 + }, + { + "epoch": 0.32747638919393807, + "grad_norm": 2.0388951301574707, + "learning_rate": 5e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7101924419403076, + "num_tokens": 77262847.0, + "step": 2982 + }, + { + "epoch": 0.3275862068965517, + "grad_norm": 1.9914302825927734, + "learning_rate": 5e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6892659068107605, + "num_tokens": 77288384.0, + "step": 2983 + }, + { + "epoch": 0.32769602459916536, + "grad_norm": 1.977233648300171, + "learning_rate": 5e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.687861442565918, + "num_tokens": 77315963.0, + "step": 2984 + }, + { + "epoch": 0.32780584230177906, + "grad_norm": 2.106487989425659, + "learning_rate": 5e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7020046710968018, + "num_tokens": 77341337.0, + "step": 2985 + }, + { + "epoch": 0.3279156600043927, + "grad_norm": 2.206373691558838, + "learning_rate": 5e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6886037588119507, + "num_tokens": 77364315.0, + "step": 2986 + }, + { + "epoch": 0.32802547770700635, + "grad_norm": 2.287341833114624, + "learning_rate": 5e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6963955760002136, + "num_tokens": 77385083.0, + "step": 2987 + }, + { + "epoch": 0.32813529540962005, + "grad_norm": 1.9046635627746582, + "learning_rate": 5e-06, + "loss": 1.0812, + "mean_token_accuracy": 0.6793475151062012, + "num_tokens": 77417141.0, + "step": 2988 + }, + { + "epoch": 0.3282451131122337, + "grad_norm": 2.345090866088867, + "learning_rate": 5e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6787746548652649, + "num_tokens": 77440292.0, + "step": 2989 + }, + { + "epoch": 0.32835493081484735, + "grad_norm": 2.1014699935913086, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.719803512096405, + "num_tokens": 77462849.0, + "step": 2990 + }, + { + "epoch": 0.328464748517461, + "grad_norm": 2.1458795070648193, + "learning_rate": 5e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6842148303985596, + "num_tokens": 77486378.0, + "step": 2991 + }, + { + "epoch": 0.3285745662200747, + "grad_norm": 2.1600639820098877, + "learning_rate": 5e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7071108818054199, + "num_tokens": 77508487.0, + "step": 2992 + }, + { + "epoch": 0.32868438392268834, + "grad_norm": 2.09669828414917, + "learning_rate": 5e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7012919187545776, + "num_tokens": 77533541.0, + "step": 2993 + }, + { + "epoch": 0.328794201625302, + "grad_norm": 2.2205042839050293, + "learning_rate": 5e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6897228956222534, + "num_tokens": 77559058.0, + "step": 2994 + }, + { + "epoch": 0.3289040193279157, + "grad_norm": 2.1809639930725098, + "learning_rate": 5e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7127972841262817, + "num_tokens": 77582857.0, + "step": 2995 + }, + { + "epoch": 0.32901383703052933, + "grad_norm": 2.2831077575683594, + "learning_rate": 5e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7074057459831238, + "num_tokens": 77603605.0, + "step": 2996 + }, + { + "epoch": 0.329123654733143, + "grad_norm": 2.3442091941833496, + "learning_rate": 5e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7062262296676636, + "num_tokens": 77624364.0, + "step": 2997 + }, + { + "epoch": 0.3292334724357566, + "grad_norm": 1.9632762670516968, + "learning_rate": 5e-06, + "loss": 0.995, + "mean_token_accuracy": 0.69927978515625, + "num_tokens": 77652659.0, + "step": 2998 + }, + { + "epoch": 0.3293432901383703, + "grad_norm": 2.0479912757873535, + "learning_rate": 5e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.713168740272522, + "num_tokens": 77675840.0, + "step": 2999 + }, + { + "epoch": 0.32945310784098397, + "grad_norm": 2.0340566635131836, + "learning_rate": 5e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6955685615539551, + "num_tokens": 77702662.0, + "step": 3000 + }, + { + "epoch": 0.3295629255435976, + "grad_norm": 1.9917783737182617, + "learning_rate": 5e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.70017409324646, + "num_tokens": 77727733.0, + "step": 3001 + }, + { + "epoch": 0.32967274324621126, + "grad_norm": 1.708168387413025, + "learning_rate": 5e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6836296916007996, + "num_tokens": 77761130.0, + "step": 3002 + }, + { + "epoch": 0.32978256094882497, + "grad_norm": 1.940534234046936, + "learning_rate": 5e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.6979870200157166, + "num_tokens": 77786347.0, + "step": 3003 + }, + { + "epoch": 0.3298923786514386, + "grad_norm": 2.2772974967956543, + "learning_rate": 5e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7139703035354614, + "num_tokens": 77806808.0, + "step": 3004 + }, + { + "epoch": 0.33000219635405226, + "grad_norm": 1.965147852897644, + "learning_rate": 5e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.718321681022644, + "num_tokens": 77832859.0, + "step": 3005 + }, + { + "epoch": 0.33011201405666596, + "grad_norm": 2.2284624576568604, + "learning_rate": 5e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7041038274765015, + "num_tokens": 77854219.0, + "step": 3006 + }, + { + "epoch": 0.3302218317592796, + "grad_norm": 2.012439489364624, + "learning_rate": 5e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6928681135177612, + "num_tokens": 77881452.0, + "step": 3007 + }, + { + "epoch": 0.33033164946189325, + "grad_norm": 1.9963780641555786, + "learning_rate": 5e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6929886341094971, + "num_tokens": 77908351.0, + "step": 3008 + }, + { + "epoch": 0.3304414671645069, + "grad_norm": 2.0282135009765625, + "learning_rate": 5e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7061795592308044, + "num_tokens": 77932426.0, + "step": 3009 + }, + { + "epoch": 0.3305512848671206, + "grad_norm": 1.9660675525665283, + "learning_rate": 5e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7058727741241455, + "num_tokens": 77959758.0, + "step": 3010 + }, + { + "epoch": 0.33066110256973424, + "grad_norm": 1.9214918613433838, + "learning_rate": 5e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6987917423248291, + "num_tokens": 77989905.0, + "step": 3011 + }, + { + "epoch": 0.3307709202723479, + "grad_norm": 1.9738699197769165, + "learning_rate": 5e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7071008086204529, + "num_tokens": 78019097.0, + "step": 3012 + }, + { + "epoch": 0.33088073797496154, + "grad_norm": 1.9404891729354858, + "learning_rate": 5e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6740617752075195, + "num_tokens": 78048466.0, + "step": 3013 + }, + { + "epoch": 0.33099055567757524, + "grad_norm": 2.138648509979248, + "learning_rate": 5e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7168971300125122, + "num_tokens": 78070702.0, + "step": 3014 + }, + { + "epoch": 0.3311003733801889, + "grad_norm": 2.0629143714904785, + "learning_rate": 5e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7072535753250122, + "num_tokens": 78096606.0, + "step": 3015 + }, + { + "epoch": 0.33121019108280253, + "grad_norm": 1.8966045379638672, + "learning_rate": 5e-06, + "loss": 1.0881, + "mean_token_accuracy": 0.6759379506111145, + "num_tokens": 78127697.0, + "step": 3016 + }, + { + "epoch": 0.33132000878541623, + "grad_norm": 1.977825403213501, + "learning_rate": 5e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6875180602073669, + "num_tokens": 78153754.0, + "step": 3017 + }, + { + "epoch": 0.3314298264880299, + "grad_norm": 2.113102912902832, + "learning_rate": 5e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6914114356040955, + "num_tokens": 78176342.0, + "step": 3018 + }, + { + "epoch": 0.3315396441906435, + "grad_norm": 2.4610657691955566, + "learning_rate": 5e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7141876816749573, + "num_tokens": 78193240.0, + "step": 3019 + }, + { + "epoch": 0.33164946189325717, + "grad_norm": 1.971684455871582, + "learning_rate": 5e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7118625640869141, + "num_tokens": 78222619.0, + "step": 3020 + }, + { + "epoch": 0.33175927959587087, + "grad_norm": 2.3113386631011963, + "learning_rate": 5e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7229876518249512, + "num_tokens": 78243455.0, + "step": 3021 + }, + { + "epoch": 0.3318690972984845, + "grad_norm": 2.094344139099121, + "learning_rate": 5e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6960291862487793, + "num_tokens": 78271762.0, + "step": 3022 + }, + { + "epoch": 0.33197891500109816, + "grad_norm": 2.129124402999878, + "learning_rate": 5e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7123432159423828, + "num_tokens": 78294282.0, + "step": 3023 + }, + { + "epoch": 0.33208873270371186, + "grad_norm": 1.967758297920227, + "learning_rate": 5e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7030932903289795, + "num_tokens": 78320845.0, + "step": 3024 + }, + { + "epoch": 0.3321985504063255, + "grad_norm": 2.3771297931671143, + "learning_rate": 5e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6980082988739014, + "num_tokens": 78341231.0, + "step": 3025 + }, + { + "epoch": 0.33230836810893916, + "grad_norm": 2.099048137664795, + "learning_rate": 5e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7055647969245911, + "num_tokens": 78366647.0, + "step": 3026 + }, + { + "epoch": 0.3324181858115528, + "grad_norm": 1.878610372543335, + "learning_rate": 5e-06, + "loss": 1.1267, + "mean_token_accuracy": 0.6709401607513428, + "num_tokens": 78399824.0, + "step": 3027 + }, + { + "epoch": 0.3325280035141665, + "grad_norm": 2.0034964084625244, + "learning_rate": 5e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.6974280476570129, + "num_tokens": 78425197.0, + "step": 3028 + }, + { + "epoch": 0.33263782121678015, + "grad_norm": 2.1434390544891357, + "learning_rate": 5e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.719951868057251, + "num_tokens": 78447209.0, + "step": 3029 + }, + { + "epoch": 0.3327476389193938, + "grad_norm": 1.899158000946045, + "learning_rate": 5e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6964843273162842, + "num_tokens": 78476487.0, + "step": 3030 + }, + { + "epoch": 0.33285745662200744, + "grad_norm": 2.000962257385254, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7082191705703735, + "num_tokens": 78500530.0, + "step": 3031 + }, + { + "epoch": 0.33296727432462114, + "grad_norm": 1.896685242652893, + "learning_rate": 5e-06, + "loss": 0.988, + "mean_token_accuracy": 0.6999586820602417, + "num_tokens": 78530859.0, + "step": 3032 + }, + { + "epoch": 0.3330770920272348, + "grad_norm": 2.2416112422943115, + "learning_rate": 5e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7098875641822815, + "num_tokens": 78551452.0, + "step": 3033 + }, + { + "epoch": 0.33318690972984844, + "grad_norm": 2.0175223350524902, + "learning_rate": 5e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7214109897613525, + "num_tokens": 78576571.0, + "step": 3034 + }, + { + "epoch": 0.33329672743246214, + "grad_norm": 1.9913451671600342, + "learning_rate": 5e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6909128427505493, + "num_tokens": 78604748.0, + "step": 3035 + }, + { + "epoch": 0.3334065451350758, + "grad_norm": 1.9999303817749023, + "learning_rate": 5e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7060118317604065, + "num_tokens": 78631869.0, + "step": 3036 + }, + { + "epoch": 0.33351636283768943, + "grad_norm": 2.210597276687622, + "learning_rate": 5e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7019420266151428, + "num_tokens": 78654070.0, + "step": 3037 + }, + { + "epoch": 0.3336261805403031, + "grad_norm": 2.1781980991363525, + "learning_rate": 5e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7063653469085693, + "num_tokens": 78677083.0, + "step": 3038 + }, + { + "epoch": 0.3337359982429168, + "grad_norm": 1.829604983329773, + "learning_rate": 5e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6985899209976196, + "num_tokens": 78710680.0, + "step": 3039 + }, + { + "epoch": 0.3338458159455304, + "grad_norm": 2.2686610221862793, + "learning_rate": 5e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7041468620300293, + "num_tokens": 78732383.0, + "step": 3040 + }, + { + "epoch": 0.33395563364814407, + "grad_norm": 1.9086642265319824, + "learning_rate": 5e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6907551288604736, + "num_tokens": 78762226.0, + "step": 3041 + }, + { + "epoch": 0.33406545135075777, + "grad_norm": 1.8941371440887451, + "learning_rate": 5e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7027274370193481, + "num_tokens": 78790477.0, + "step": 3042 + }, + { + "epoch": 0.3341752690533714, + "grad_norm": 2.047020196914673, + "learning_rate": 5e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7058392763137817, + "num_tokens": 78814468.0, + "step": 3043 + }, + { + "epoch": 0.33428508675598506, + "grad_norm": 2.2170045375823975, + "learning_rate": 5e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7163784503936768, + "num_tokens": 78834413.0, + "step": 3044 + }, + { + "epoch": 0.3343949044585987, + "grad_norm": 1.830954909324646, + "learning_rate": 5e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6884559392929077, + "num_tokens": 78863667.0, + "step": 3045 + }, + { + "epoch": 0.3345047221612124, + "grad_norm": 2.1762478351593018, + "learning_rate": 5e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7046538591384888, + "num_tokens": 78885511.0, + "step": 3046 + }, + { + "epoch": 0.33461453986382605, + "grad_norm": 2.13704252243042, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7070443630218506, + "num_tokens": 78908425.0, + "step": 3047 + }, + { + "epoch": 0.3347243575664397, + "grad_norm": 1.9818843603134155, + "learning_rate": 5e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6974653005599976, + "num_tokens": 78935010.0, + "step": 3048 + }, + { + "epoch": 0.33483417526905335, + "grad_norm": 1.9484542608261108, + "learning_rate": 5e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6962175369262695, + "num_tokens": 78961379.0, + "step": 3049 + }, + { + "epoch": 0.33494399297166705, + "grad_norm": 2.0829718112945557, + "learning_rate": 5e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6955878138542175, + "num_tokens": 78985192.0, + "step": 3050 + }, + { + "epoch": 0.3350538106742807, + "grad_norm": 2.4009580612182617, + "learning_rate": 5e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7224432229995728, + "num_tokens": 79002475.0, + "step": 3051 + }, + { + "epoch": 0.33516362837689434, + "grad_norm": 2.137970447540283, + "learning_rate": 5e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6986781358718872, + "num_tokens": 79028005.0, + "step": 3052 + }, + { + "epoch": 0.33527344607950804, + "grad_norm": 2.0416502952575684, + "learning_rate": 5e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6950470209121704, + "num_tokens": 79053833.0, + "step": 3053 + }, + { + "epoch": 0.3353832637821217, + "grad_norm": 2.0368316173553467, + "learning_rate": 5e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6924185752868652, + "num_tokens": 79079476.0, + "step": 3054 + }, + { + "epoch": 0.33549308148473533, + "grad_norm": 2.0998785495758057, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7132277488708496, + "num_tokens": 79102404.0, + "step": 3055 + }, + { + "epoch": 0.335602899187349, + "grad_norm": 2.2593467235565186, + "learning_rate": 5e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6946172118186951, + "num_tokens": 79123256.0, + "step": 3056 + }, + { + "epoch": 0.3357127168899627, + "grad_norm": 2.0939102172851562, + "learning_rate": 5e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7165403962135315, + "num_tokens": 79145046.0, + "step": 3057 + }, + { + "epoch": 0.3358225345925763, + "grad_norm": 1.8732049465179443, + "learning_rate": 5e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6881964206695557, + "num_tokens": 79174002.0, + "step": 3058 + }, + { + "epoch": 0.33593235229519, + "grad_norm": 2.0973567962646484, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7178755402565002, + "num_tokens": 79196483.0, + "step": 3059 + }, + { + "epoch": 0.3360421699978036, + "grad_norm": 2.313126564025879, + "learning_rate": 5e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7160300016403198, + "num_tokens": 79217453.0, + "step": 3060 + }, + { + "epoch": 0.3361519877004173, + "grad_norm": 2.026716709136963, + "learning_rate": 5e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7245044112205505, + "num_tokens": 79238703.0, + "step": 3061 + }, + { + "epoch": 0.33626180540303097, + "grad_norm": 2.097590684890747, + "learning_rate": 5e-06, + "loss": 1.1059, + "mean_token_accuracy": 0.6754080057144165, + "num_tokens": 79265576.0, + "step": 3062 + }, + { + "epoch": 0.3363716231056446, + "grad_norm": 1.8670363426208496, + "learning_rate": 5e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6848456859588623, + "num_tokens": 79295499.0, + "step": 3063 + }, + { + "epoch": 0.3364814408082583, + "grad_norm": 1.9297635555267334, + "learning_rate": 5e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.721503734588623, + "num_tokens": 79319561.0, + "step": 3064 + }, + { + "epoch": 0.33659125851087196, + "grad_norm": 1.913245677947998, + "learning_rate": 5e-06, + "loss": 1.0869, + "mean_token_accuracy": 0.6860156655311584, + "num_tokens": 79349118.0, + "step": 3065 + }, + { + "epoch": 0.3367010762134856, + "grad_norm": 2.3776638507843018, + "learning_rate": 5e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7044901251792908, + "num_tokens": 79369101.0, + "step": 3066 + }, + { + "epoch": 0.33681089391609925, + "grad_norm": 1.9331200122833252, + "learning_rate": 5e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7014521360397339, + "num_tokens": 79394939.0, + "step": 3067 + }, + { + "epoch": 0.33692071161871295, + "grad_norm": 1.947738766670227, + "learning_rate": 5e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6888063549995422, + "num_tokens": 79427584.0, + "step": 3068 + }, + { + "epoch": 0.3370305293213266, + "grad_norm": 1.9847959280014038, + "learning_rate": 5e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7047266960144043, + "num_tokens": 79454188.0, + "step": 3069 + }, + { + "epoch": 0.33714034702394025, + "grad_norm": 1.994145154953003, + "learning_rate": 5e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7273052930831909, + "num_tokens": 79477978.0, + "step": 3070 + }, + { + "epoch": 0.33725016472655395, + "grad_norm": 2.3072092533111572, + "learning_rate": 5e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6998505592346191, + "num_tokens": 79499824.0, + "step": 3071 + }, + { + "epoch": 0.3373599824291676, + "grad_norm": 2.1031055450439453, + "learning_rate": 5e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6927785277366638, + "num_tokens": 79525484.0, + "step": 3072 + }, + { + "epoch": 0.33746980013178124, + "grad_norm": 2.037916421890259, + "learning_rate": 5e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7074798345565796, + "num_tokens": 79550130.0, + "step": 3073 + }, + { + "epoch": 0.3375796178343949, + "grad_norm": 1.8990358114242554, + "learning_rate": 5e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6899093389511108, + "num_tokens": 79581549.0, + "step": 3074 + }, + { + "epoch": 0.3376894355370086, + "grad_norm": 2.0599324703216553, + "learning_rate": 5e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6990309357643127, + "num_tokens": 79606477.0, + "step": 3075 + }, + { + "epoch": 0.33779925323962223, + "grad_norm": 1.9236301183700562, + "learning_rate": 5e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6908413171768188, + "num_tokens": 79635907.0, + "step": 3076 + }, + { + "epoch": 0.3379090709422359, + "grad_norm": 2.1110026836395264, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6970695853233337, + "num_tokens": 79660034.0, + "step": 3077 + }, + { + "epoch": 0.3380188886448495, + "grad_norm": 1.9669077396392822, + "learning_rate": 5e-06, + "loss": 1.037, + "mean_token_accuracy": 0.693080484867096, + "num_tokens": 79688842.0, + "step": 3078 + }, + { + "epoch": 0.3381287063474632, + "grad_norm": 1.965689778327942, + "learning_rate": 5e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7156627178192139, + "num_tokens": 79715731.0, + "step": 3079 + }, + { + "epoch": 0.33823852405007687, + "grad_norm": 2.0110666751861572, + "learning_rate": 5e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6949602365493774, + "num_tokens": 79742326.0, + "step": 3080 + }, + { + "epoch": 0.3383483417526905, + "grad_norm": 1.8504002094268799, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7329276204109192, + "num_tokens": 79770252.0, + "step": 3081 + }, + { + "epoch": 0.3384581594553042, + "grad_norm": 1.8245879411697388, + "learning_rate": 5e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.679522693157196, + "num_tokens": 79804678.0, + "step": 3082 + }, + { + "epoch": 0.33856797715791787, + "grad_norm": 1.9797996282577515, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7145495414733887, + "num_tokens": 79828919.0, + "step": 3083 + }, + { + "epoch": 0.3386777948605315, + "grad_norm": 2.1026928424835205, + "learning_rate": 5e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7046777009963989, + "num_tokens": 79850705.0, + "step": 3084 + }, + { + "epoch": 0.33878761256314516, + "grad_norm": 2.060035467147827, + "learning_rate": 5e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6892732977867126, + "num_tokens": 79875705.0, + "step": 3085 + }, + { + "epoch": 0.33889743026575886, + "grad_norm": 2.1286895275115967, + "learning_rate": 5e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6895354986190796, + "num_tokens": 79898723.0, + "step": 3086 + }, + { + "epoch": 0.3390072479683725, + "grad_norm": 2.0627079010009766, + "learning_rate": 5e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6859964728355408, + "num_tokens": 79924629.0, + "step": 3087 + }, + { + "epoch": 0.33911706567098615, + "grad_norm": 1.879988193511963, + "learning_rate": 5e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7041150331497192, + "num_tokens": 79955140.0, + "step": 3088 + }, + { + "epoch": 0.3392268833735998, + "grad_norm": 1.9140437841415405, + "learning_rate": 5e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7027395963668823, + "num_tokens": 79983462.0, + "step": 3089 + }, + { + "epoch": 0.3393367010762135, + "grad_norm": 1.8033710718154907, + "learning_rate": 5e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6981102228164673, + "num_tokens": 80013943.0, + "step": 3090 + }, + { + "epoch": 0.33944651877882714, + "grad_norm": 2.565032482147217, + "learning_rate": 5e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6822973489761353, + "num_tokens": 80034756.0, + "step": 3091 + }, + { + "epoch": 0.3395563364814408, + "grad_norm": 1.7568581104278564, + "learning_rate": 5e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7064005136489868, + "num_tokens": 80065527.0, + "step": 3092 + }, + { + "epoch": 0.3396661541840545, + "grad_norm": 2.0276174545288086, + "learning_rate": 5e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6916759610176086, + "num_tokens": 80091046.0, + "step": 3093 + }, + { + "epoch": 0.33977597188666814, + "grad_norm": 1.9494389295578003, + "learning_rate": 5e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.691956639289856, + "num_tokens": 80118650.0, + "step": 3094 + }, + { + "epoch": 0.3398857895892818, + "grad_norm": 1.9511957168579102, + "learning_rate": 5e-06, + "loss": 1.0771, + "mean_token_accuracy": 0.6808809041976929, + "num_tokens": 80145197.0, + "step": 3095 + }, + { + "epoch": 0.33999560729189543, + "grad_norm": 2.383044719696045, + "learning_rate": 5e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6925555467605591, + "num_tokens": 80166387.0, + "step": 3096 + }, + { + "epoch": 0.34010542499450913, + "grad_norm": 1.9705625772476196, + "learning_rate": 5e-06, + "loss": 1.0991, + "mean_token_accuracy": 0.6720348596572876, + "num_tokens": 80198091.0, + "step": 3097 + }, + { + "epoch": 0.3402152426971228, + "grad_norm": 2.2373619079589844, + "learning_rate": 5e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6902056932449341, + "num_tokens": 80221323.0, + "step": 3098 + }, + { + "epoch": 0.3403250603997364, + "grad_norm": 1.9962824583053589, + "learning_rate": 5e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7099826335906982, + "num_tokens": 80247926.0, + "step": 3099 + }, + { + "epoch": 0.3404348781023501, + "grad_norm": 2.0503742694854736, + "learning_rate": 5e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7299656867980957, + "num_tokens": 80275159.0, + "step": 3100 + }, + { + "epoch": 0.34054469580496377, + "grad_norm": 2.3390026092529297, + "learning_rate": 5e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7152310013771057, + "num_tokens": 80297503.0, + "step": 3101 + }, + { + "epoch": 0.3406545135075774, + "grad_norm": 1.8829565048217773, + "learning_rate": 5e-06, + "loss": 1.1316, + "mean_token_accuracy": 0.6589765548706055, + "num_tokens": 80329644.0, + "step": 3102 + }, + { + "epoch": 0.34076433121019106, + "grad_norm": 1.8053064346313477, + "learning_rate": 5e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7026307582855225, + "num_tokens": 80361709.0, + "step": 3103 + }, + { + "epoch": 0.34087414891280476, + "grad_norm": 1.9408693313598633, + "learning_rate": 5e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7123619318008423, + "num_tokens": 80387845.0, + "step": 3104 + }, + { + "epoch": 0.3409839666154184, + "grad_norm": 1.8771135807037354, + "learning_rate": 5e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.682745099067688, + "num_tokens": 80418519.0, + "step": 3105 + }, + { + "epoch": 0.34109378431803206, + "grad_norm": 1.8888227939605713, + "learning_rate": 5e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6900073289871216, + "num_tokens": 80445425.0, + "step": 3106 + }, + { + "epoch": 0.3412036020206457, + "grad_norm": 1.7946306467056274, + "learning_rate": 5e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7233939170837402, + "num_tokens": 80475185.0, + "step": 3107 + }, + { + "epoch": 0.3413134197232594, + "grad_norm": 1.9717826843261719, + "learning_rate": 5e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6941334009170532, + "num_tokens": 80500979.0, + "step": 3108 + }, + { + "epoch": 0.34142323742587305, + "grad_norm": 1.9913991689682007, + "learning_rate": 5e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7178148031234741, + "num_tokens": 80524819.0, + "step": 3109 + }, + { + "epoch": 0.3415330551284867, + "grad_norm": 2.0158798694610596, + "learning_rate": 5e-06, + "loss": 1.0857, + "mean_token_accuracy": 0.6839362382888794, + "num_tokens": 80551112.0, + "step": 3110 + }, + { + "epoch": 0.3416428728311004, + "grad_norm": 1.9180670976638794, + "learning_rate": 5e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6904945969581604, + "num_tokens": 80577010.0, + "step": 3111 + }, + { + "epoch": 0.34175269053371404, + "grad_norm": 1.9785107374191284, + "learning_rate": 5e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6976296901702881, + "num_tokens": 80602652.0, + "step": 3112 + }, + { + "epoch": 0.3418625082363277, + "grad_norm": 2.0987539291381836, + "learning_rate": 5e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6969766020774841, + "num_tokens": 80627726.0, + "step": 3113 + }, + { + "epoch": 0.34197232593894134, + "grad_norm": 1.9251645803451538, + "learning_rate": 5e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7089026570320129, + "num_tokens": 80655831.0, + "step": 3114 + }, + { + "epoch": 0.34208214364155504, + "grad_norm": 2.02671217918396, + "learning_rate": 5e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7085708379745483, + "num_tokens": 80680524.0, + "step": 3115 + }, + { + "epoch": 0.3421919613441687, + "grad_norm": 2.037822961807251, + "learning_rate": 5e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6986028552055359, + "num_tokens": 80709433.0, + "step": 3116 + }, + { + "epoch": 0.34230177904678233, + "grad_norm": 1.789426326751709, + "learning_rate": 5e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.6995846033096313, + "num_tokens": 80740118.0, + "step": 3117 + }, + { + "epoch": 0.34241159674939603, + "grad_norm": 2.1283349990844727, + "learning_rate": 5e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7120029330253601, + "num_tokens": 80761701.0, + "step": 3118 + }, + { + "epoch": 0.3425214144520097, + "grad_norm": 2.1189756393432617, + "learning_rate": 5e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.698040783405304, + "num_tokens": 80785398.0, + "step": 3119 + }, + { + "epoch": 0.3426312321546233, + "grad_norm": 2.0891640186309814, + "learning_rate": 5e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6911358833312988, + "num_tokens": 80810849.0, + "step": 3120 + }, + { + "epoch": 0.34274104985723697, + "grad_norm": 1.8751449584960938, + "learning_rate": 5e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6784372329711914, + "num_tokens": 80840945.0, + "step": 3121 + }, + { + "epoch": 0.34285086755985067, + "grad_norm": 2.103188991546631, + "learning_rate": 5e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7113021612167358, + "num_tokens": 80864561.0, + "step": 3122 + }, + { + "epoch": 0.3429606852624643, + "grad_norm": 2.0967962741851807, + "learning_rate": 5e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6882269978523254, + "num_tokens": 80888889.0, + "step": 3123 + }, + { + "epoch": 0.34307050296507796, + "grad_norm": 2.061197280883789, + "learning_rate": 5e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.716289758682251, + "num_tokens": 80914714.0, + "step": 3124 + }, + { + "epoch": 0.3431803206676916, + "grad_norm": 1.9310674667358398, + "learning_rate": 5e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7010153532028198, + "num_tokens": 80941880.0, + "step": 3125 + }, + { + "epoch": 0.3432901383703053, + "grad_norm": 1.812048077583313, + "learning_rate": 5e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.699522852897644, + "num_tokens": 80971025.0, + "step": 3126 + }, + { + "epoch": 0.34339995607291895, + "grad_norm": 2.083791971206665, + "learning_rate": 5e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6928349733352661, + "num_tokens": 80994868.0, + "step": 3127 + }, + { + "epoch": 0.3435097737755326, + "grad_norm": 2.036412477493286, + "learning_rate": 5e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6956563591957092, + "num_tokens": 81021392.0, + "step": 3128 + }, + { + "epoch": 0.3436195914781463, + "grad_norm": 1.9319919347763062, + "learning_rate": 5e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7003661394119263, + "num_tokens": 81047998.0, + "step": 3129 + }, + { + "epoch": 0.34372940918075995, + "grad_norm": 1.7257180213928223, + "learning_rate": 5e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7002867460250854, + "num_tokens": 81082336.0, + "step": 3130 + }, + { + "epoch": 0.3438392268833736, + "grad_norm": 2.173522710800171, + "learning_rate": 5e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7198978066444397, + "num_tokens": 81102702.0, + "step": 3131 + }, + { + "epoch": 0.34394904458598724, + "grad_norm": 1.8850951194763184, + "learning_rate": 5e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6929432153701782, + "num_tokens": 81129288.0, + "step": 3132 + }, + { + "epoch": 0.34405886228860094, + "grad_norm": 2.2757294178009033, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.698980450630188, + "num_tokens": 81149793.0, + "step": 3133 + }, + { + "epoch": 0.3441686799912146, + "grad_norm": 1.9554486274719238, + "learning_rate": 5e-06, + "loss": 1.1216, + "mean_token_accuracy": 0.674289345741272, + "num_tokens": 81175631.0, + "step": 3134 + }, + { + "epoch": 0.34427849769382823, + "grad_norm": 1.9362263679504395, + "learning_rate": 5e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6982698440551758, + "num_tokens": 81201921.0, + "step": 3135 + }, + { + "epoch": 0.3443883153964419, + "grad_norm": 2.0218355655670166, + "learning_rate": 5e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.693323016166687, + "num_tokens": 81226626.0, + "step": 3136 + }, + { + "epoch": 0.3444981330990556, + "grad_norm": 1.8048971891403198, + "learning_rate": 5e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7110481262207031, + "num_tokens": 81256697.0, + "step": 3137 + }, + { + "epoch": 0.3446079508016692, + "grad_norm": 2.1272363662719727, + "learning_rate": 5e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7161344885826111, + "num_tokens": 81279078.0, + "step": 3138 + }, + { + "epoch": 0.3447177685042829, + "grad_norm": 2.192068338394165, + "learning_rate": 5e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6997392177581787, + "num_tokens": 81301834.0, + "step": 3139 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 1.819623351097107, + "learning_rate": 5e-06, + "loss": 1.0707, + "mean_token_accuracy": 0.6793242692947388, + "num_tokens": 81336004.0, + "step": 3140 + }, + { + "epoch": 0.3449374039095102, + "grad_norm": 2.2169551849365234, + "learning_rate": 5e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7041107416152954, + "num_tokens": 81357508.0, + "step": 3141 + }, + { + "epoch": 0.34504722161212387, + "grad_norm": 1.9356530904769897, + "learning_rate": 5e-06, + "loss": 1.1219, + "mean_token_accuracy": 0.6661162376403809, + "num_tokens": 81387828.0, + "step": 3142 + }, + { + "epoch": 0.3451570393147375, + "grad_norm": 2.2747066020965576, + "learning_rate": 5e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7351623177528381, + "num_tokens": 81405285.0, + "step": 3143 + }, + { + "epoch": 0.3452668570173512, + "grad_norm": 2.270869016647339, + "learning_rate": 5e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6931580305099487, + "num_tokens": 81430491.0, + "step": 3144 + }, + { + "epoch": 0.34537667471996486, + "grad_norm": 2.1383914947509766, + "learning_rate": 5e-06, + "loss": 1.0881, + "mean_token_accuracy": 0.6736639142036438, + "num_tokens": 81456018.0, + "step": 3145 + }, + { + "epoch": 0.3454864924225785, + "grad_norm": 1.996155023574829, + "learning_rate": 5e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6981103420257568, + "num_tokens": 81480099.0, + "step": 3146 + }, + { + "epoch": 0.3455963101251922, + "grad_norm": 2.03322172164917, + "learning_rate": 5e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6899017095565796, + "num_tokens": 81505225.0, + "step": 3147 + }, + { + "epoch": 0.34570612782780585, + "grad_norm": 2.077878475189209, + "learning_rate": 5e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.683071494102478, + "num_tokens": 81532191.0, + "step": 3148 + }, + { + "epoch": 0.3458159455304195, + "grad_norm": 2.020019769668579, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7135399580001831, + "num_tokens": 81556747.0, + "step": 3149 + }, + { + "epoch": 0.34592576323303315, + "grad_norm": 1.9171655178070068, + "learning_rate": 5e-06, + "loss": 1.0851, + "mean_token_accuracy": 0.6791330575942993, + "num_tokens": 81588125.0, + "step": 3150 + }, + { + "epoch": 0.34603558093564685, + "grad_norm": 1.9436452388763428, + "learning_rate": 5e-06, + "loss": 1.0966, + "mean_token_accuracy": 0.6727110147476196, + "num_tokens": 81620892.0, + "step": 3151 + }, + { + "epoch": 0.3461453986382605, + "grad_norm": 1.9877386093139648, + "learning_rate": 5e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.700383186340332, + "num_tokens": 81648045.0, + "step": 3152 + }, + { + "epoch": 0.34625521634087414, + "grad_norm": 2.0208840370178223, + "learning_rate": 5e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6981238126754761, + "num_tokens": 81673202.0, + "step": 3153 + }, + { + "epoch": 0.3463650340434878, + "grad_norm": 1.9781806468963623, + "learning_rate": 5e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7071434855461121, + "num_tokens": 81698699.0, + "step": 3154 + }, + { + "epoch": 0.3464748517461015, + "grad_norm": 2.4439597129821777, + "learning_rate": 5e-06, + "loss": 1.1118, + "mean_token_accuracy": 0.67734694480896, + "num_tokens": 81722404.0, + "step": 3155 + }, + { + "epoch": 0.34658466944871513, + "grad_norm": 2.0986013412475586, + "learning_rate": 5e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7026664614677429, + "num_tokens": 81745030.0, + "step": 3156 + }, + { + "epoch": 0.3466944871513288, + "grad_norm": 2.122197389602661, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7184990644454956, + "num_tokens": 81768211.0, + "step": 3157 + }, + { + "epoch": 0.3468043048539425, + "grad_norm": 2.048185110092163, + "learning_rate": 5e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.716918408870697, + "num_tokens": 81791990.0, + "step": 3158 + }, + { + "epoch": 0.3469141225565561, + "grad_norm": 2.0044589042663574, + "learning_rate": 5e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6818429231643677, + "num_tokens": 81818109.0, + "step": 3159 + }, + { + "epoch": 0.34702394025916977, + "grad_norm": 2.0499651432037354, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.716891884803772, + "num_tokens": 81843091.0, + "step": 3160 + }, + { + "epoch": 0.3471337579617834, + "grad_norm": 1.9859343767166138, + "learning_rate": 5e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6901382207870483, + "num_tokens": 81870491.0, + "step": 3161 + }, + { + "epoch": 0.3472435756643971, + "grad_norm": 1.7488335371017456, + "learning_rate": 5e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6785696148872375, + "num_tokens": 81905456.0, + "step": 3162 + }, + { + "epoch": 0.34735339336701077, + "grad_norm": 1.8653255701065063, + "learning_rate": 5e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6973289251327515, + "num_tokens": 81934616.0, + "step": 3163 + }, + { + "epoch": 0.3474632110696244, + "grad_norm": 2.1484014987945557, + "learning_rate": 5e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.6829250454902649, + "num_tokens": 81958196.0, + "step": 3164 + }, + { + "epoch": 0.34757302877223806, + "grad_norm": 2.0903942584991455, + "learning_rate": 5e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7421466112136841, + "num_tokens": 81982396.0, + "step": 3165 + }, + { + "epoch": 0.34768284647485176, + "grad_norm": 2.2976224422454834, + "learning_rate": 5e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6837190389633179, + "num_tokens": 82005400.0, + "step": 3166 + }, + { + "epoch": 0.3477926641774654, + "grad_norm": 1.8927123546600342, + "learning_rate": 5e-06, + "loss": 0.989, + "mean_token_accuracy": 0.702086329460144, + "num_tokens": 82034766.0, + "step": 3167 + }, + { + "epoch": 0.34790248188007905, + "grad_norm": 2.2008919715881348, + "learning_rate": 5e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7235603928565979, + "num_tokens": 82055621.0, + "step": 3168 + }, + { + "epoch": 0.34801229958269275, + "grad_norm": 2.150233507156372, + "learning_rate": 5e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7078869342803955, + "num_tokens": 82078262.0, + "step": 3169 + }, + { + "epoch": 0.3481221172853064, + "grad_norm": 2.0842552185058594, + "learning_rate": 5e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7094864845275879, + "num_tokens": 82102947.0, + "step": 3170 + }, + { + "epoch": 0.34823193498792004, + "grad_norm": 1.7503198385238647, + "learning_rate": 5e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6895476579666138, + "num_tokens": 82137685.0, + "step": 3171 + }, + { + "epoch": 0.3483417526905337, + "grad_norm": 2.128865957260132, + "learning_rate": 5e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6925482153892517, + "num_tokens": 82161817.0, + "step": 3172 + }, + { + "epoch": 0.3484515703931474, + "grad_norm": 2.0418591499328613, + "learning_rate": 5e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6945602893829346, + "num_tokens": 82188349.0, + "step": 3173 + }, + { + "epoch": 0.34856138809576104, + "grad_norm": 2.4542019367218018, + "learning_rate": 5e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7087497711181641, + "num_tokens": 82207646.0, + "step": 3174 + }, + { + "epoch": 0.3486712057983747, + "grad_norm": 1.776995062828064, + "learning_rate": 5e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7008371353149414, + "num_tokens": 82239275.0, + "step": 3175 + }, + { + "epoch": 0.3487810235009884, + "grad_norm": 2.3247640132904053, + "learning_rate": 5e-06, + "loss": 1.0892, + "mean_token_accuracy": 0.681833028793335, + "num_tokens": 82259635.0, + "step": 3176 + }, + { + "epoch": 0.34889084120360203, + "grad_norm": 2.1137874126434326, + "learning_rate": 5e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7071843147277832, + "num_tokens": 82282135.0, + "step": 3177 + }, + { + "epoch": 0.3490006589062157, + "grad_norm": 2.1888747215270996, + "learning_rate": 5e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7343114018440247, + "num_tokens": 82302268.0, + "step": 3178 + }, + { + "epoch": 0.3491104766088293, + "grad_norm": 2.085172176361084, + "learning_rate": 5e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.689276933670044, + "num_tokens": 82326725.0, + "step": 3179 + }, + { + "epoch": 0.349220294311443, + "grad_norm": 2.1420412063598633, + "learning_rate": 5e-06, + "loss": 1.0629, + "mean_token_accuracy": 0.6799719333648682, + "num_tokens": 82350765.0, + "step": 3180 + }, + { + "epoch": 0.34933011201405667, + "grad_norm": 2.020585060119629, + "learning_rate": 5e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6803092956542969, + "num_tokens": 82374689.0, + "step": 3181 + }, + { + "epoch": 0.3494399297166703, + "grad_norm": 1.8886334896087646, + "learning_rate": 5e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6955568790435791, + "num_tokens": 82401687.0, + "step": 3182 + }, + { + "epoch": 0.34954974741928396, + "grad_norm": 1.9380515813827515, + "learning_rate": 5e-06, + "loss": 0.959, + "mean_token_accuracy": 0.705781102180481, + "num_tokens": 82428587.0, + "step": 3183 + }, + { + "epoch": 0.34965956512189766, + "grad_norm": 1.9458389282226562, + "learning_rate": 5e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6878676414489746, + "num_tokens": 82454820.0, + "step": 3184 + }, + { + "epoch": 0.3497693828245113, + "grad_norm": 2.2176544666290283, + "learning_rate": 5e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7129325270652771, + "num_tokens": 82477018.0, + "step": 3185 + }, + { + "epoch": 0.34987920052712496, + "grad_norm": 2.2071330547332764, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7208219766616821, + "num_tokens": 82497145.0, + "step": 3186 + }, + { + "epoch": 0.34998901822973866, + "grad_norm": 1.9689825773239136, + "learning_rate": 5e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.68647700548172, + "num_tokens": 82526664.0, + "step": 3187 + }, + { + "epoch": 0.3500988359323523, + "grad_norm": 1.768593430519104, + "learning_rate": 5e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6872601509094238, + "num_tokens": 82560579.0, + "step": 3188 + }, + { + "epoch": 0.35020865363496595, + "grad_norm": 2.3829171657562256, + "learning_rate": 5e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.6993104219436646, + "num_tokens": 82578321.0, + "step": 3189 + }, + { + "epoch": 0.3503184713375796, + "grad_norm": 2.2315754890441895, + "learning_rate": 5e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7066239714622498, + "num_tokens": 82600976.0, + "step": 3190 + }, + { + "epoch": 0.3504282890401933, + "grad_norm": 1.9264862537384033, + "learning_rate": 5e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6822660565376282, + "num_tokens": 82627499.0, + "step": 3191 + }, + { + "epoch": 0.35053810674280694, + "grad_norm": 1.8360546827316284, + "learning_rate": 5e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6931823492050171, + "num_tokens": 82658493.0, + "step": 3192 + }, + { + "epoch": 0.3506479244454206, + "grad_norm": 1.749342918395996, + "learning_rate": 5e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6946033835411072, + "num_tokens": 82692268.0, + "step": 3193 + }, + { + "epoch": 0.3507577421480343, + "grad_norm": 1.854517936706543, + "learning_rate": 5e-06, + "loss": 1.077, + "mean_token_accuracy": 0.6759018898010254, + "num_tokens": 82723799.0, + "step": 3194 + }, + { + "epoch": 0.35086755985064794, + "grad_norm": 1.9191876649856567, + "learning_rate": 5e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.6990565061569214, + "num_tokens": 82752216.0, + "step": 3195 + }, + { + "epoch": 0.3509773775532616, + "grad_norm": 1.953339695930481, + "learning_rate": 5e-06, + "loss": 1.0654, + "mean_token_accuracy": 0.6799663305282593, + "num_tokens": 82778250.0, + "step": 3196 + }, + { + "epoch": 0.35108719525587523, + "grad_norm": 2.1144402027130127, + "learning_rate": 5e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6952825784683228, + "num_tokens": 82804120.0, + "step": 3197 + }, + { + "epoch": 0.35119701295848893, + "grad_norm": 1.924734115600586, + "learning_rate": 5e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7173047661781311, + "num_tokens": 82830216.0, + "step": 3198 + }, + { + "epoch": 0.3513068306611026, + "grad_norm": 2.078284740447998, + "learning_rate": 5e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6923741102218628, + "num_tokens": 82853867.0, + "step": 3199 + }, + { + "epoch": 0.3514166483637162, + "grad_norm": 1.8756935596466064, + "learning_rate": 5e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6859262585639954, + "num_tokens": 82881912.0, + "step": 3200 + }, + { + "epoch": 0.35152646606632987, + "grad_norm": 1.8982560634613037, + "learning_rate": 5e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6896772384643555, + "num_tokens": 82909592.0, + "step": 3201 + }, + { + "epoch": 0.35163628376894357, + "grad_norm": 2.123699903488159, + "learning_rate": 5e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6819030046463013, + "num_tokens": 82932916.0, + "step": 3202 + }, + { + "epoch": 0.3517461014715572, + "grad_norm": 1.8147358894348145, + "learning_rate": 5e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7005467414855957, + "num_tokens": 82964280.0, + "step": 3203 + }, + { + "epoch": 0.35185591917417086, + "grad_norm": 1.8728876113891602, + "learning_rate": 5e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7123099565505981, + "num_tokens": 82991417.0, + "step": 3204 + }, + { + "epoch": 0.35196573687678456, + "grad_norm": 1.9521634578704834, + "learning_rate": 5e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7175493836402893, + "num_tokens": 83018530.0, + "step": 3205 + }, + { + "epoch": 0.3520755545793982, + "grad_norm": 1.8826497793197632, + "learning_rate": 5e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7116439342498779, + "num_tokens": 83044992.0, + "step": 3206 + }, + { + "epoch": 0.35218537228201185, + "grad_norm": 2.0821034908294678, + "learning_rate": 5e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7044143676757812, + "num_tokens": 83067616.0, + "step": 3207 + }, + { + "epoch": 0.3522951899846255, + "grad_norm": 2.0065062046051025, + "learning_rate": 5e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6941186189651489, + "num_tokens": 83095502.0, + "step": 3208 + }, + { + "epoch": 0.3524050076872392, + "grad_norm": 1.9579973220825195, + "learning_rate": 5e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7011282444000244, + "num_tokens": 83122302.0, + "step": 3209 + }, + { + "epoch": 0.35251482538985285, + "grad_norm": 1.9571259021759033, + "learning_rate": 5e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6978871822357178, + "num_tokens": 83150116.0, + "step": 3210 + }, + { + "epoch": 0.3526246430924665, + "grad_norm": 2.02854323387146, + "learning_rate": 5e-06, + "loss": 1.075, + "mean_token_accuracy": 0.6812599897384644, + "num_tokens": 83174956.0, + "step": 3211 + }, + { + "epoch": 0.35273446079508014, + "grad_norm": 2.15737247467041, + "learning_rate": 5e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7026858329772949, + "num_tokens": 83195219.0, + "step": 3212 + }, + { + "epoch": 0.35284427849769384, + "grad_norm": 1.9883650541305542, + "learning_rate": 5e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7005249261856079, + "num_tokens": 83221753.0, + "step": 3213 + }, + { + "epoch": 0.3529540962003075, + "grad_norm": 2.0366456508636475, + "learning_rate": 5e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6932660341262817, + "num_tokens": 83248285.0, + "step": 3214 + }, + { + "epoch": 0.35306391390292113, + "grad_norm": 2.0281479358673096, + "learning_rate": 5e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6930312514305115, + "num_tokens": 83271922.0, + "step": 3215 + }, + { + "epoch": 0.35317373160553484, + "grad_norm": 2.1290876865386963, + "learning_rate": 5e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.687484622001648, + "num_tokens": 83294708.0, + "step": 3216 + }, + { + "epoch": 0.3532835493081485, + "grad_norm": 2.1688849925994873, + "learning_rate": 5e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7181913256645203, + "num_tokens": 83315184.0, + "step": 3217 + }, + { + "epoch": 0.3533933670107621, + "grad_norm": 1.963439702987671, + "learning_rate": 5e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6916123628616333, + "num_tokens": 83341261.0, + "step": 3218 + }, + { + "epoch": 0.3535031847133758, + "grad_norm": 1.965786337852478, + "learning_rate": 5e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7212679386138916, + "num_tokens": 83366605.0, + "step": 3219 + }, + { + "epoch": 0.3536130024159895, + "grad_norm": 2.0568861961364746, + "learning_rate": 5e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7267168760299683, + "num_tokens": 83389634.0, + "step": 3220 + }, + { + "epoch": 0.3537228201186031, + "grad_norm": 1.9400291442871094, + "learning_rate": 5e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.6990773677825928, + "num_tokens": 83415701.0, + "step": 3221 + }, + { + "epoch": 0.35383263782121677, + "grad_norm": 1.8933639526367188, + "learning_rate": 5e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7098342776298523, + "num_tokens": 83441781.0, + "step": 3222 + }, + { + "epoch": 0.35394245552383047, + "grad_norm": 1.7061935663223267, + "learning_rate": 5e-06, + "loss": 1.1326, + "mean_token_accuracy": 0.6611959934234619, + "num_tokens": 83478922.0, + "step": 3223 + }, + { + "epoch": 0.3540522732264441, + "grad_norm": 1.9795674085617065, + "learning_rate": 5e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7071198225021362, + "num_tokens": 83504808.0, + "step": 3224 + }, + { + "epoch": 0.35416209092905776, + "grad_norm": 2.0896167755126953, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7192559838294983, + "num_tokens": 83527589.0, + "step": 3225 + }, + { + "epoch": 0.3542719086316714, + "grad_norm": 2.074927806854248, + "learning_rate": 5e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6825770139694214, + "num_tokens": 83552951.0, + "step": 3226 + }, + { + "epoch": 0.3543817263342851, + "grad_norm": 2.1015126705169678, + "learning_rate": 5e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6908764243125916, + "num_tokens": 83576088.0, + "step": 3227 + }, + { + "epoch": 0.35449154403689875, + "grad_norm": 2.2983267307281494, + "learning_rate": 5e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7001945972442627, + "num_tokens": 83595429.0, + "step": 3228 + }, + { + "epoch": 0.3546013617395124, + "grad_norm": 2.086493968963623, + "learning_rate": 5e-06, + "loss": 1.0618, + "mean_token_accuracy": 0.6834113597869873, + "num_tokens": 83622071.0, + "step": 3229 + }, + { + "epoch": 0.35471117944212605, + "grad_norm": 2.167759895324707, + "learning_rate": 5e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7000966668128967, + "num_tokens": 83646846.0, + "step": 3230 + }, + { + "epoch": 0.35482099714473975, + "grad_norm": 2.0613839626312256, + "learning_rate": 5e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7098707556724548, + "num_tokens": 83668948.0, + "step": 3231 + }, + { + "epoch": 0.3549308148473534, + "grad_norm": 2.0955653190612793, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7000603675842285, + "num_tokens": 83692153.0, + "step": 3232 + }, + { + "epoch": 0.35504063254996704, + "grad_norm": 2.2933788299560547, + "learning_rate": 5e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7289654016494751, + "num_tokens": 83710200.0, + "step": 3233 + }, + { + "epoch": 0.35515045025258074, + "grad_norm": 1.8348042964935303, + "learning_rate": 5e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6883223056793213, + "num_tokens": 83739413.0, + "step": 3234 + }, + { + "epoch": 0.3552602679551944, + "grad_norm": 1.974341630935669, + "learning_rate": 5e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6910858154296875, + "num_tokens": 83764125.0, + "step": 3235 + }, + { + "epoch": 0.35537008565780803, + "grad_norm": 1.8970962762832642, + "learning_rate": 5e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.704058051109314, + "num_tokens": 83791426.0, + "step": 3236 + }, + { + "epoch": 0.3554799033604217, + "grad_norm": 1.960690975189209, + "learning_rate": 5e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6893131136894226, + "num_tokens": 83817406.0, + "step": 3237 + }, + { + "epoch": 0.3555897210630354, + "grad_norm": 2.0343143939971924, + "learning_rate": 5e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7205628156661987, + "num_tokens": 83839289.0, + "step": 3238 + }, + { + "epoch": 0.355699538765649, + "grad_norm": 2.042677164077759, + "learning_rate": 5e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7040945291519165, + "num_tokens": 83866063.0, + "step": 3239 + }, + { + "epoch": 0.35580935646826267, + "grad_norm": 2.113478422164917, + "learning_rate": 5e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7101421356201172, + "num_tokens": 83890435.0, + "step": 3240 + }, + { + "epoch": 0.3559191741708763, + "grad_norm": 1.8835238218307495, + "learning_rate": 5e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7137387990951538, + "num_tokens": 83916207.0, + "step": 3241 + }, + { + "epoch": 0.35602899187349, + "grad_norm": 1.7725863456726074, + "learning_rate": 5e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7139995098114014, + "num_tokens": 83944990.0, + "step": 3242 + }, + { + "epoch": 0.35613880957610367, + "grad_norm": 1.9563217163085938, + "learning_rate": 5e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7162840366363525, + "num_tokens": 83972021.0, + "step": 3243 + }, + { + "epoch": 0.3562486272787173, + "grad_norm": 2.0269408226013184, + "learning_rate": 5e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.6755155920982361, + "num_tokens": 84000009.0, + "step": 3244 + }, + { + "epoch": 0.356358444981331, + "grad_norm": 2.12397837638855, + "learning_rate": 5e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.702467679977417, + "num_tokens": 84021649.0, + "step": 3245 + }, + { + "epoch": 0.35646826268394466, + "grad_norm": 2.164856433868408, + "learning_rate": 5e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6901035308837891, + "num_tokens": 84044645.0, + "step": 3246 + }, + { + "epoch": 0.3565780803865583, + "grad_norm": 1.862931728363037, + "learning_rate": 5e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6843684315681458, + "num_tokens": 84076170.0, + "step": 3247 + }, + { + "epoch": 0.35668789808917195, + "grad_norm": 1.8848131895065308, + "learning_rate": 5e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6931408047676086, + "num_tokens": 84106215.0, + "step": 3248 + }, + { + "epoch": 0.35679771579178565, + "grad_norm": 2.098895311355591, + "learning_rate": 5e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6952980756759644, + "num_tokens": 84130250.0, + "step": 3249 + }, + { + "epoch": 0.3569075334943993, + "grad_norm": 1.8551950454711914, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7296031713485718, + "num_tokens": 84157234.0, + "step": 3250 + }, + { + "epoch": 0.35701735119701294, + "grad_norm": 2.0668108463287354, + "learning_rate": 5e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6823763847351074, + "num_tokens": 84181845.0, + "step": 3251 + }, + { + "epoch": 0.35712716889962665, + "grad_norm": 2.1944189071655273, + "learning_rate": 5e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7087697386741638, + "num_tokens": 84204672.0, + "step": 3252 + }, + { + "epoch": 0.3572369866022403, + "grad_norm": 1.747528314590454, + "learning_rate": 5e-06, + "loss": 1.1031, + "mean_token_accuracy": 0.6729138493537903, + "num_tokens": 84242663.0, + "step": 3253 + }, + { + "epoch": 0.35734680430485394, + "grad_norm": 2.0168464183807373, + "learning_rate": 5e-06, + "loss": 1.0653, + "mean_token_accuracy": 0.6815308928489685, + "num_tokens": 84267579.0, + "step": 3254 + }, + { + "epoch": 0.3574566220074676, + "grad_norm": 1.909505844116211, + "learning_rate": 5e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6958306431770325, + "num_tokens": 84295506.0, + "step": 3255 + }, + { + "epoch": 0.3575664397100813, + "grad_norm": 2.0329291820526123, + "learning_rate": 5e-06, + "loss": 1.0855, + "mean_token_accuracy": 0.6746338605880737, + "num_tokens": 84321124.0, + "step": 3256 + }, + { + "epoch": 0.35767625741269493, + "grad_norm": 1.9569718837738037, + "learning_rate": 5e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7074594497680664, + "num_tokens": 84347643.0, + "step": 3257 + }, + { + "epoch": 0.3577860751153086, + "grad_norm": 1.8301438093185425, + "learning_rate": 5e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7068374156951904, + "num_tokens": 84376425.0, + "step": 3258 + }, + { + "epoch": 0.3578958928179222, + "grad_norm": 1.8951486349105835, + "learning_rate": 5e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7055603265762329, + "num_tokens": 84404520.0, + "step": 3259 + }, + { + "epoch": 0.3580057105205359, + "grad_norm": 1.9015394449234009, + "learning_rate": 5e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6886234879493713, + "num_tokens": 84435216.0, + "step": 3260 + }, + { + "epoch": 0.35811552822314957, + "grad_norm": 2.1431055068969727, + "learning_rate": 5e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7136403322219849, + "num_tokens": 84455728.0, + "step": 3261 + }, + { + "epoch": 0.3582253459257632, + "grad_norm": 1.99531090259552, + "learning_rate": 5e-06, + "loss": 1.1167, + "mean_token_accuracy": 0.6648142337799072, + "num_tokens": 84482662.0, + "step": 3262 + }, + { + "epoch": 0.3583351636283769, + "grad_norm": 1.9388995170593262, + "learning_rate": 5e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6866664290428162, + "num_tokens": 84512192.0, + "step": 3263 + }, + { + "epoch": 0.35844498133099056, + "grad_norm": 2.199981451034546, + "learning_rate": 5e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7010424137115479, + "num_tokens": 84533326.0, + "step": 3264 + }, + { + "epoch": 0.3585547990336042, + "grad_norm": 1.9264285564422607, + "learning_rate": 5e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7078325748443604, + "num_tokens": 84560346.0, + "step": 3265 + }, + { + "epoch": 0.35866461673621786, + "grad_norm": 2.099993944168091, + "learning_rate": 5e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6807272434234619, + "num_tokens": 84586602.0, + "step": 3266 + }, + { + "epoch": 0.35877443443883156, + "grad_norm": 1.957478642463684, + "learning_rate": 5e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6820573806762695, + "num_tokens": 84614935.0, + "step": 3267 + }, + { + "epoch": 0.3588842521414452, + "grad_norm": 2.1238157749176025, + "learning_rate": 5e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6941474676132202, + "num_tokens": 84641034.0, + "step": 3268 + }, + { + "epoch": 0.35899406984405885, + "grad_norm": 2.2914679050445557, + "learning_rate": 5e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7109935879707336, + "num_tokens": 84662595.0, + "step": 3269 + }, + { + "epoch": 0.35910388754667255, + "grad_norm": 1.8719896078109741, + "learning_rate": 5e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6804676055908203, + "num_tokens": 84692705.0, + "step": 3270 + }, + { + "epoch": 0.3592137052492862, + "grad_norm": 2.0222318172454834, + "learning_rate": 5e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7136394381523132, + "num_tokens": 84716738.0, + "step": 3271 + }, + { + "epoch": 0.35932352295189984, + "grad_norm": 2.2079474925994873, + "learning_rate": 5e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6797270178794861, + "num_tokens": 84738877.0, + "step": 3272 + }, + { + "epoch": 0.3594333406545135, + "grad_norm": 1.957137942314148, + "learning_rate": 5e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6969760656356812, + "num_tokens": 84768260.0, + "step": 3273 + }, + { + "epoch": 0.3595431583571272, + "grad_norm": 2.0162580013275146, + "learning_rate": 5e-06, + "loss": 1.044, + "mean_token_accuracy": 0.6903170943260193, + "num_tokens": 84793633.0, + "step": 3274 + }, + { + "epoch": 0.35965297605974084, + "grad_norm": 1.8689395189285278, + "learning_rate": 5e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7163920402526855, + "num_tokens": 84820675.0, + "step": 3275 + }, + { + "epoch": 0.3597627937623545, + "grad_norm": 1.791006088256836, + "learning_rate": 5e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7031954526901245, + "num_tokens": 84854891.0, + "step": 3276 + }, + { + "epoch": 0.35987261146496813, + "grad_norm": 1.9121078252792358, + "learning_rate": 5e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6872507333755493, + "num_tokens": 84880928.0, + "step": 3277 + }, + { + "epoch": 0.35998242916758183, + "grad_norm": 1.7833555936813354, + "learning_rate": 5e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7107498645782471, + "num_tokens": 84909971.0, + "step": 3278 + }, + { + "epoch": 0.3600922468701955, + "grad_norm": 2.253303050994873, + "learning_rate": 5e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7157728672027588, + "num_tokens": 84928508.0, + "step": 3279 + }, + { + "epoch": 0.3602020645728091, + "grad_norm": 1.8825119733810425, + "learning_rate": 5e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.689018964767456, + "num_tokens": 84958198.0, + "step": 3280 + }, + { + "epoch": 0.3603118822754228, + "grad_norm": 1.8886737823486328, + "learning_rate": 5e-06, + "loss": 1.0824, + "mean_token_accuracy": 0.6772878170013428, + "num_tokens": 84986756.0, + "step": 3281 + }, + { + "epoch": 0.36042169997803647, + "grad_norm": 2.0619900226593018, + "learning_rate": 5e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7138579487800598, + "num_tokens": 85010038.0, + "step": 3282 + }, + { + "epoch": 0.3605315176806501, + "grad_norm": 1.9726051092147827, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6957566738128662, + "num_tokens": 85038063.0, + "step": 3283 + }, + { + "epoch": 0.36064133538326376, + "grad_norm": 2.15535044670105, + "learning_rate": 5e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6921103000640869, + "num_tokens": 85059867.0, + "step": 3284 + }, + { + "epoch": 0.36075115308587746, + "grad_norm": 2.0354301929473877, + "learning_rate": 5e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.6998379826545715, + "num_tokens": 85084936.0, + "step": 3285 + }, + { + "epoch": 0.3608609707884911, + "grad_norm": 2.275038719177246, + "learning_rate": 5e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7199006080627441, + "num_tokens": 85103414.0, + "step": 3286 + }, + { + "epoch": 0.36097078849110475, + "grad_norm": 2.2190053462982178, + "learning_rate": 5e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.690667986869812, + "num_tokens": 85125219.0, + "step": 3287 + }, + { + "epoch": 0.3610806061937184, + "grad_norm": 1.7582881450653076, + "learning_rate": 5e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6849279403686523, + "num_tokens": 85159480.0, + "step": 3288 + }, + { + "epoch": 0.3611904238963321, + "grad_norm": 1.7714120149612427, + "learning_rate": 5e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6887934803962708, + "num_tokens": 85193963.0, + "step": 3289 + }, + { + "epoch": 0.36130024159894575, + "grad_norm": 2.3028228282928467, + "learning_rate": 5e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6951569318771362, + "num_tokens": 85216312.0, + "step": 3290 + }, + { + "epoch": 0.3614100593015594, + "grad_norm": 1.8121166229248047, + "learning_rate": 5e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.7074036598205566, + "num_tokens": 85245946.0, + "step": 3291 + }, + { + "epoch": 0.3615198770041731, + "grad_norm": 1.9677351713180542, + "learning_rate": 5e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7029263973236084, + "num_tokens": 85270337.0, + "step": 3292 + }, + { + "epoch": 0.36162969470678674, + "grad_norm": 2.129270315170288, + "learning_rate": 5e-06, + "loss": 1.0971, + "mean_token_accuracy": 0.6762127876281738, + "num_tokens": 85297607.0, + "step": 3293 + }, + { + "epoch": 0.3617395124094004, + "grad_norm": 1.8982460498809814, + "learning_rate": 5e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6889545917510986, + "num_tokens": 85326225.0, + "step": 3294 + }, + { + "epoch": 0.36184933011201403, + "grad_norm": 2.1238391399383545, + "learning_rate": 5e-06, + "loss": 0.985, + "mean_token_accuracy": 0.6996068954467773, + "num_tokens": 85348480.0, + "step": 3295 + }, + { + "epoch": 0.36195914781462774, + "grad_norm": 1.9260330200195312, + "learning_rate": 5e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7257298231124878, + "num_tokens": 85375870.0, + "step": 3296 + }, + { + "epoch": 0.3620689655172414, + "grad_norm": 2.206033945083618, + "learning_rate": 5e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7125691175460815, + "num_tokens": 85397837.0, + "step": 3297 + }, + { + "epoch": 0.362178783219855, + "grad_norm": 1.9887996912002563, + "learning_rate": 5e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6743550300598145, + "num_tokens": 85428884.0, + "step": 3298 + }, + { + "epoch": 0.36228860092246873, + "grad_norm": 2.3761181831359863, + "learning_rate": 5e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7035507559776306, + "num_tokens": 85447965.0, + "step": 3299 + }, + { + "epoch": 0.3623984186250824, + "grad_norm": 2.245518922805786, + "learning_rate": 5e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6944006085395813, + "num_tokens": 85468176.0, + "step": 3300 + }, + { + "epoch": 0.362508236327696, + "grad_norm": 1.9017577171325684, + "learning_rate": 5e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6964956521987915, + "num_tokens": 85499341.0, + "step": 3301 + }, + { + "epoch": 0.36261805403030967, + "grad_norm": 2.025031089782715, + "learning_rate": 5e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7008220553398132, + "num_tokens": 85525586.0, + "step": 3302 + }, + { + "epoch": 0.36272787173292337, + "grad_norm": 2.3034932613372803, + "learning_rate": 5e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7070393562316895, + "num_tokens": 85545705.0, + "step": 3303 + }, + { + "epoch": 0.362837689435537, + "grad_norm": 1.826642394065857, + "learning_rate": 5e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7324243187904358, + "num_tokens": 85578657.0, + "step": 3304 + }, + { + "epoch": 0.36294750713815066, + "grad_norm": 1.9186811447143555, + "learning_rate": 5e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.6703631281852722, + "num_tokens": 85611021.0, + "step": 3305 + }, + { + "epoch": 0.3630573248407643, + "grad_norm": 1.9845999479293823, + "learning_rate": 5e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6965124607086182, + "num_tokens": 85634119.0, + "step": 3306 + }, + { + "epoch": 0.363167142543378, + "grad_norm": 2.070956230163574, + "learning_rate": 5e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6875413656234741, + "num_tokens": 85664139.0, + "step": 3307 + }, + { + "epoch": 0.36327696024599165, + "grad_norm": 1.8605674505233765, + "learning_rate": 5e-06, + "loss": 1.0649, + "mean_token_accuracy": 0.6875106692314148, + "num_tokens": 85693855.0, + "step": 3308 + }, + { + "epoch": 0.3633867779486053, + "grad_norm": 2.455394983291626, + "learning_rate": 5e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7084965109825134, + "num_tokens": 85713612.0, + "step": 3309 + }, + { + "epoch": 0.363496595651219, + "grad_norm": 1.9147313833236694, + "learning_rate": 5e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6985067129135132, + "num_tokens": 85742748.0, + "step": 3310 + }, + { + "epoch": 0.36360641335383265, + "grad_norm": 1.9067809581756592, + "learning_rate": 5e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6811449527740479, + "num_tokens": 85772923.0, + "step": 3311 + }, + { + "epoch": 0.3637162310564463, + "grad_norm": 1.8571116924285889, + "learning_rate": 5e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7111847400665283, + "num_tokens": 85800348.0, + "step": 3312 + }, + { + "epoch": 0.36382604875905994, + "grad_norm": 2.054471731185913, + "learning_rate": 5e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7231128811836243, + "num_tokens": 85823144.0, + "step": 3313 + }, + { + "epoch": 0.36393586646167364, + "grad_norm": 2.074866533279419, + "learning_rate": 5e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7185251712799072, + "num_tokens": 85845465.0, + "step": 3314 + }, + { + "epoch": 0.3640456841642873, + "grad_norm": 2.228976249694824, + "learning_rate": 5e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6979786157608032, + "num_tokens": 85868107.0, + "step": 3315 + }, + { + "epoch": 0.36415550186690093, + "grad_norm": 2.2252252101898193, + "learning_rate": 5e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6963281631469727, + "num_tokens": 85889928.0, + "step": 3316 + }, + { + "epoch": 0.3642653195695146, + "grad_norm": 1.8905508518218994, + "learning_rate": 5e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6936167478561401, + "num_tokens": 85918408.0, + "step": 3317 + }, + { + "epoch": 0.3643751372721283, + "grad_norm": 1.8994070291519165, + "learning_rate": 5e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6886399388313293, + "num_tokens": 85948751.0, + "step": 3318 + }, + { + "epoch": 0.3644849549747419, + "grad_norm": 2.1950414180755615, + "learning_rate": 5e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7142793536186218, + "num_tokens": 85968417.0, + "step": 3319 + }, + { + "epoch": 0.36459477267735557, + "grad_norm": 2.059861660003662, + "learning_rate": 5e-06, + "loss": 1.053, + "mean_token_accuracy": 0.679991602897644, + "num_tokens": 85992087.0, + "step": 3320 + }, + { + "epoch": 0.3647045903799693, + "grad_norm": 1.9550435543060303, + "learning_rate": 5e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7164061069488525, + "num_tokens": 86017916.0, + "step": 3321 + }, + { + "epoch": 0.3648144080825829, + "grad_norm": 1.9742286205291748, + "learning_rate": 5e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7159203290939331, + "num_tokens": 86043931.0, + "step": 3322 + }, + { + "epoch": 0.36492422578519657, + "grad_norm": 2.0413055419921875, + "learning_rate": 5e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7023817300796509, + "num_tokens": 86067700.0, + "step": 3323 + }, + { + "epoch": 0.3650340434878102, + "grad_norm": 2.0259602069854736, + "learning_rate": 5e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.723036527633667, + "num_tokens": 86092264.0, + "step": 3324 + }, + { + "epoch": 0.3651438611904239, + "grad_norm": 2.121915340423584, + "learning_rate": 5e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7099592685699463, + "num_tokens": 86115362.0, + "step": 3325 + }, + { + "epoch": 0.36525367889303756, + "grad_norm": 2.088768243789673, + "learning_rate": 5e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7005356550216675, + "num_tokens": 86145219.0, + "step": 3326 + }, + { + "epoch": 0.3653634965956512, + "grad_norm": 2.071038007736206, + "learning_rate": 5e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7016993761062622, + "num_tokens": 86169321.0, + "step": 3327 + }, + { + "epoch": 0.3654733142982649, + "grad_norm": 2.043581962585449, + "learning_rate": 5e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7091350555419922, + "num_tokens": 86195245.0, + "step": 3328 + }, + { + "epoch": 0.36558313200087855, + "grad_norm": 2.0299453735351562, + "learning_rate": 5e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6859289407730103, + "num_tokens": 86219172.0, + "step": 3329 + }, + { + "epoch": 0.3656929497034922, + "grad_norm": 2.2195470333099365, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7089501619338989, + "num_tokens": 86239344.0, + "step": 3330 + }, + { + "epoch": 0.36580276740610584, + "grad_norm": 1.9593850374221802, + "learning_rate": 5e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7108615636825562, + "num_tokens": 86267191.0, + "step": 3331 + }, + { + "epoch": 0.36591258510871955, + "grad_norm": 1.84918212890625, + "learning_rate": 5e-06, + "loss": 1.0907, + "mean_token_accuracy": 0.6708303093910217, + "num_tokens": 86299001.0, + "step": 3332 + }, + { + "epoch": 0.3660224028113332, + "grad_norm": 2.2134101390838623, + "learning_rate": 5e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7175133228302002, + "num_tokens": 86319189.0, + "step": 3333 + }, + { + "epoch": 0.36613222051394684, + "grad_norm": 2.165095567703247, + "learning_rate": 5e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7146693468093872, + "num_tokens": 86340729.0, + "step": 3334 + }, + { + "epoch": 0.3662420382165605, + "grad_norm": 1.9920698404312134, + "learning_rate": 5e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7000426054000854, + "num_tokens": 86367247.0, + "step": 3335 + }, + { + "epoch": 0.3663518559191742, + "grad_norm": 1.9266822338104248, + "learning_rate": 5e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7125054597854614, + "num_tokens": 86393949.0, + "step": 3336 + }, + { + "epoch": 0.36646167362178783, + "grad_norm": 1.9345625638961792, + "learning_rate": 5e-06, + "loss": 1.0778, + "mean_token_accuracy": 0.6805029511451721, + "num_tokens": 86421979.0, + "step": 3337 + }, + { + "epoch": 0.3665714913244015, + "grad_norm": 1.9814578294754028, + "learning_rate": 5e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6928300857543945, + "num_tokens": 86450974.0, + "step": 3338 + }, + { + "epoch": 0.3666813090270152, + "grad_norm": 1.935412049293518, + "learning_rate": 5e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7020220756530762, + "num_tokens": 86477063.0, + "step": 3339 + }, + { + "epoch": 0.3667911267296288, + "grad_norm": 1.9551750421524048, + "learning_rate": 5e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6898964643478394, + "num_tokens": 86505454.0, + "step": 3340 + }, + { + "epoch": 0.36690094443224247, + "grad_norm": 2.2236969470977783, + "learning_rate": 5e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7255886197090149, + "num_tokens": 86525592.0, + "step": 3341 + }, + { + "epoch": 0.3670107621348561, + "grad_norm": 2.3125815391540527, + "learning_rate": 5e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7060667872428894, + "num_tokens": 86545603.0, + "step": 3342 + }, + { + "epoch": 0.3671205798374698, + "grad_norm": 2.1497011184692383, + "learning_rate": 5e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6851032376289368, + "num_tokens": 86571169.0, + "step": 3343 + }, + { + "epoch": 0.36723039754008346, + "grad_norm": 1.908750295639038, + "learning_rate": 5e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.6751060485839844, + "num_tokens": 86599515.0, + "step": 3344 + }, + { + "epoch": 0.3673402152426971, + "grad_norm": 1.8871146440505981, + "learning_rate": 5e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6899242997169495, + "num_tokens": 86629453.0, + "step": 3345 + }, + { + "epoch": 0.3674500329453108, + "grad_norm": 2.04813551902771, + "learning_rate": 5e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.691689133644104, + "num_tokens": 86654791.0, + "step": 3346 + }, + { + "epoch": 0.36755985064792446, + "grad_norm": 1.980605125427246, + "learning_rate": 5e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6933951377868652, + "num_tokens": 86680666.0, + "step": 3347 + }, + { + "epoch": 0.3676696683505381, + "grad_norm": 1.9991462230682373, + "learning_rate": 5e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7081615924835205, + "num_tokens": 86703731.0, + "step": 3348 + }, + { + "epoch": 0.36777948605315175, + "grad_norm": 2.1953001022338867, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6928070783615112, + "num_tokens": 86727554.0, + "step": 3349 + }, + { + "epoch": 0.36788930375576545, + "grad_norm": 1.7999316453933716, + "learning_rate": 5e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7047053575515747, + "num_tokens": 86758656.0, + "step": 3350 + }, + { + "epoch": 0.3679991214583791, + "grad_norm": 1.9119967222213745, + "learning_rate": 5e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6889946460723877, + "num_tokens": 86786234.0, + "step": 3351 + }, + { + "epoch": 0.36810893916099274, + "grad_norm": 2.194514274597168, + "learning_rate": 5e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7021773457527161, + "num_tokens": 86806817.0, + "step": 3352 + }, + { + "epoch": 0.3682187568636064, + "grad_norm": 2.0180728435516357, + "learning_rate": 5e-06, + "loss": 1.0926, + "mean_token_accuracy": 0.6748523116111755, + "num_tokens": 86834629.0, + "step": 3353 + }, + { + "epoch": 0.3683285745662201, + "grad_norm": 1.9573115110397339, + "learning_rate": 5e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6934053897857666, + "num_tokens": 86861625.0, + "step": 3354 + }, + { + "epoch": 0.36843839226883374, + "grad_norm": 1.9119009971618652, + "learning_rate": 5e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.6991108059883118, + "num_tokens": 86890419.0, + "step": 3355 + }, + { + "epoch": 0.3685482099714474, + "grad_norm": 1.7992156744003296, + "learning_rate": 5e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.681922435760498, + "num_tokens": 86922951.0, + "step": 3356 + }, + { + "epoch": 0.3686580276740611, + "grad_norm": 1.9331351518630981, + "learning_rate": 5e-06, + "loss": 1.1134, + "mean_token_accuracy": 0.6680135726928711, + "num_tokens": 86952759.0, + "step": 3357 + }, + { + "epoch": 0.36876784537667473, + "grad_norm": 1.754090666770935, + "learning_rate": 5e-06, + "loss": 1.0726, + "mean_token_accuracy": 0.6771764755249023, + "num_tokens": 86987738.0, + "step": 3358 + }, + { + "epoch": 0.3688776630792884, + "grad_norm": 2.263679027557373, + "learning_rate": 5e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6947327852249146, + "num_tokens": 87007965.0, + "step": 3359 + }, + { + "epoch": 0.368987480781902, + "grad_norm": 1.8144862651824951, + "learning_rate": 5e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.689405620098114, + "num_tokens": 87039537.0, + "step": 3360 + }, + { + "epoch": 0.3690972984845157, + "grad_norm": 1.8565837144851685, + "learning_rate": 5e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7048347592353821, + "num_tokens": 87067594.0, + "step": 3361 + }, + { + "epoch": 0.36920711618712937, + "grad_norm": 2.077791929244995, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7038972973823547, + "num_tokens": 87092717.0, + "step": 3362 + }, + { + "epoch": 0.369316933889743, + "grad_norm": 1.8694062232971191, + "learning_rate": 5e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7070505619049072, + "num_tokens": 87120504.0, + "step": 3363 + }, + { + "epoch": 0.36942675159235666, + "grad_norm": 2.228308916091919, + "learning_rate": 5e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7163744568824768, + "num_tokens": 87140035.0, + "step": 3364 + }, + { + "epoch": 0.36953656929497036, + "grad_norm": 2.254477024078369, + "learning_rate": 5e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.697742223739624, + "num_tokens": 87160668.0, + "step": 3365 + }, + { + "epoch": 0.369646386997584, + "grad_norm": 1.937056303024292, + "learning_rate": 5e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6982567310333252, + "num_tokens": 87186419.0, + "step": 3366 + }, + { + "epoch": 0.36975620470019765, + "grad_norm": 2.0697991847991943, + "learning_rate": 5e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.7032485008239746, + "num_tokens": 87209963.0, + "step": 3367 + }, + { + "epoch": 0.36986602240281136, + "grad_norm": 2.231649160385132, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7148994207382202, + "num_tokens": 87231325.0, + "step": 3368 + }, + { + "epoch": 0.369975840105425, + "grad_norm": 2.2249464988708496, + "learning_rate": 5e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.726518988609314, + "num_tokens": 87251544.0, + "step": 3369 + }, + { + "epoch": 0.37008565780803865, + "grad_norm": 2.20102858543396, + "learning_rate": 5e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6909649968147278, + "num_tokens": 87274369.0, + "step": 3370 + }, + { + "epoch": 0.3701954755106523, + "grad_norm": 1.8765662908554077, + "learning_rate": 5e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7076013088226318, + "num_tokens": 87301955.0, + "step": 3371 + }, + { + "epoch": 0.370305293213266, + "grad_norm": 2.269545316696167, + "learning_rate": 5e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7250512838363647, + "num_tokens": 87320140.0, + "step": 3372 + }, + { + "epoch": 0.37041511091587964, + "grad_norm": 1.963215947151184, + "learning_rate": 5e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7029093503952026, + "num_tokens": 87348122.0, + "step": 3373 + }, + { + "epoch": 0.3705249286184933, + "grad_norm": 2.0451416969299316, + "learning_rate": 5e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7096916437149048, + "num_tokens": 87373755.0, + "step": 3374 + }, + { + "epoch": 0.370634746321107, + "grad_norm": 2.2319204807281494, + "learning_rate": 5e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7308142185211182, + "num_tokens": 87392245.0, + "step": 3375 + }, + { + "epoch": 0.37074456402372064, + "grad_norm": 1.8624517917633057, + "learning_rate": 5e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6824145317077637, + "num_tokens": 87426029.0, + "step": 3376 + }, + { + "epoch": 0.3708543817263343, + "grad_norm": 2.3191049098968506, + "learning_rate": 5e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.701088547706604, + "num_tokens": 87447552.0, + "step": 3377 + }, + { + "epoch": 0.3709641994289479, + "grad_norm": 2.1784963607788086, + "learning_rate": 5e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7210894227027893, + "num_tokens": 87468326.0, + "step": 3378 + }, + { + "epoch": 0.37107401713156163, + "grad_norm": 2.3085734844207764, + "learning_rate": 5e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6958495378494263, + "num_tokens": 87490059.0, + "step": 3379 + }, + { + "epoch": 0.3711838348341753, + "grad_norm": 2.0137112140655518, + "learning_rate": 5e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6814427375793457, + "num_tokens": 87515554.0, + "step": 3380 + }, + { + "epoch": 0.3712936525367889, + "grad_norm": 2.0921623706817627, + "learning_rate": 5e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6814478635787964, + "num_tokens": 87540072.0, + "step": 3381 + }, + { + "epoch": 0.37140347023940257, + "grad_norm": 2.227182626724243, + "learning_rate": 5e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7152037620544434, + "num_tokens": 87563381.0, + "step": 3382 + }, + { + "epoch": 0.37151328794201627, + "grad_norm": 2.054154396057129, + "learning_rate": 5e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6949601173400879, + "num_tokens": 87589514.0, + "step": 3383 + }, + { + "epoch": 0.3716231056446299, + "grad_norm": 1.9475064277648926, + "learning_rate": 5e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7070544958114624, + "num_tokens": 87616314.0, + "step": 3384 + }, + { + "epoch": 0.37173292334724356, + "grad_norm": 1.8553272485733032, + "learning_rate": 5e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7032859325408936, + "num_tokens": 87645603.0, + "step": 3385 + }, + { + "epoch": 0.37184274104985726, + "grad_norm": 2.050891399383545, + "learning_rate": 5e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.735453724861145, + "num_tokens": 87668312.0, + "step": 3386 + }, + { + "epoch": 0.3719525587524709, + "grad_norm": 2.0931711196899414, + "learning_rate": 5e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7094595432281494, + "num_tokens": 87691116.0, + "step": 3387 + }, + { + "epoch": 0.37206237645508455, + "grad_norm": 1.9709768295288086, + "learning_rate": 5e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6883620023727417, + "num_tokens": 87718433.0, + "step": 3388 + }, + { + "epoch": 0.3721721941576982, + "grad_norm": 1.9798821210861206, + "learning_rate": 5e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7026404142379761, + "num_tokens": 87743134.0, + "step": 3389 + }, + { + "epoch": 0.3722820118603119, + "grad_norm": 2.552337884902954, + "learning_rate": 5e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6925160884857178, + "num_tokens": 87768328.0, + "step": 3390 + }, + { + "epoch": 0.37239182956292555, + "grad_norm": 2.0420801639556885, + "learning_rate": 5e-06, + "loss": 1.009, + "mean_token_accuracy": 0.7112659215927124, + "num_tokens": 87793313.0, + "step": 3391 + }, + { + "epoch": 0.3725016472655392, + "grad_norm": 2.098798990249634, + "learning_rate": 5e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.7003898024559021, + "num_tokens": 87817987.0, + "step": 3392 + }, + { + "epoch": 0.37261146496815284, + "grad_norm": 2.2405576705932617, + "learning_rate": 5e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7110536694526672, + "num_tokens": 87839151.0, + "step": 3393 + }, + { + "epoch": 0.37272128267076654, + "grad_norm": 1.9095526933670044, + "learning_rate": 5e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.6984782814979553, + "num_tokens": 87867048.0, + "step": 3394 + }, + { + "epoch": 0.3728311003733802, + "grad_norm": 2.0104613304138184, + "learning_rate": 5e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6967110633850098, + "num_tokens": 87895170.0, + "step": 3395 + }, + { + "epoch": 0.37294091807599383, + "grad_norm": 2.0572056770324707, + "learning_rate": 5e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6805481910705566, + "num_tokens": 87919325.0, + "step": 3396 + }, + { + "epoch": 0.37305073577860753, + "grad_norm": 1.929801106452942, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.741910457611084, + "num_tokens": 87942427.0, + "step": 3397 + }, + { + "epoch": 0.3731605534812212, + "grad_norm": 1.979632019996643, + "learning_rate": 5e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.701088547706604, + "num_tokens": 87969745.0, + "step": 3398 + }, + { + "epoch": 0.3732703711838348, + "grad_norm": 2.074749231338501, + "learning_rate": 5e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7082704305648804, + "num_tokens": 87992947.0, + "step": 3399 + }, + { + "epoch": 0.37338018888644847, + "grad_norm": 1.765660285949707, + "learning_rate": 5e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7034757137298584, + "num_tokens": 88021789.0, + "step": 3400 + }, + { + "epoch": 0.3734900065890622, + "grad_norm": 2.1181037425994873, + "learning_rate": 5e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7130913138389587, + "num_tokens": 88042986.0, + "step": 3401 + }, + { + "epoch": 0.3735998242916758, + "grad_norm": 1.940219521522522, + "learning_rate": 5e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7068969011306763, + "num_tokens": 88072513.0, + "step": 3402 + }, + { + "epoch": 0.37370964199428947, + "grad_norm": 2.0279486179351807, + "learning_rate": 5e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7189177870750427, + "num_tokens": 88096909.0, + "step": 3403 + }, + { + "epoch": 0.37381945969690317, + "grad_norm": 2.4379465579986572, + "learning_rate": 5e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6969934105873108, + "num_tokens": 88114073.0, + "step": 3404 + }, + { + "epoch": 0.3739292773995168, + "grad_norm": 1.9631521701812744, + "learning_rate": 5e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7211297154426575, + "num_tokens": 88140170.0, + "step": 3405 + }, + { + "epoch": 0.37403909510213046, + "grad_norm": 1.8534023761749268, + "learning_rate": 5e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6954426765441895, + "num_tokens": 88168775.0, + "step": 3406 + }, + { + "epoch": 0.3741489128047441, + "grad_norm": 2.039919376373291, + "learning_rate": 5e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6873152852058411, + "num_tokens": 88194656.0, + "step": 3407 + }, + { + "epoch": 0.3742587305073578, + "grad_norm": 2.09057354927063, + "learning_rate": 5e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6896403431892395, + "num_tokens": 88218779.0, + "step": 3408 + }, + { + "epoch": 0.37436854820997145, + "grad_norm": 2.0041134357452393, + "learning_rate": 5e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7101699113845825, + "num_tokens": 88242455.0, + "step": 3409 + }, + { + "epoch": 0.3744783659125851, + "grad_norm": 1.9549421072006226, + "learning_rate": 5e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7169619798660278, + "num_tokens": 88266838.0, + "step": 3410 + }, + { + "epoch": 0.37458818361519874, + "grad_norm": 1.9731981754302979, + "learning_rate": 5e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6819767951965332, + "num_tokens": 88292880.0, + "step": 3411 + }, + { + "epoch": 0.37469800131781245, + "grad_norm": 2.0939695835113525, + "learning_rate": 5e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7063230276107788, + "num_tokens": 88315117.0, + "step": 3412 + }, + { + "epoch": 0.3748078190204261, + "grad_norm": 2.190258026123047, + "learning_rate": 5e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6797625422477722, + "num_tokens": 88338095.0, + "step": 3413 + }, + { + "epoch": 0.37491763672303974, + "grad_norm": 1.9877903461456299, + "learning_rate": 5e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7043966054916382, + "num_tokens": 88362759.0, + "step": 3414 + }, + { + "epoch": 0.37502745442565344, + "grad_norm": 2.1758289337158203, + "learning_rate": 5e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7096436619758606, + "num_tokens": 88382096.0, + "step": 3415 + }, + { + "epoch": 0.3751372721282671, + "grad_norm": 1.9730082750320435, + "learning_rate": 5e-06, + "loss": 0.989, + "mean_token_accuracy": 0.6923114657402039, + "num_tokens": 88407182.0, + "step": 3416 + }, + { + "epoch": 0.37524708983088073, + "grad_norm": 1.9573894739151, + "learning_rate": 5e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7025731801986694, + "num_tokens": 88433227.0, + "step": 3417 + }, + { + "epoch": 0.3753569075334944, + "grad_norm": 1.9068326950073242, + "learning_rate": 5e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6943667531013489, + "num_tokens": 88458430.0, + "step": 3418 + }, + { + "epoch": 0.3754667252361081, + "grad_norm": 1.9881471395492554, + "learning_rate": 5e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7366015911102295, + "num_tokens": 88482112.0, + "step": 3419 + }, + { + "epoch": 0.3755765429387217, + "grad_norm": 1.8337340354919434, + "learning_rate": 5e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6883993148803711, + "num_tokens": 88514471.0, + "step": 3420 + }, + { + "epoch": 0.37568636064133537, + "grad_norm": 2.0212557315826416, + "learning_rate": 5e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6907212734222412, + "num_tokens": 88538734.0, + "step": 3421 + }, + { + "epoch": 0.37579617834394907, + "grad_norm": 2.014228343963623, + "learning_rate": 5e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7191593050956726, + "num_tokens": 88561672.0, + "step": 3422 + }, + { + "epoch": 0.3759059960465627, + "grad_norm": 1.9460614919662476, + "learning_rate": 5e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6914850473403931, + "num_tokens": 88587518.0, + "step": 3423 + }, + { + "epoch": 0.37601581374917636, + "grad_norm": 2.0368924140930176, + "learning_rate": 5e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7176394462585449, + "num_tokens": 88610670.0, + "step": 3424 + }, + { + "epoch": 0.37612563145179, + "grad_norm": 1.9443460702896118, + "learning_rate": 5e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7065889835357666, + "num_tokens": 88634930.0, + "step": 3425 + }, + { + "epoch": 0.3762354491544037, + "grad_norm": 1.8327916860580444, + "learning_rate": 5e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6806532144546509, + "num_tokens": 88666129.0, + "step": 3426 + }, + { + "epoch": 0.37634526685701736, + "grad_norm": 1.9606324434280396, + "learning_rate": 5e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7024261951446533, + "num_tokens": 88692786.0, + "step": 3427 + }, + { + "epoch": 0.376455084559631, + "grad_norm": 1.8313552141189575, + "learning_rate": 5e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6858816146850586, + "num_tokens": 88722682.0, + "step": 3428 + }, + { + "epoch": 0.37656490226224465, + "grad_norm": 2.150499105453491, + "learning_rate": 5e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6818463206291199, + "num_tokens": 88745164.0, + "step": 3429 + }, + { + "epoch": 0.37667471996485835, + "grad_norm": 2.1272902488708496, + "learning_rate": 5e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7165842056274414, + "num_tokens": 88769062.0, + "step": 3430 + }, + { + "epoch": 0.376784537667472, + "grad_norm": 2.369305372238159, + "learning_rate": 5e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.716681718826294, + "num_tokens": 88787708.0, + "step": 3431 + }, + { + "epoch": 0.37689435537008564, + "grad_norm": 2.0369369983673096, + "learning_rate": 5e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6994223594665527, + "num_tokens": 88816217.0, + "step": 3432 + }, + { + "epoch": 0.37700417307269934, + "grad_norm": 1.9391398429870605, + "learning_rate": 5e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6952246427536011, + "num_tokens": 88843485.0, + "step": 3433 + }, + { + "epoch": 0.377113990775313, + "grad_norm": 1.832851767539978, + "learning_rate": 5e-06, + "loss": 1.0796, + "mean_token_accuracy": 0.6819702982902527, + "num_tokens": 88872629.0, + "step": 3434 + }, + { + "epoch": 0.37722380847792664, + "grad_norm": 1.8460588455200195, + "learning_rate": 5e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6999516487121582, + "num_tokens": 88902709.0, + "step": 3435 + }, + { + "epoch": 0.3773336261805403, + "grad_norm": 1.9322882890701294, + "learning_rate": 5e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7001217603683472, + "num_tokens": 88928029.0, + "step": 3436 + }, + { + "epoch": 0.377443443883154, + "grad_norm": 1.9054226875305176, + "learning_rate": 5e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.695981502532959, + "num_tokens": 88956894.0, + "step": 3437 + }, + { + "epoch": 0.37755326158576763, + "grad_norm": 2.126169204711914, + "learning_rate": 5e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7035273313522339, + "num_tokens": 88982889.0, + "step": 3438 + }, + { + "epoch": 0.3776630792883813, + "grad_norm": 2.0579583644866943, + "learning_rate": 5e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6955587863922119, + "num_tokens": 89006833.0, + "step": 3439 + }, + { + "epoch": 0.3777728969909949, + "grad_norm": 2.1694886684417725, + "learning_rate": 5e-06, + "loss": 1.044, + "mean_token_accuracy": 0.6883152723312378, + "num_tokens": 89031094.0, + "step": 3440 + }, + { + "epoch": 0.3778827146936086, + "grad_norm": 1.8244812488555908, + "learning_rate": 5e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6946576833724976, + "num_tokens": 89061887.0, + "step": 3441 + }, + { + "epoch": 0.37799253239622227, + "grad_norm": 2.067939519882202, + "learning_rate": 5e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7160428762435913, + "num_tokens": 89087194.0, + "step": 3442 + }, + { + "epoch": 0.3781023500988359, + "grad_norm": 2.2217071056365967, + "learning_rate": 5e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6876025199890137, + "num_tokens": 89111165.0, + "step": 3443 + }, + { + "epoch": 0.3782121678014496, + "grad_norm": 2.124412775039673, + "learning_rate": 5e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7234561443328857, + "num_tokens": 89134523.0, + "step": 3444 + }, + { + "epoch": 0.37832198550406326, + "grad_norm": 1.8040125370025635, + "learning_rate": 5e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6869790554046631, + "num_tokens": 89167102.0, + "step": 3445 + }, + { + "epoch": 0.3784318032066769, + "grad_norm": 1.9320216178894043, + "learning_rate": 5e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6835180521011353, + "num_tokens": 89193212.0, + "step": 3446 + }, + { + "epoch": 0.37854162090929055, + "grad_norm": 2.1261820793151855, + "learning_rate": 5e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6962190866470337, + "num_tokens": 89215785.0, + "step": 3447 + }, + { + "epoch": 0.37865143861190426, + "grad_norm": 2.0476112365722656, + "learning_rate": 5e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7079221606254578, + "num_tokens": 89239034.0, + "step": 3448 + }, + { + "epoch": 0.3787612563145179, + "grad_norm": 1.7971014976501465, + "learning_rate": 5e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6939480304718018, + "num_tokens": 89267565.0, + "step": 3449 + }, + { + "epoch": 0.37887107401713155, + "grad_norm": 1.9101163148880005, + "learning_rate": 5e-06, + "loss": 1.1247, + "mean_token_accuracy": 0.6677748560905457, + "num_tokens": 89297805.0, + "step": 3450 + }, + { + "epoch": 0.37898089171974525, + "grad_norm": 1.8149036169052124, + "learning_rate": 5e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6926556825637817, + "num_tokens": 89326418.0, + "step": 3451 + }, + { + "epoch": 0.3790907094223589, + "grad_norm": 1.920240879058838, + "learning_rate": 5e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7026773691177368, + "num_tokens": 89352310.0, + "step": 3452 + }, + { + "epoch": 0.37920052712497254, + "grad_norm": 1.9855715036392212, + "learning_rate": 5e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6980472803115845, + "num_tokens": 89383251.0, + "step": 3453 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 1.730363130569458, + "learning_rate": 5e-06, + "loss": 1.0747, + "mean_token_accuracy": 0.6859509944915771, + "num_tokens": 89414260.0, + "step": 3454 + }, + { + "epoch": 0.3794201625301999, + "grad_norm": 1.9993300437927246, + "learning_rate": 5e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7084075212478638, + "num_tokens": 89438468.0, + "step": 3455 + }, + { + "epoch": 0.37952998023281354, + "grad_norm": 2.0087835788726807, + "learning_rate": 5e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6786717772483826, + "num_tokens": 89465235.0, + "step": 3456 + }, + { + "epoch": 0.3796397979354272, + "grad_norm": 2.1004838943481445, + "learning_rate": 5e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7098674178123474, + "num_tokens": 89486900.0, + "step": 3457 + }, + { + "epoch": 0.3797496156380408, + "grad_norm": 2.013705253601074, + "learning_rate": 5e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6808750629425049, + "num_tokens": 89512940.0, + "step": 3458 + }, + { + "epoch": 0.37985943334065453, + "grad_norm": 1.7972766160964966, + "learning_rate": 5e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6925894021987915, + "num_tokens": 89544239.0, + "step": 3459 + }, + { + "epoch": 0.3799692510432682, + "grad_norm": 2.270007610321045, + "learning_rate": 5e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7024746537208557, + "num_tokens": 89565039.0, + "step": 3460 + }, + { + "epoch": 0.3800790687458818, + "grad_norm": 1.857405424118042, + "learning_rate": 5e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7053809762001038, + "num_tokens": 89594213.0, + "step": 3461 + }, + { + "epoch": 0.3801888864484955, + "grad_norm": 2.17311692237854, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.750807523727417, + "num_tokens": 89612975.0, + "step": 3462 + }, + { + "epoch": 0.38029870415110917, + "grad_norm": 2.081556558609009, + "learning_rate": 5e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7019672989845276, + "num_tokens": 89636729.0, + "step": 3463 + }, + { + "epoch": 0.3804085218537228, + "grad_norm": 2.042443037033081, + "learning_rate": 5e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.6981787085533142, + "num_tokens": 89661955.0, + "step": 3464 + }, + { + "epoch": 0.38051833955633646, + "grad_norm": 1.937813639640808, + "learning_rate": 5e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6933190822601318, + "num_tokens": 89689580.0, + "step": 3465 + }, + { + "epoch": 0.38062815725895016, + "grad_norm": 2.089777946472168, + "learning_rate": 5e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7092682123184204, + "num_tokens": 89712351.0, + "step": 3466 + }, + { + "epoch": 0.3807379749615638, + "grad_norm": 1.8872705698013306, + "learning_rate": 5e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.6972216963768005, + "num_tokens": 89739036.0, + "step": 3467 + }, + { + "epoch": 0.38084779266417745, + "grad_norm": 2.110960006713867, + "learning_rate": 5e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7030814290046692, + "num_tokens": 89762969.0, + "step": 3468 + }, + { + "epoch": 0.3809576103667911, + "grad_norm": 1.8389869928359985, + "learning_rate": 5e-06, + "loss": 1.1033, + "mean_token_accuracy": 0.6715657711029053, + "num_tokens": 89796306.0, + "step": 3469 + }, + { + "epoch": 0.3810674280694048, + "grad_norm": 2.3566901683807373, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7233291268348694, + "num_tokens": 89814083.0, + "step": 3470 + }, + { + "epoch": 0.38117724577201845, + "grad_norm": 1.8562582731246948, + "learning_rate": 5e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7047679424285889, + "num_tokens": 89839606.0, + "step": 3471 + }, + { + "epoch": 0.3812870634746321, + "grad_norm": 1.8633840084075928, + "learning_rate": 5e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6935372352600098, + "num_tokens": 89867145.0, + "step": 3472 + }, + { + "epoch": 0.3813968811772458, + "grad_norm": 1.861441969871521, + "learning_rate": 5e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.694059431552887, + "num_tokens": 89896537.0, + "step": 3473 + }, + { + "epoch": 0.38150669887985944, + "grad_norm": 1.9942916631698608, + "learning_rate": 5e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7065978050231934, + "num_tokens": 89921337.0, + "step": 3474 + }, + { + "epoch": 0.3816165165824731, + "grad_norm": 1.9979504346847534, + "learning_rate": 5e-06, + "loss": 1.0773, + "mean_token_accuracy": 0.675115704536438, + "num_tokens": 89949239.0, + "step": 3475 + }, + { + "epoch": 0.38172633428508673, + "grad_norm": 2.229508876800537, + "learning_rate": 5e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7018028497695923, + "num_tokens": 89969254.0, + "step": 3476 + }, + { + "epoch": 0.38183615198770043, + "grad_norm": 2.0151634216308594, + "learning_rate": 5e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7265689373016357, + "num_tokens": 89992184.0, + "step": 3477 + }, + { + "epoch": 0.3819459696903141, + "grad_norm": 2.0025124549865723, + "learning_rate": 5e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6881987452507019, + "num_tokens": 90016702.0, + "step": 3478 + }, + { + "epoch": 0.3820557873929277, + "grad_norm": 2.048907518386841, + "learning_rate": 5e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.739168107509613, + "num_tokens": 90037930.0, + "step": 3479 + }, + { + "epoch": 0.3821656050955414, + "grad_norm": 2.057636260986328, + "learning_rate": 5e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7147098779678345, + "num_tokens": 90059880.0, + "step": 3480 + }, + { + "epoch": 0.3822754227981551, + "grad_norm": 1.997823715209961, + "learning_rate": 5e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6949590444564819, + "num_tokens": 90088834.0, + "step": 3481 + }, + { + "epoch": 0.3823852405007687, + "grad_norm": 1.9322748184204102, + "learning_rate": 5e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6931934952735901, + "num_tokens": 90116992.0, + "step": 3482 + }, + { + "epoch": 0.38249505820338237, + "grad_norm": 2.0179100036621094, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.713658332824707, + "num_tokens": 90141803.0, + "step": 3483 + }, + { + "epoch": 0.38260487590599607, + "grad_norm": 2.217337131500244, + "learning_rate": 5e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7105355858802795, + "num_tokens": 90164909.0, + "step": 3484 + }, + { + "epoch": 0.3827146936086097, + "grad_norm": 1.9684139490127563, + "learning_rate": 5e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6886044144630432, + "num_tokens": 90191389.0, + "step": 3485 + }, + { + "epoch": 0.38282451131122336, + "grad_norm": 2.207623243331909, + "learning_rate": 5e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.6973809003829956, + "num_tokens": 90213735.0, + "step": 3486 + }, + { + "epoch": 0.382934329013837, + "grad_norm": 2.0706305503845215, + "learning_rate": 5e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6943693161010742, + "num_tokens": 90239025.0, + "step": 3487 + }, + { + "epoch": 0.3830441467164507, + "grad_norm": 2.0893776416778564, + "learning_rate": 5e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7131762504577637, + "num_tokens": 90262494.0, + "step": 3488 + }, + { + "epoch": 0.38315396441906435, + "grad_norm": 1.97258460521698, + "learning_rate": 5e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6927455067634583, + "num_tokens": 90289507.0, + "step": 3489 + }, + { + "epoch": 0.383263782121678, + "grad_norm": 1.8146966695785522, + "learning_rate": 5e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6890035271644592, + "num_tokens": 90321379.0, + "step": 3490 + }, + { + "epoch": 0.3833735998242917, + "grad_norm": 1.8424608707427979, + "learning_rate": 5e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6772027611732483, + "num_tokens": 90351261.0, + "step": 3491 + }, + { + "epoch": 0.38348341752690535, + "grad_norm": 1.7800270318984985, + "learning_rate": 5e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6955441236495972, + "num_tokens": 90382165.0, + "step": 3492 + }, + { + "epoch": 0.383593235229519, + "grad_norm": 2.1779208183288574, + "learning_rate": 5e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7038249969482422, + "num_tokens": 90402977.0, + "step": 3493 + }, + { + "epoch": 0.38370305293213264, + "grad_norm": 1.9847948551177979, + "learning_rate": 5e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6969742178916931, + "num_tokens": 90428832.0, + "step": 3494 + }, + { + "epoch": 0.38381287063474634, + "grad_norm": 2.207603693008423, + "learning_rate": 5e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7116858959197998, + "num_tokens": 90448790.0, + "step": 3495 + }, + { + "epoch": 0.38392268833736, + "grad_norm": 2.1200966835021973, + "learning_rate": 5e-06, + "loss": 1.1197, + "mean_token_accuracy": 0.6659378409385681, + "num_tokens": 90475403.0, + "step": 3496 + }, + { + "epoch": 0.38403250603997363, + "grad_norm": 2.0118489265441895, + "learning_rate": 5e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7034597992897034, + "num_tokens": 90499619.0, + "step": 3497 + }, + { + "epoch": 0.38414232374258733, + "grad_norm": 1.9803755283355713, + "learning_rate": 5e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7123801708221436, + "num_tokens": 90529976.0, + "step": 3498 + }, + { + "epoch": 0.384252141445201, + "grad_norm": 2.312330961227417, + "learning_rate": 5e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7088953256607056, + "num_tokens": 90550056.0, + "step": 3499 + }, + { + "epoch": 0.3843619591478146, + "grad_norm": 1.955531120300293, + "learning_rate": 5e-06, + "loss": 0.919, + "mean_token_accuracy": 0.716185450553894, + "num_tokens": 90578482.0, + "step": 3500 + }, + { + "epoch": 0.38447177685042827, + "grad_norm": 2.1749110221862793, + "learning_rate": 5e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6939045786857605, + "num_tokens": 90601770.0, + "step": 3501 + }, + { + "epoch": 0.38458159455304197, + "grad_norm": 2.059537172317505, + "learning_rate": 5e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7046366930007935, + "num_tokens": 90626307.0, + "step": 3502 + }, + { + "epoch": 0.3846914122556556, + "grad_norm": 2.364518880844116, + "learning_rate": 5e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7211155891418457, + "num_tokens": 90643816.0, + "step": 3503 + }, + { + "epoch": 0.38480122995826926, + "grad_norm": 1.9116580486297607, + "learning_rate": 5e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.6845639944076538, + "num_tokens": 90671131.0, + "step": 3504 + }, + { + "epoch": 0.3849110476608829, + "grad_norm": 1.9266691207885742, + "learning_rate": 5e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6790202856063843, + "num_tokens": 90697230.0, + "step": 3505 + }, + { + "epoch": 0.3850208653634966, + "grad_norm": 1.8178504705429077, + "learning_rate": 5e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6889246106147766, + "num_tokens": 90725870.0, + "step": 3506 + }, + { + "epoch": 0.38513068306611026, + "grad_norm": 1.749586582183838, + "learning_rate": 5e-06, + "loss": 1.1004, + "mean_token_accuracy": 0.6733672618865967, + "num_tokens": 90756992.0, + "step": 3507 + }, + { + "epoch": 0.3852405007687239, + "grad_norm": 1.7856873273849487, + "learning_rate": 5e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7061476707458496, + "num_tokens": 90785825.0, + "step": 3508 + }, + { + "epoch": 0.3853503184713376, + "grad_norm": 1.8703199625015259, + "learning_rate": 5e-06, + "loss": 1.08, + "mean_token_accuracy": 0.671619713306427, + "num_tokens": 90816913.0, + "step": 3509 + }, + { + "epoch": 0.38546013617395125, + "grad_norm": 2.0545620918273926, + "learning_rate": 5e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.6964131593704224, + "num_tokens": 90840127.0, + "step": 3510 + }, + { + "epoch": 0.3855699538765649, + "grad_norm": 1.8768110275268555, + "learning_rate": 5e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6954010128974915, + "num_tokens": 90865959.0, + "step": 3511 + }, + { + "epoch": 0.38567977157917854, + "grad_norm": 1.9593820571899414, + "learning_rate": 5e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7001072764396667, + "num_tokens": 90891623.0, + "step": 3512 + }, + { + "epoch": 0.38578958928179224, + "grad_norm": 2.330193042755127, + "learning_rate": 5e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7035477161407471, + "num_tokens": 90909845.0, + "step": 3513 + }, + { + "epoch": 0.3858994069844059, + "grad_norm": 2.1281464099884033, + "learning_rate": 5e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7010043859481812, + "num_tokens": 90932349.0, + "step": 3514 + }, + { + "epoch": 0.38600922468701954, + "grad_norm": 2.055480480194092, + "learning_rate": 5e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6947430968284607, + "num_tokens": 90957461.0, + "step": 3515 + }, + { + "epoch": 0.3861190423896332, + "grad_norm": 2.1049985885620117, + "learning_rate": 5e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7114108800888062, + "num_tokens": 90980673.0, + "step": 3516 + }, + { + "epoch": 0.3862288600922469, + "grad_norm": 2.0190043449401855, + "learning_rate": 5e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.707445502281189, + "num_tokens": 91007268.0, + "step": 3517 + }, + { + "epoch": 0.38633867779486053, + "grad_norm": 1.9317156076431274, + "learning_rate": 5e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.691279411315918, + "num_tokens": 91033743.0, + "step": 3518 + }, + { + "epoch": 0.3864484954974742, + "grad_norm": 1.6826509237289429, + "learning_rate": 5e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7114551067352295, + "num_tokens": 91065395.0, + "step": 3519 + }, + { + "epoch": 0.3865583132000879, + "grad_norm": 1.8716697692871094, + "learning_rate": 5e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7256695032119751, + "num_tokens": 91092593.0, + "step": 3520 + }, + { + "epoch": 0.3866681309027015, + "grad_norm": 2.033902645111084, + "learning_rate": 5e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7134302258491516, + "num_tokens": 91116992.0, + "step": 3521 + }, + { + "epoch": 0.38677794860531517, + "grad_norm": 2.230727195739746, + "learning_rate": 5e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.698765218257904, + "num_tokens": 91138626.0, + "step": 3522 + }, + { + "epoch": 0.3868877663079288, + "grad_norm": 1.9415817260742188, + "learning_rate": 5e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6963762044906616, + "num_tokens": 91168407.0, + "step": 3523 + }, + { + "epoch": 0.3869975840105425, + "grad_norm": 2.0997326374053955, + "learning_rate": 5e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6836186647415161, + "num_tokens": 91192523.0, + "step": 3524 + }, + { + "epoch": 0.38710740171315616, + "grad_norm": 2.3429784774780273, + "learning_rate": 5e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7078113555908203, + "num_tokens": 91212379.0, + "step": 3525 + }, + { + "epoch": 0.3872172194157698, + "grad_norm": 2.2631044387817383, + "learning_rate": 5e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7005794048309326, + "num_tokens": 91233722.0, + "step": 3526 + }, + { + "epoch": 0.3873270371183835, + "grad_norm": 2.1454503536224365, + "learning_rate": 5e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.685265064239502, + "num_tokens": 91258403.0, + "step": 3527 + }, + { + "epoch": 0.38743685482099716, + "grad_norm": 2.1621978282928467, + "learning_rate": 5e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7052996158599854, + "num_tokens": 91279555.0, + "step": 3528 + }, + { + "epoch": 0.3875466725236108, + "grad_norm": 2.042968511581421, + "learning_rate": 5e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.6990291476249695, + "num_tokens": 91304854.0, + "step": 3529 + }, + { + "epoch": 0.38765649022622445, + "grad_norm": 2.272094249725342, + "learning_rate": 5e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.689976692199707, + "num_tokens": 91327437.0, + "step": 3530 + }, + { + "epoch": 0.38776630792883815, + "grad_norm": 1.847188949584961, + "learning_rate": 5e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6807447075843811, + "num_tokens": 91358384.0, + "step": 3531 + }, + { + "epoch": 0.3878761256314518, + "grad_norm": 2.0826964378356934, + "learning_rate": 5e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6909064054489136, + "num_tokens": 91382145.0, + "step": 3532 + }, + { + "epoch": 0.38798594333406544, + "grad_norm": 1.787561297416687, + "learning_rate": 5e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6975072622299194, + "num_tokens": 91410861.0, + "step": 3533 + }, + { + "epoch": 0.3880957610366791, + "grad_norm": 1.8452244997024536, + "learning_rate": 5e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6941264867782593, + "num_tokens": 91442625.0, + "step": 3534 + }, + { + "epoch": 0.3882055787392928, + "grad_norm": 1.7897028923034668, + "learning_rate": 5e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6884145140647888, + "num_tokens": 91474754.0, + "step": 3535 + }, + { + "epoch": 0.38831539644190644, + "grad_norm": 1.9361262321472168, + "learning_rate": 5e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.703182578086853, + "num_tokens": 91501248.0, + "step": 3536 + }, + { + "epoch": 0.3884252141445201, + "grad_norm": 1.8857591152191162, + "learning_rate": 5e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7162636518478394, + "num_tokens": 91527971.0, + "step": 3537 + }, + { + "epoch": 0.3885350318471338, + "grad_norm": 1.9366552829742432, + "learning_rate": 5e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7032873630523682, + "num_tokens": 91553863.0, + "step": 3538 + }, + { + "epoch": 0.38864484954974743, + "grad_norm": 1.9821598529815674, + "learning_rate": 5e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7088035345077515, + "num_tokens": 91579998.0, + "step": 3539 + }, + { + "epoch": 0.3887546672523611, + "grad_norm": 2.09836745262146, + "learning_rate": 5e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7234673500061035, + "num_tokens": 91602468.0, + "step": 3540 + }, + { + "epoch": 0.3888644849549747, + "grad_norm": 1.8589290380477905, + "learning_rate": 5e-06, + "loss": 1.1108, + "mean_token_accuracy": 0.6773033142089844, + "num_tokens": 91631190.0, + "step": 3541 + }, + { + "epoch": 0.3889743026575884, + "grad_norm": 2.15531063079834, + "learning_rate": 5e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7322432398796082, + "num_tokens": 91653856.0, + "step": 3542 + }, + { + "epoch": 0.38908412036020207, + "grad_norm": 1.9738181829452515, + "learning_rate": 5e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7163459062576294, + "num_tokens": 91677476.0, + "step": 3543 + }, + { + "epoch": 0.3891939380628157, + "grad_norm": 2.2167885303497314, + "learning_rate": 5e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7317972183227539, + "num_tokens": 91698401.0, + "step": 3544 + }, + { + "epoch": 0.38930375576542936, + "grad_norm": 1.8776514530181885, + "learning_rate": 5e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.702802300453186, + "num_tokens": 91729494.0, + "step": 3545 + }, + { + "epoch": 0.38941357346804306, + "grad_norm": 1.8907405138015747, + "learning_rate": 5e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7044346332550049, + "num_tokens": 91757235.0, + "step": 3546 + }, + { + "epoch": 0.3895233911706567, + "grad_norm": 1.8374295234680176, + "learning_rate": 5e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6952124834060669, + "num_tokens": 91785711.0, + "step": 3547 + }, + { + "epoch": 0.38963320887327035, + "grad_norm": 1.8880696296691895, + "learning_rate": 5e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.693009078502655, + "num_tokens": 91814309.0, + "step": 3548 + }, + { + "epoch": 0.38974302657588406, + "grad_norm": 2.221775770187378, + "learning_rate": 5e-06, + "loss": 1.1501, + "mean_token_accuracy": 0.6695991158485413, + "num_tokens": 91840486.0, + "step": 3549 + }, + { + "epoch": 0.3898528442784977, + "grad_norm": 1.846632719039917, + "learning_rate": 5e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6816602349281311, + "num_tokens": 91868267.0, + "step": 3550 + }, + { + "epoch": 0.38996266198111135, + "grad_norm": 2.011309862136841, + "learning_rate": 5e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6831139326095581, + "num_tokens": 91892361.0, + "step": 3551 + }, + { + "epoch": 0.390072479683725, + "grad_norm": 2.2554516792297363, + "learning_rate": 5e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7008424997329712, + "num_tokens": 91913007.0, + "step": 3552 + }, + { + "epoch": 0.3901822973863387, + "grad_norm": 2.0099503993988037, + "learning_rate": 5e-06, + "loss": 1.0707, + "mean_token_accuracy": 0.6857770085334778, + "num_tokens": 91939161.0, + "step": 3553 + }, + { + "epoch": 0.39029211508895234, + "grad_norm": 2.0243189334869385, + "learning_rate": 5e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6986114978790283, + "num_tokens": 91963675.0, + "step": 3554 + }, + { + "epoch": 0.390401932791566, + "grad_norm": 1.9970433712005615, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7307853698730469, + "num_tokens": 91985120.0, + "step": 3555 + }, + { + "epoch": 0.3905117504941797, + "grad_norm": 1.856498122215271, + "learning_rate": 5e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7090187072753906, + "num_tokens": 92011694.0, + "step": 3556 + }, + { + "epoch": 0.39062156819679333, + "grad_norm": 1.6795742511749268, + "learning_rate": 5e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.717325747013092, + "num_tokens": 92042614.0, + "step": 3557 + }, + { + "epoch": 0.390731385899407, + "grad_norm": 1.8544312715530396, + "learning_rate": 5e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7114626169204712, + "num_tokens": 92070055.0, + "step": 3558 + }, + { + "epoch": 0.3908412036020206, + "grad_norm": 1.965846300125122, + "learning_rate": 5e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.699282705783844, + "num_tokens": 92096312.0, + "step": 3559 + }, + { + "epoch": 0.3909510213046343, + "grad_norm": 1.9499685764312744, + "learning_rate": 5e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6976438760757446, + "num_tokens": 92124043.0, + "step": 3560 + }, + { + "epoch": 0.391060839007248, + "grad_norm": 2.055772542953491, + "learning_rate": 5e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7078074812889099, + "num_tokens": 92148872.0, + "step": 3561 + }, + { + "epoch": 0.3911706567098616, + "grad_norm": 2.101503372192383, + "learning_rate": 5e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6925573348999023, + "num_tokens": 92172835.0, + "step": 3562 + }, + { + "epoch": 0.39128047441247527, + "grad_norm": 2.1991963386535645, + "learning_rate": 5e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7073467969894409, + "num_tokens": 92194721.0, + "step": 3563 + }, + { + "epoch": 0.39139029211508897, + "grad_norm": 2.003251075744629, + "learning_rate": 5e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7198014259338379, + "num_tokens": 92217431.0, + "step": 3564 + }, + { + "epoch": 0.3915001098177026, + "grad_norm": 2.1533820629119873, + "learning_rate": 5e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6935124397277832, + "num_tokens": 92241369.0, + "step": 3565 + }, + { + "epoch": 0.39160992752031626, + "grad_norm": 1.9827333688735962, + "learning_rate": 5e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7072107195854187, + "num_tokens": 92266137.0, + "step": 3566 + }, + { + "epoch": 0.39171974522292996, + "grad_norm": 1.875352144241333, + "learning_rate": 5e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7105570435523987, + "num_tokens": 92293887.0, + "step": 3567 + }, + { + "epoch": 0.3918295629255436, + "grad_norm": 1.9232475757598877, + "learning_rate": 5e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6820678114891052, + "num_tokens": 92321188.0, + "step": 3568 + }, + { + "epoch": 0.39193938062815725, + "grad_norm": 2.101478338241577, + "learning_rate": 5e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7082011103630066, + "num_tokens": 92345553.0, + "step": 3569 + }, + { + "epoch": 0.3920491983307709, + "grad_norm": 1.9897598028182983, + "learning_rate": 5e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7045153379440308, + "num_tokens": 92373465.0, + "step": 3570 + }, + { + "epoch": 0.3921590160333846, + "grad_norm": 1.9830936193466187, + "learning_rate": 5e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.6814529895782471, + "num_tokens": 92399947.0, + "step": 3571 + }, + { + "epoch": 0.39226883373599825, + "grad_norm": 1.9416730403900146, + "learning_rate": 5e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7109541893005371, + "num_tokens": 92423111.0, + "step": 3572 + }, + { + "epoch": 0.3923786514386119, + "grad_norm": 1.9781975746154785, + "learning_rate": 5e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6866426467895508, + "num_tokens": 92446644.0, + "step": 3573 + }, + { + "epoch": 0.3924884691412256, + "grad_norm": 1.9064140319824219, + "learning_rate": 5e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.69126296043396, + "num_tokens": 92473373.0, + "step": 3574 + }, + { + "epoch": 0.39259828684383924, + "grad_norm": 2.0599215030670166, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7165883183479309, + "num_tokens": 92495004.0, + "step": 3575 + }, + { + "epoch": 0.3927081045464529, + "grad_norm": 1.815698504447937, + "learning_rate": 5e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6891965270042419, + "num_tokens": 92524651.0, + "step": 3576 + }, + { + "epoch": 0.39281792224906653, + "grad_norm": 1.7932356595993042, + "learning_rate": 5e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7010937929153442, + "num_tokens": 92554880.0, + "step": 3577 + }, + { + "epoch": 0.39292773995168023, + "grad_norm": 2.132270336151123, + "learning_rate": 5e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6987971067428589, + "num_tokens": 92577479.0, + "step": 3578 + }, + { + "epoch": 0.3930375576542939, + "grad_norm": 2.207761287689209, + "learning_rate": 5e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6988062858581543, + "num_tokens": 92597690.0, + "step": 3579 + }, + { + "epoch": 0.3931473753569075, + "grad_norm": 1.8630635738372803, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7049117088317871, + "num_tokens": 92624103.0, + "step": 3580 + }, + { + "epoch": 0.39325719305952117, + "grad_norm": 1.895917534828186, + "learning_rate": 5e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6861813068389893, + "num_tokens": 92651066.0, + "step": 3581 + }, + { + "epoch": 0.39336701076213487, + "grad_norm": 2.00395131111145, + "learning_rate": 5e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6882580518722534, + "num_tokens": 92675001.0, + "step": 3582 + }, + { + "epoch": 0.3934768284647485, + "grad_norm": 1.777087926864624, + "learning_rate": 5e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7033911347389221, + "num_tokens": 92703629.0, + "step": 3583 + }, + { + "epoch": 0.39358664616736216, + "grad_norm": 2.102538824081421, + "learning_rate": 5e-06, + "loss": 0.999, + "mean_token_accuracy": 0.694861650466919, + "num_tokens": 92726934.0, + "step": 3584 + }, + { + "epoch": 0.39369646386997587, + "grad_norm": 1.8612945079803467, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7025901079177856, + "num_tokens": 92753565.0, + "step": 3585 + }, + { + "epoch": 0.3938062815725895, + "grad_norm": 2.157240867614746, + "learning_rate": 5e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7031422853469849, + "num_tokens": 92775063.0, + "step": 3586 + }, + { + "epoch": 0.39391609927520316, + "grad_norm": 2.2408175468444824, + "learning_rate": 5e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7081637382507324, + "num_tokens": 92794837.0, + "step": 3587 + }, + { + "epoch": 0.3940259169778168, + "grad_norm": 1.9299503564834595, + "learning_rate": 5e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7030185461044312, + "num_tokens": 92819281.0, + "step": 3588 + }, + { + "epoch": 0.3941357346804305, + "grad_norm": 1.911668062210083, + "learning_rate": 5e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7021489143371582, + "num_tokens": 92848133.0, + "step": 3589 + }, + { + "epoch": 0.39424555238304415, + "grad_norm": 1.9605416059494019, + "learning_rate": 5e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7067151069641113, + "num_tokens": 92873991.0, + "step": 3590 + }, + { + "epoch": 0.3943553700856578, + "grad_norm": 1.863853931427002, + "learning_rate": 5e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6999378204345703, + "num_tokens": 92900600.0, + "step": 3591 + }, + { + "epoch": 0.39446518778827144, + "grad_norm": 2.164140462875366, + "learning_rate": 5e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7254531979560852, + "num_tokens": 92920147.0, + "step": 3592 + }, + { + "epoch": 0.39457500549088514, + "grad_norm": 2.019266366958618, + "learning_rate": 5e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.7001277804374695, + "num_tokens": 92945370.0, + "step": 3593 + }, + { + "epoch": 0.3946848231934988, + "grad_norm": 1.998831033706665, + "learning_rate": 5e-06, + "loss": 0.972, + "mean_token_accuracy": 0.701839029788971, + "num_tokens": 92969974.0, + "step": 3594 + }, + { + "epoch": 0.39479464089611244, + "grad_norm": 2.0506527423858643, + "learning_rate": 5e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6945521831512451, + "num_tokens": 92995534.0, + "step": 3595 + }, + { + "epoch": 0.39490445859872614, + "grad_norm": 1.795583963394165, + "learning_rate": 5e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7190043330192566, + "num_tokens": 93024646.0, + "step": 3596 + }, + { + "epoch": 0.3950142763013398, + "grad_norm": 1.966966152191162, + "learning_rate": 5e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6942922472953796, + "num_tokens": 93050562.0, + "step": 3597 + }, + { + "epoch": 0.39512409400395343, + "grad_norm": 2.142328977584839, + "learning_rate": 5e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7125459313392639, + "num_tokens": 93072228.0, + "step": 3598 + }, + { + "epoch": 0.3952339117065671, + "grad_norm": 2.086437463760376, + "learning_rate": 5e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6871461272239685, + "num_tokens": 93099202.0, + "step": 3599 + }, + { + "epoch": 0.3953437294091808, + "grad_norm": 1.916297435760498, + "learning_rate": 5e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.693394124507904, + "num_tokens": 93128966.0, + "step": 3600 + }, + { + "epoch": 0.3954535471117944, + "grad_norm": 1.9402111768722534, + "learning_rate": 5e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6966906785964966, + "num_tokens": 93155081.0, + "step": 3601 + }, + { + "epoch": 0.39556336481440807, + "grad_norm": 1.8900752067565918, + "learning_rate": 5e-06, + "loss": 1.0882, + "mean_token_accuracy": 0.685797929763794, + "num_tokens": 93182746.0, + "step": 3602 + }, + { + "epoch": 0.39567318251702177, + "grad_norm": 1.9367964267730713, + "learning_rate": 5e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.716480016708374, + "num_tokens": 93209602.0, + "step": 3603 + }, + { + "epoch": 0.3957830002196354, + "grad_norm": 2.0710434913635254, + "learning_rate": 5e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6865181922912598, + "num_tokens": 93239723.0, + "step": 3604 + }, + { + "epoch": 0.39589281792224906, + "grad_norm": 1.8412895202636719, + "learning_rate": 5e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6886342763900757, + "num_tokens": 93270447.0, + "step": 3605 + }, + { + "epoch": 0.3960026356248627, + "grad_norm": 1.789982795715332, + "learning_rate": 5e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.701073408126831, + "num_tokens": 93301563.0, + "step": 3606 + }, + { + "epoch": 0.3961124533274764, + "grad_norm": 1.9950705766677856, + "learning_rate": 5e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6832462549209595, + "num_tokens": 93328548.0, + "step": 3607 + }, + { + "epoch": 0.39622227103009006, + "grad_norm": 2.1204841136932373, + "learning_rate": 5e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.702318549156189, + "num_tokens": 93354069.0, + "step": 3608 + }, + { + "epoch": 0.3963320887327037, + "grad_norm": 1.9688827991485596, + "learning_rate": 5e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6816729307174683, + "num_tokens": 93384242.0, + "step": 3609 + }, + { + "epoch": 0.39644190643531735, + "grad_norm": 1.886683702468872, + "learning_rate": 5e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6843988299369812, + "num_tokens": 93413715.0, + "step": 3610 + }, + { + "epoch": 0.39655172413793105, + "grad_norm": 2.0718557834625244, + "learning_rate": 5e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7017847299575806, + "num_tokens": 93436513.0, + "step": 3611 + }, + { + "epoch": 0.3966615418405447, + "grad_norm": 1.798830270767212, + "learning_rate": 5e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6813624501228333, + "num_tokens": 93467161.0, + "step": 3612 + }, + { + "epoch": 0.39677135954315834, + "grad_norm": 1.718147873878479, + "learning_rate": 5e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6908148527145386, + "num_tokens": 93498108.0, + "step": 3613 + }, + { + "epoch": 0.39688117724577204, + "grad_norm": 2.225206136703491, + "learning_rate": 5e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.708742618560791, + "num_tokens": 93521211.0, + "step": 3614 + }, + { + "epoch": 0.3969909949483857, + "grad_norm": 2.174253463745117, + "learning_rate": 5e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6859108805656433, + "num_tokens": 93543642.0, + "step": 3615 + }, + { + "epoch": 0.39710081265099934, + "grad_norm": 2.0149176120758057, + "learning_rate": 5e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.688960075378418, + "num_tokens": 93567917.0, + "step": 3616 + }, + { + "epoch": 0.397210630353613, + "grad_norm": 1.9533578157424927, + "learning_rate": 5e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.679036021232605, + "num_tokens": 93595748.0, + "step": 3617 + }, + { + "epoch": 0.3973204480562267, + "grad_norm": 1.9017118215560913, + "learning_rate": 5e-06, + "loss": 1.1123, + "mean_token_accuracy": 0.6754132509231567, + "num_tokens": 93625804.0, + "step": 3618 + }, + { + "epoch": 0.39743026575884033, + "grad_norm": 2.1169397830963135, + "learning_rate": 5e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7187187671661377, + "num_tokens": 93647083.0, + "step": 3619 + }, + { + "epoch": 0.397540083461454, + "grad_norm": 2.0854904651641846, + "learning_rate": 5e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.702070415019989, + "num_tokens": 93670771.0, + "step": 3620 + }, + { + "epoch": 0.3976499011640676, + "grad_norm": 2.3932945728302, + "learning_rate": 5e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6816883683204651, + "num_tokens": 93691629.0, + "step": 3621 + }, + { + "epoch": 0.3977597188666813, + "grad_norm": 2.0935189723968506, + "learning_rate": 5e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.7004801034927368, + "num_tokens": 93713794.0, + "step": 3622 + }, + { + "epoch": 0.39786953656929497, + "grad_norm": 1.8864221572875977, + "learning_rate": 5e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.708147406578064, + "num_tokens": 93741559.0, + "step": 3623 + }, + { + "epoch": 0.3979793542719086, + "grad_norm": 2.021829843521118, + "learning_rate": 5e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7109376192092896, + "num_tokens": 93766639.0, + "step": 3624 + }, + { + "epoch": 0.3980891719745223, + "grad_norm": 1.8738727569580078, + "learning_rate": 5e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6960980892181396, + "num_tokens": 93793117.0, + "step": 3625 + }, + { + "epoch": 0.39819898967713596, + "grad_norm": 1.877729892730713, + "learning_rate": 5e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6916764974594116, + "num_tokens": 93820275.0, + "step": 3626 + }, + { + "epoch": 0.3983088073797496, + "grad_norm": 1.922964096069336, + "learning_rate": 5e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7004340887069702, + "num_tokens": 93845967.0, + "step": 3627 + }, + { + "epoch": 0.39841862508236325, + "grad_norm": 1.8798941373825073, + "learning_rate": 5e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7293625473976135, + "num_tokens": 93871139.0, + "step": 3628 + }, + { + "epoch": 0.39852844278497696, + "grad_norm": 1.9712059497833252, + "learning_rate": 5e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7127978801727295, + "num_tokens": 93894781.0, + "step": 3629 + }, + { + "epoch": 0.3986382604875906, + "grad_norm": 1.6911109685897827, + "learning_rate": 5e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6963709592819214, + "num_tokens": 93927624.0, + "step": 3630 + }, + { + "epoch": 0.39874807819020425, + "grad_norm": 2.0834238529205322, + "learning_rate": 5e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6935830116271973, + "num_tokens": 93950308.0, + "step": 3631 + }, + { + "epoch": 0.39885789589281795, + "grad_norm": 1.7712661027908325, + "learning_rate": 5e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.697170615196228, + "num_tokens": 93979268.0, + "step": 3632 + }, + { + "epoch": 0.3989677135954316, + "grad_norm": 1.9890202283859253, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7134431600570679, + "num_tokens": 94003099.0, + "step": 3633 + }, + { + "epoch": 0.39907753129804524, + "grad_norm": 2.12896728515625, + "learning_rate": 5e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7219836711883545, + "num_tokens": 94026148.0, + "step": 3634 + }, + { + "epoch": 0.3991873490006589, + "grad_norm": 2.128643035888672, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7140116691589355, + "num_tokens": 94047971.0, + "step": 3635 + }, + { + "epoch": 0.3992971667032726, + "grad_norm": 2.2966508865356445, + "learning_rate": 5e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7305975556373596, + "num_tokens": 94067954.0, + "step": 3636 + }, + { + "epoch": 0.39940698440588623, + "grad_norm": 2.244067907333374, + "learning_rate": 5e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7259286046028137, + "num_tokens": 94086968.0, + "step": 3637 + }, + { + "epoch": 0.3995168021084999, + "grad_norm": 1.8705986738204956, + "learning_rate": 5e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6979113817214966, + "num_tokens": 94115045.0, + "step": 3638 + }, + { + "epoch": 0.3996266198111135, + "grad_norm": 2.0476603507995605, + "learning_rate": 5e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7125404477119446, + "num_tokens": 94137356.0, + "step": 3639 + }, + { + "epoch": 0.3997364375137272, + "grad_norm": 2.1114487648010254, + "learning_rate": 5e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7232734560966492, + "num_tokens": 94161908.0, + "step": 3640 + }, + { + "epoch": 0.3998462552163409, + "grad_norm": 1.9686485528945923, + "learning_rate": 5e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6875855326652527, + "num_tokens": 94189061.0, + "step": 3641 + }, + { + "epoch": 0.3999560729189545, + "grad_norm": 2.0945887565612793, + "learning_rate": 5e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6838595271110535, + "num_tokens": 94213336.0, + "step": 3642 + }, + { + "epoch": 0.4000658906215682, + "grad_norm": 1.8913965225219727, + "learning_rate": 5e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6796854734420776, + "num_tokens": 94243348.0, + "step": 3643 + }, + { + "epoch": 0.40017570832418187, + "grad_norm": 2.0851125717163086, + "learning_rate": 5e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7039336562156677, + "num_tokens": 94266237.0, + "step": 3644 + }, + { + "epoch": 0.4002855260267955, + "grad_norm": 1.782180905342102, + "learning_rate": 5e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7213263511657715, + "num_tokens": 94293557.0, + "step": 3645 + }, + { + "epoch": 0.40039534372940916, + "grad_norm": 2.0143306255340576, + "learning_rate": 5e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6974656581878662, + "num_tokens": 94319308.0, + "step": 3646 + }, + { + "epoch": 0.40050516143202286, + "grad_norm": 1.9561244249343872, + "learning_rate": 5e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6848874092102051, + "num_tokens": 94345574.0, + "step": 3647 + }, + { + "epoch": 0.4006149791346365, + "grad_norm": 2.205092430114746, + "learning_rate": 5e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7082740068435669, + "num_tokens": 94367195.0, + "step": 3648 + }, + { + "epoch": 0.40072479683725015, + "grad_norm": 2.1374757289886475, + "learning_rate": 5e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7290775775909424, + "num_tokens": 94386386.0, + "step": 3649 + }, + { + "epoch": 0.4008346145398638, + "grad_norm": 1.966662883758545, + "learning_rate": 5e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7068933844566345, + "num_tokens": 94412740.0, + "step": 3650 + }, + { + "epoch": 0.4009444322424775, + "grad_norm": 2.0720362663269043, + "learning_rate": 5e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7157590389251709, + "num_tokens": 94433897.0, + "step": 3651 + }, + { + "epoch": 0.40105424994509115, + "grad_norm": 1.9982106685638428, + "learning_rate": 5e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6929251551628113, + "num_tokens": 94460372.0, + "step": 3652 + }, + { + "epoch": 0.4011640676477048, + "grad_norm": 1.9228246212005615, + "learning_rate": 5e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6800940036773682, + "num_tokens": 94487439.0, + "step": 3653 + }, + { + "epoch": 0.4012738853503185, + "grad_norm": 2.152000665664673, + "learning_rate": 5e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6911520957946777, + "num_tokens": 94511517.0, + "step": 3654 + }, + { + "epoch": 0.40138370305293214, + "grad_norm": 2.1008570194244385, + "learning_rate": 5e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7278298139572144, + "num_tokens": 94532980.0, + "step": 3655 + }, + { + "epoch": 0.4014935207555458, + "grad_norm": 1.8922672271728516, + "learning_rate": 5e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7057113647460938, + "num_tokens": 94560621.0, + "step": 3656 + }, + { + "epoch": 0.40160333845815943, + "grad_norm": 2.1278014183044434, + "learning_rate": 5e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6963781714439392, + "num_tokens": 94585063.0, + "step": 3657 + }, + { + "epoch": 0.40171315616077313, + "grad_norm": 2.0778961181640625, + "learning_rate": 5e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7019788026809692, + "num_tokens": 94609251.0, + "step": 3658 + }, + { + "epoch": 0.4018229738633868, + "grad_norm": 1.858763575553894, + "learning_rate": 5e-06, + "loss": 1.1089, + "mean_token_accuracy": 0.672469973564148, + "num_tokens": 94642903.0, + "step": 3659 + }, + { + "epoch": 0.4019327915660004, + "grad_norm": 2.1316394805908203, + "learning_rate": 5e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7163065671920776, + "num_tokens": 94664896.0, + "step": 3660 + }, + { + "epoch": 0.4020426092686141, + "grad_norm": 2.053943395614624, + "learning_rate": 5e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7132480144500732, + "num_tokens": 94688433.0, + "step": 3661 + }, + { + "epoch": 0.40215242697122777, + "grad_norm": 2.029762029647827, + "learning_rate": 5e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6978834867477417, + "num_tokens": 94714608.0, + "step": 3662 + }, + { + "epoch": 0.4022622446738414, + "grad_norm": 2.1912667751312256, + "learning_rate": 5e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.699982762336731, + "num_tokens": 94737374.0, + "step": 3663 + }, + { + "epoch": 0.40237206237645506, + "grad_norm": 2.1301791667938232, + "learning_rate": 5e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7044004201889038, + "num_tokens": 94759639.0, + "step": 3664 + }, + { + "epoch": 0.40248188007906877, + "grad_norm": 1.8737797737121582, + "learning_rate": 5e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7107871770858765, + "num_tokens": 94786020.0, + "step": 3665 + }, + { + "epoch": 0.4025916977816824, + "grad_norm": 1.942496657371521, + "learning_rate": 5e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7128461599349976, + "num_tokens": 94811089.0, + "step": 3666 + }, + { + "epoch": 0.40270151548429606, + "grad_norm": 1.9150444269180298, + "learning_rate": 5e-06, + "loss": 1.0972, + "mean_token_accuracy": 0.6759079694747925, + "num_tokens": 94842893.0, + "step": 3667 + }, + { + "epoch": 0.4028113331869097, + "grad_norm": 1.879067301750183, + "learning_rate": 5e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6853350400924683, + "num_tokens": 94872800.0, + "step": 3668 + }, + { + "epoch": 0.4029211508895234, + "grad_norm": 1.9386353492736816, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.708418071269989, + "num_tokens": 94897198.0, + "step": 3669 + }, + { + "epoch": 0.40303096859213705, + "grad_norm": 2.0614051818847656, + "learning_rate": 5e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.6977476477622986, + "num_tokens": 94920340.0, + "step": 3670 + }, + { + "epoch": 0.4031407862947507, + "grad_norm": 1.9229727983474731, + "learning_rate": 5e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7017596960067749, + "num_tokens": 94945364.0, + "step": 3671 + }, + { + "epoch": 0.4032506039973644, + "grad_norm": 1.706725835800171, + "learning_rate": 5e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6977247595787048, + "num_tokens": 94977401.0, + "step": 3672 + }, + { + "epoch": 0.40336042169997804, + "grad_norm": 1.8477387428283691, + "learning_rate": 5e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6942765116691589, + "num_tokens": 95006495.0, + "step": 3673 + }, + { + "epoch": 0.4034702394025917, + "grad_norm": 1.9879719018936157, + "learning_rate": 5e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6850213408470154, + "num_tokens": 95033333.0, + "step": 3674 + }, + { + "epoch": 0.40358005710520534, + "grad_norm": 1.9238471984863281, + "learning_rate": 5e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.6753562092781067, + "num_tokens": 95061485.0, + "step": 3675 + }, + { + "epoch": 0.40368987480781904, + "grad_norm": 1.9070570468902588, + "learning_rate": 5e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7079105973243713, + "num_tokens": 95089466.0, + "step": 3676 + }, + { + "epoch": 0.4037996925104327, + "grad_norm": 1.8061845302581787, + "learning_rate": 5e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6894013285636902, + "num_tokens": 95117655.0, + "step": 3677 + }, + { + "epoch": 0.40390951021304633, + "grad_norm": 2.12150239944458, + "learning_rate": 5e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6940271854400635, + "num_tokens": 95141089.0, + "step": 3678 + }, + { + "epoch": 0.40401932791566003, + "grad_norm": 1.9894018173217773, + "learning_rate": 5e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6993917226791382, + "num_tokens": 95167680.0, + "step": 3679 + }, + { + "epoch": 0.4041291456182737, + "grad_norm": 2.1229400634765625, + "learning_rate": 5e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6920803785324097, + "num_tokens": 95191668.0, + "step": 3680 + }, + { + "epoch": 0.4042389633208873, + "grad_norm": 1.9083423614501953, + "learning_rate": 5e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7086189985275269, + "num_tokens": 95216250.0, + "step": 3681 + }, + { + "epoch": 0.40434878102350097, + "grad_norm": 1.8224756717681885, + "learning_rate": 5e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7032550573348999, + "num_tokens": 95244939.0, + "step": 3682 + }, + { + "epoch": 0.40445859872611467, + "grad_norm": 2.297531843185425, + "learning_rate": 5e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7143083810806274, + "num_tokens": 95264570.0, + "step": 3683 + }, + { + "epoch": 0.4045684164287283, + "grad_norm": 2.1142563819885254, + "learning_rate": 5e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.696387529373169, + "num_tokens": 95288825.0, + "step": 3684 + }, + { + "epoch": 0.40467823413134196, + "grad_norm": 2.0403971672058105, + "learning_rate": 5e-06, + "loss": 1.0897, + "mean_token_accuracy": 0.6742109656333923, + "num_tokens": 95314323.0, + "step": 3685 + }, + { + "epoch": 0.4047880518339556, + "grad_norm": 1.8623600006103516, + "learning_rate": 5e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6927733421325684, + "num_tokens": 95340432.0, + "step": 3686 + }, + { + "epoch": 0.4048978695365693, + "grad_norm": 1.7554492950439453, + "learning_rate": 5e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6868979930877686, + "num_tokens": 95374471.0, + "step": 3687 + }, + { + "epoch": 0.40500768723918296, + "grad_norm": 1.8106216192245483, + "learning_rate": 5e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7111243009567261, + "num_tokens": 95403256.0, + "step": 3688 + }, + { + "epoch": 0.4051175049417966, + "grad_norm": 1.7893717288970947, + "learning_rate": 5e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7120785713195801, + "num_tokens": 95433629.0, + "step": 3689 + }, + { + "epoch": 0.4052273226444103, + "grad_norm": 1.7818286418914795, + "learning_rate": 5e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.690311849117279, + "num_tokens": 95464389.0, + "step": 3690 + }, + { + "epoch": 0.40533714034702395, + "grad_norm": 1.9141205549240112, + "learning_rate": 5e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7070183753967285, + "num_tokens": 95489671.0, + "step": 3691 + }, + { + "epoch": 0.4054469580496376, + "grad_norm": 1.9052581787109375, + "learning_rate": 5e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6854193806648254, + "num_tokens": 95517059.0, + "step": 3692 + }, + { + "epoch": 0.40555677575225124, + "grad_norm": 2.0560874938964844, + "learning_rate": 5e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7153751254081726, + "num_tokens": 95539515.0, + "step": 3693 + }, + { + "epoch": 0.40566659345486494, + "grad_norm": 1.9273114204406738, + "learning_rate": 5e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7145824432373047, + "num_tokens": 95569196.0, + "step": 3694 + }, + { + "epoch": 0.4057764111574786, + "grad_norm": 2.087671995162964, + "learning_rate": 5e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7131603956222534, + "num_tokens": 95591922.0, + "step": 3695 + }, + { + "epoch": 0.40588622886009224, + "grad_norm": 2.0052008628845215, + "learning_rate": 5e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7184504270553589, + "num_tokens": 95614874.0, + "step": 3696 + }, + { + "epoch": 0.4059960465627059, + "grad_norm": 2.109506368637085, + "learning_rate": 5e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7170557975769043, + "num_tokens": 95635748.0, + "step": 3697 + }, + { + "epoch": 0.4061058642653196, + "grad_norm": 2.0725080966949463, + "learning_rate": 5e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6876260042190552, + "num_tokens": 95659078.0, + "step": 3698 + }, + { + "epoch": 0.40621568196793323, + "grad_norm": 2.0309181213378906, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7017650604248047, + "num_tokens": 95682999.0, + "step": 3699 + }, + { + "epoch": 0.4063254996705469, + "grad_norm": 1.8008003234863281, + "learning_rate": 5e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7092180252075195, + "num_tokens": 95711741.0, + "step": 3700 + }, + { + "epoch": 0.4064353173731606, + "grad_norm": 1.9943383932113647, + "learning_rate": 5e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7137535214424133, + "num_tokens": 95736906.0, + "step": 3701 + }, + { + "epoch": 0.4065451350757742, + "grad_norm": 2.110488176345825, + "learning_rate": 5e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7227262258529663, + "num_tokens": 95757136.0, + "step": 3702 + }, + { + "epoch": 0.40665495277838787, + "grad_norm": 2.0988636016845703, + "learning_rate": 5e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7009736895561218, + "num_tokens": 95779305.0, + "step": 3703 + }, + { + "epoch": 0.4067647704810015, + "grad_norm": 1.9169021844863892, + "learning_rate": 5e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7037032842636108, + "num_tokens": 95806651.0, + "step": 3704 + }, + { + "epoch": 0.4068745881836152, + "grad_norm": 2.167180061340332, + "learning_rate": 5e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7185279130935669, + "num_tokens": 95826980.0, + "step": 3705 + }, + { + "epoch": 0.40698440588622886, + "grad_norm": 2.00727915763855, + "learning_rate": 5e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6877087950706482, + "num_tokens": 95852587.0, + "step": 3706 + }, + { + "epoch": 0.4070942235888425, + "grad_norm": 1.9052678346633911, + "learning_rate": 5e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7051867842674255, + "num_tokens": 95878652.0, + "step": 3707 + }, + { + "epoch": 0.4072040412914562, + "grad_norm": 2.2658865451812744, + "learning_rate": 5e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7164647579193115, + "num_tokens": 95898222.0, + "step": 3708 + }, + { + "epoch": 0.40731385899406986, + "grad_norm": 1.844305396080017, + "learning_rate": 5e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.707398533821106, + "num_tokens": 95926624.0, + "step": 3709 + }, + { + "epoch": 0.4074236766966835, + "grad_norm": 2.1623592376708984, + "learning_rate": 5e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6827664375305176, + "num_tokens": 95948314.0, + "step": 3710 + }, + { + "epoch": 0.40753349439929715, + "grad_norm": 1.950618863105774, + "learning_rate": 5e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7154215574264526, + "num_tokens": 95971635.0, + "step": 3711 + }, + { + "epoch": 0.40764331210191085, + "grad_norm": 2.224186658859253, + "learning_rate": 5e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7179054617881775, + "num_tokens": 95990695.0, + "step": 3712 + }, + { + "epoch": 0.4077531298045245, + "grad_norm": 2.2713747024536133, + "learning_rate": 5e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7230429649353027, + "num_tokens": 96010404.0, + "step": 3713 + }, + { + "epoch": 0.40786294750713814, + "grad_norm": 2.2525978088378906, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.720210075378418, + "num_tokens": 96029844.0, + "step": 3714 + }, + { + "epoch": 0.4079727652097518, + "grad_norm": 1.8196604251861572, + "learning_rate": 5e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6898866891860962, + "num_tokens": 96057348.0, + "step": 3715 + }, + { + "epoch": 0.4080825829123655, + "grad_norm": 1.9115769863128662, + "learning_rate": 5e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6857055425643921, + "num_tokens": 96088429.0, + "step": 3716 + }, + { + "epoch": 0.40819240061497913, + "grad_norm": 1.7145369052886963, + "learning_rate": 5e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6748181581497192, + "num_tokens": 96120417.0, + "step": 3717 + }, + { + "epoch": 0.4083022183175928, + "grad_norm": 1.905784010887146, + "learning_rate": 5e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7019485235214233, + "num_tokens": 96147326.0, + "step": 3718 + }, + { + "epoch": 0.4084120360202065, + "grad_norm": 1.952425479888916, + "learning_rate": 5e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7299677729606628, + "num_tokens": 96172568.0, + "step": 3719 + }, + { + "epoch": 0.4085218537228201, + "grad_norm": 2.4173431396484375, + "learning_rate": 5e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7192115783691406, + "num_tokens": 96188666.0, + "step": 3720 + }, + { + "epoch": 0.4086316714254338, + "grad_norm": 2.034238338470459, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7173117995262146, + "num_tokens": 96210056.0, + "step": 3721 + }, + { + "epoch": 0.4087414891280474, + "grad_norm": 2.2566959857940674, + "learning_rate": 5e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.7032396793365479, + "num_tokens": 96229918.0, + "step": 3722 + }, + { + "epoch": 0.4088513068306611, + "grad_norm": 1.9846575260162354, + "learning_rate": 5e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7151106595993042, + "num_tokens": 96253158.0, + "step": 3723 + }, + { + "epoch": 0.40896112453327477, + "grad_norm": 1.955216646194458, + "learning_rate": 5e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6940592527389526, + "num_tokens": 96278886.0, + "step": 3724 + }, + { + "epoch": 0.4090709422358884, + "grad_norm": 2.069230794906616, + "learning_rate": 5e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7146208882331848, + "num_tokens": 96299366.0, + "step": 3725 + }, + { + "epoch": 0.40918075993850206, + "grad_norm": 2.000866174697876, + "learning_rate": 5e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.695664644241333, + "num_tokens": 96323077.0, + "step": 3726 + }, + { + "epoch": 0.40929057764111576, + "grad_norm": 1.9826122522354126, + "learning_rate": 5e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7093980312347412, + "num_tokens": 96346397.0, + "step": 3727 + }, + { + "epoch": 0.4094003953437294, + "grad_norm": 1.8308429718017578, + "learning_rate": 5e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6816787123680115, + "num_tokens": 96377947.0, + "step": 3728 + }, + { + "epoch": 0.40951021304634305, + "grad_norm": 1.8004519939422607, + "learning_rate": 5e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7105810642242432, + "num_tokens": 96405697.0, + "step": 3729 + }, + { + "epoch": 0.40962003074895675, + "grad_norm": 2.0223801136016846, + "learning_rate": 5e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7098449468612671, + "num_tokens": 96433114.0, + "step": 3730 + }, + { + "epoch": 0.4097298484515704, + "grad_norm": 2.0533390045166016, + "learning_rate": 5e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6950109601020813, + "num_tokens": 96456908.0, + "step": 3731 + }, + { + "epoch": 0.40983966615418405, + "grad_norm": 2.0179078578948975, + "learning_rate": 5e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6985883116722107, + "num_tokens": 96481405.0, + "step": 3732 + }, + { + "epoch": 0.4099494838567977, + "grad_norm": 1.9078582525253296, + "learning_rate": 5e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6806405186653137, + "num_tokens": 96509107.0, + "step": 3733 + }, + { + "epoch": 0.4100593015594114, + "grad_norm": 2.2810680866241455, + "learning_rate": 5e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.6995696425437927, + "num_tokens": 96528839.0, + "step": 3734 + }, + { + "epoch": 0.41016911926202504, + "grad_norm": 1.7915937900543213, + "learning_rate": 5e-06, + "loss": 1.1068, + "mean_token_accuracy": 0.6690686345100403, + "num_tokens": 96561922.0, + "step": 3735 + }, + { + "epoch": 0.4102789369646387, + "grad_norm": 2.2716119289398193, + "learning_rate": 5e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7080976366996765, + "num_tokens": 96582870.0, + "step": 3736 + }, + { + "epoch": 0.4103887546672524, + "grad_norm": 1.767078161239624, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7230620980262756, + "num_tokens": 96610948.0, + "step": 3737 + }, + { + "epoch": 0.41049857236986603, + "grad_norm": 2.257272243499756, + "learning_rate": 5e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7017757892608643, + "num_tokens": 96631679.0, + "step": 3738 + }, + { + "epoch": 0.4106083900724797, + "grad_norm": 1.988541841506958, + "learning_rate": 5e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7283587455749512, + "num_tokens": 96657357.0, + "step": 3739 + }, + { + "epoch": 0.4107182077750933, + "grad_norm": 2.186248302459717, + "learning_rate": 5e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.7034323215484619, + "num_tokens": 96679977.0, + "step": 3740 + }, + { + "epoch": 0.410828025477707, + "grad_norm": 2.019589424133301, + "learning_rate": 5e-06, + "loss": 1.0672, + "mean_token_accuracy": 0.6760019063949585, + "num_tokens": 96705728.0, + "step": 3741 + }, + { + "epoch": 0.41093784318032067, + "grad_norm": 1.8511886596679688, + "learning_rate": 5e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7003252506256104, + "num_tokens": 96735301.0, + "step": 3742 + }, + { + "epoch": 0.4110476608829343, + "grad_norm": 1.954778790473938, + "learning_rate": 5e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6785355806350708, + "num_tokens": 96764170.0, + "step": 3743 + }, + { + "epoch": 0.41115747858554796, + "grad_norm": 1.9861088991165161, + "learning_rate": 5e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7135821580886841, + "num_tokens": 96787894.0, + "step": 3744 + }, + { + "epoch": 0.41126729628816167, + "grad_norm": 2.1673085689544678, + "learning_rate": 5e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7109513282775879, + "num_tokens": 96809267.0, + "step": 3745 + }, + { + "epoch": 0.4113771139907753, + "grad_norm": 1.7519376277923584, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7032946348190308, + "num_tokens": 96839310.0, + "step": 3746 + }, + { + "epoch": 0.41148693169338896, + "grad_norm": 2.0905420780181885, + "learning_rate": 5e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.717750608921051, + "num_tokens": 96861077.0, + "step": 3747 + }, + { + "epoch": 0.41159674939600266, + "grad_norm": 1.9276823997497559, + "learning_rate": 5e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7174625396728516, + "num_tokens": 96886488.0, + "step": 3748 + }, + { + "epoch": 0.4117065670986163, + "grad_norm": 1.8188135623931885, + "learning_rate": 5e-06, + "loss": 1.1487, + "mean_token_accuracy": 0.6699514985084534, + "num_tokens": 96919268.0, + "step": 3749 + }, + { + "epoch": 0.41181638480122995, + "grad_norm": 1.7886178493499756, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7181382179260254, + "num_tokens": 96946279.0, + "step": 3750 + }, + { + "epoch": 0.4119262025038436, + "grad_norm": 1.9406136274337769, + "learning_rate": 5e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6940368413925171, + "num_tokens": 96971196.0, + "step": 3751 + }, + { + "epoch": 0.4120360202064573, + "grad_norm": 2.1406242847442627, + "learning_rate": 5e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.6976600885391235, + "num_tokens": 96993923.0, + "step": 3752 + }, + { + "epoch": 0.41214583790907094, + "grad_norm": 1.9657268524169922, + "learning_rate": 5e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7162409424781799, + "num_tokens": 97018246.0, + "step": 3753 + }, + { + "epoch": 0.4122556556116846, + "grad_norm": 1.937493920326233, + "learning_rate": 5e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.715807318687439, + "num_tokens": 97039735.0, + "step": 3754 + }, + { + "epoch": 0.4123654733142983, + "grad_norm": 2.158695697784424, + "learning_rate": 5e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6844571828842163, + "num_tokens": 97063590.0, + "step": 3755 + }, + { + "epoch": 0.41247529101691194, + "grad_norm": 1.715136170387268, + "learning_rate": 5e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.6840248703956604, + "num_tokens": 97094872.0, + "step": 3756 + }, + { + "epoch": 0.4125851087195256, + "grad_norm": 1.930029034614563, + "learning_rate": 5e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6925640106201172, + "num_tokens": 97121497.0, + "step": 3757 + }, + { + "epoch": 0.41269492642213923, + "grad_norm": 1.932766079902649, + "learning_rate": 5e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7029696106910706, + "num_tokens": 97150883.0, + "step": 3758 + }, + { + "epoch": 0.41280474412475293, + "grad_norm": 1.8911887407302856, + "learning_rate": 5e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6882469654083252, + "num_tokens": 97178381.0, + "step": 3759 + }, + { + "epoch": 0.4129145618273666, + "grad_norm": 2.025052547454834, + "learning_rate": 5e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6849278211593628, + "num_tokens": 97203762.0, + "step": 3760 + }, + { + "epoch": 0.4130243795299802, + "grad_norm": 1.864823818206787, + "learning_rate": 5e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7055476903915405, + "num_tokens": 97228332.0, + "step": 3761 + }, + { + "epoch": 0.41313419723259387, + "grad_norm": 2.3599588871002197, + "learning_rate": 5e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7113913893699646, + "num_tokens": 97246506.0, + "step": 3762 + }, + { + "epoch": 0.41324401493520757, + "grad_norm": 2.3366613388061523, + "learning_rate": 5e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7195196151733398, + "num_tokens": 97264567.0, + "step": 3763 + }, + { + "epoch": 0.4133538326378212, + "grad_norm": 1.7054567337036133, + "learning_rate": 5e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7069064974784851, + "num_tokens": 97298977.0, + "step": 3764 + }, + { + "epoch": 0.41346365034043486, + "grad_norm": 2.077500581741333, + "learning_rate": 5e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6883034706115723, + "num_tokens": 97322409.0, + "step": 3765 + }, + { + "epoch": 0.41357346804304856, + "grad_norm": 1.846350073814392, + "learning_rate": 5e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7036875486373901, + "num_tokens": 97352182.0, + "step": 3766 + }, + { + "epoch": 0.4136832857456622, + "grad_norm": 2.024984359741211, + "learning_rate": 5e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7043816447257996, + "num_tokens": 97377266.0, + "step": 3767 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 1.8181674480438232, + "learning_rate": 5e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6816520094871521, + "num_tokens": 97407434.0, + "step": 3768 + }, + { + "epoch": 0.4139029211508895, + "grad_norm": 1.962684988975525, + "learning_rate": 5e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7021694183349609, + "num_tokens": 97431349.0, + "step": 3769 + }, + { + "epoch": 0.4140127388535032, + "grad_norm": 2.392101287841797, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7187646627426147, + "num_tokens": 97448823.0, + "step": 3770 + }, + { + "epoch": 0.41412255655611685, + "grad_norm": 1.9968278408050537, + "learning_rate": 5e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6889066696166992, + "num_tokens": 97473242.0, + "step": 3771 + }, + { + "epoch": 0.4142323742587305, + "grad_norm": 2.038426399230957, + "learning_rate": 5e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6991997361183167, + "num_tokens": 97496692.0, + "step": 3772 + }, + { + "epoch": 0.41434219196134414, + "grad_norm": 2.062861919403076, + "learning_rate": 5e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6981746554374695, + "num_tokens": 97520092.0, + "step": 3773 + }, + { + "epoch": 0.41445200966395784, + "grad_norm": 2.080306053161621, + "learning_rate": 5e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.680478036403656, + "num_tokens": 97545879.0, + "step": 3774 + }, + { + "epoch": 0.4145618273665715, + "grad_norm": 1.8362598419189453, + "learning_rate": 5e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7082308530807495, + "num_tokens": 97576203.0, + "step": 3775 + }, + { + "epoch": 0.41467164506918514, + "grad_norm": 2.140857696533203, + "learning_rate": 5e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6835907697677612, + "num_tokens": 97598494.0, + "step": 3776 + }, + { + "epoch": 0.41478146277179884, + "grad_norm": 2.2141449451446533, + "learning_rate": 5e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.70238196849823, + "num_tokens": 97619769.0, + "step": 3777 + }, + { + "epoch": 0.4148912804744125, + "grad_norm": 1.9834421873092651, + "learning_rate": 5e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6842734217643738, + "num_tokens": 97644111.0, + "step": 3778 + }, + { + "epoch": 0.41500109817702613, + "grad_norm": 1.9125890731811523, + "learning_rate": 5e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7111220955848694, + "num_tokens": 97671648.0, + "step": 3779 + }, + { + "epoch": 0.4151109158796398, + "grad_norm": 2.0515999794006348, + "learning_rate": 5e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7109179496765137, + "num_tokens": 97694756.0, + "step": 3780 + }, + { + "epoch": 0.4152207335822535, + "grad_norm": 2.243443727493286, + "learning_rate": 5e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7119970321655273, + "num_tokens": 97714825.0, + "step": 3781 + }, + { + "epoch": 0.4153305512848671, + "grad_norm": 1.9830150604248047, + "learning_rate": 5e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6933249831199646, + "num_tokens": 97740602.0, + "step": 3782 + }, + { + "epoch": 0.41544036898748077, + "grad_norm": 1.852048635482788, + "learning_rate": 5e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.704595685005188, + "num_tokens": 97771320.0, + "step": 3783 + }, + { + "epoch": 0.41555018669009447, + "grad_norm": 2.1054892539978027, + "learning_rate": 5e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6925687789916992, + "num_tokens": 97796039.0, + "step": 3784 + }, + { + "epoch": 0.4156600043927081, + "grad_norm": 2.1126973628997803, + "learning_rate": 5e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6919965744018555, + "num_tokens": 97819430.0, + "step": 3785 + }, + { + "epoch": 0.41576982209532176, + "grad_norm": 1.7896769046783447, + "learning_rate": 5e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6790587306022644, + "num_tokens": 97851662.0, + "step": 3786 + }, + { + "epoch": 0.4158796397979354, + "grad_norm": 1.9789934158325195, + "learning_rate": 5e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7171390056610107, + "num_tokens": 97876806.0, + "step": 3787 + }, + { + "epoch": 0.4159894575005491, + "grad_norm": 1.9800783395767212, + "learning_rate": 5e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7116798758506775, + "num_tokens": 97901252.0, + "step": 3788 + }, + { + "epoch": 0.41609927520316276, + "grad_norm": 1.805116057395935, + "learning_rate": 5e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6852110624313354, + "num_tokens": 97932351.0, + "step": 3789 + }, + { + "epoch": 0.4162090929057764, + "grad_norm": 1.9385727643966675, + "learning_rate": 5e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6965742707252502, + "num_tokens": 97959750.0, + "step": 3790 + }, + { + "epoch": 0.41631891060839005, + "grad_norm": 1.9443265199661255, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.702910304069519, + "num_tokens": 97983285.0, + "step": 3791 + }, + { + "epoch": 0.41642872831100375, + "grad_norm": 1.9775086641311646, + "learning_rate": 5e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6789369583129883, + "num_tokens": 98011465.0, + "step": 3792 + }, + { + "epoch": 0.4165385460136174, + "grad_norm": 1.8314601182937622, + "learning_rate": 5e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6936805248260498, + "num_tokens": 98043053.0, + "step": 3793 + }, + { + "epoch": 0.41664836371623104, + "grad_norm": 1.8351458311080933, + "learning_rate": 5e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6999526023864746, + "num_tokens": 98072696.0, + "step": 3794 + }, + { + "epoch": 0.41675818141884474, + "grad_norm": 2.1108860969543457, + "learning_rate": 5e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6935847997665405, + "num_tokens": 98094317.0, + "step": 3795 + }, + { + "epoch": 0.4168679991214584, + "grad_norm": 2.0490787029266357, + "learning_rate": 5e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6997712850570679, + "num_tokens": 98116159.0, + "step": 3796 + }, + { + "epoch": 0.41697781682407203, + "grad_norm": 2.1300716400146484, + "learning_rate": 5e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6977993249893188, + "num_tokens": 98138725.0, + "step": 3797 + }, + { + "epoch": 0.4170876345266857, + "grad_norm": 1.740293025970459, + "learning_rate": 5e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7229998111724854, + "num_tokens": 98169235.0, + "step": 3798 + }, + { + "epoch": 0.4171974522292994, + "grad_norm": 1.8995739221572876, + "learning_rate": 5e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6890758872032166, + "num_tokens": 98201371.0, + "step": 3799 + }, + { + "epoch": 0.417307269931913, + "grad_norm": 1.9470014572143555, + "learning_rate": 5e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.703946590423584, + "num_tokens": 98226901.0, + "step": 3800 + }, + { + "epoch": 0.4174170876345267, + "grad_norm": 1.812225341796875, + "learning_rate": 5e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6997966766357422, + "num_tokens": 98257526.0, + "step": 3801 + }, + { + "epoch": 0.4175269053371403, + "grad_norm": 1.916782259941101, + "learning_rate": 5e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6785832643508911, + "num_tokens": 98287742.0, + "step": 3802 + }, + { + "epoch": 0.417636723039754, + "grad_norm": 2.01104474067688, + "learning_rate": 5e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7265772223472595, + "num_tokens": 98311959.0, + "step": 3803 + }, + { + "epoch": 0.41774654074236767, + "grad_norm": 1.9712883234024048, + "learning_rate": 5e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6917439699172974, + "num_tokens": 98339373.0, + "step": 3804 + }, + { + "epoch": 0.4178563584449813, + "grad_norm": 1.859873652458191, + "learning_rate": 5e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7010774612426758, + "num_tokens": 98370026.0, + "step": 3805 + }, + { + "epoch": 0.417966176147595, + "grad_norm": 2.004268169403076, + "learning_rate": 5e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.695900559425354, + "num_tokens": 98393619.0, + "step": 3806 + }, + { + "epoch": 0.41807599385020866, + "grad_norm": 2.066570281982422, + "learning_rate": 5e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6829241514205933, + "num_tokens": 98416192.0, + "step": 3807 + }, + { + "epoch": 0.4181858115528223, + "grad_norm": 1.8185690641403198, + "learning_rate": 5e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.6718853116035461, + "num_tokens": 98447067.0, + "step": 3808 + }, + { + "epoch": 0.41829562925543595, + "grad_norm": 1.9332154989242554, + "learning_rate": 5e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6956461668014526, + "num_tokens": 98473205.0, + "step": 3809 + }, + { + "epoch": 0.41840544695804965, + "grad_norm": 2.392737865447998, + "learning_rate": 5e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7098122835159302, + "num_tokens": 98491847.0, + "step": 3810 + }, + { + "epoch": 0.4185152646606633, + "grad_norm": 1.9464244842529297, + "learning_rate": 5e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.6695044040679932, + "num_tokens": 98523406.0, + "step": 3811 + }, + { + "epoch": 0.41862508236327695, + "grad_norm": 1.9585363864898682, + "learning_rate": 5e-06, + "loss": 1.0749, + "mean_token_accuracy": 0.6709147691726685, + "num_tokens": 98550735.0, + "step": 3812 + }, + { + "epoch": 0.41873490006589065, + "grad_norm": 1.9799951314926147, + "learning_rate": 5e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7134404182434082, + "num_tokens": 98575649.0, + "step": 3813 + }, + { + "epoch": 0.4188447177685043, + "grad_norm": 2.101835250854492, + "learning_rate": 5e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7144808769226074, + "num_tokens": 98598522.0, + "step": 3814 + }, + { + "epoch": 0.41895453547111794, + "grad_norm": 2.2413060665130615, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7202035188674927, + "num_tokens": 98620706.0, + "step": 3815 + }, + { + "epoch": 0.4190643531737316, + "grad_norm": 2.0218935012817383, + "learning_rate": 5e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.727558970451355, + "num_tokens": 98644918.0, + "step": 3816 + }, + { + "epoch": 0.4191741708763453, + "grad_norm": 1.8789819478988647, + "learning_rate": 5e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.716387152671814, + "num_tokens": 98670730.0, + "step": 3817 + }, + { + "epoch": 0.41928398857895893, + "grad_norm": 1.768084168434143, + "learning_rate": 5e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.677943229675293, + "num_tokens": 98701828.0, + "step": 3818 + }, + { + "epoch": 0.4193938062815726, + "grad_norm": 2.0909790992736816, + "learning_rate": 5e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7215701341629028, + "num_tokens": 98724698.0, + "step": 3819 + }, + { + "epoch": 0.4195036239841862, + "grad_norm": 1.8566049337387085, + "learning_rate": 5e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7002040147781372, + "num_tokens": 98755349.0, + "step": 3820 + }, + { + "epoch": 0.4196134416867999, + "grad_norm": 1.9123891592025757, + "learning_rate": 5e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7084806561470032, + "num_tokens": 98782063.0, + "step": 3821 + }, + { + "epoch": 0.41972325938941357, + "grad_norm": 1.9952930212020874, + "learning_rate": 5e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7065082788467407, + "num_tokens": 98807894.0, + "step": 3822 + }, + { + "epoch": 0.4198330770920272, + "grad_norm": 2.102189064025879, + "learning_rate": 5e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7177257537841797, + "num_tokens": 98830120.0, + "step": 3823 + }, + { + "epoch": 0.4199428947946409, + "grad_norm": 1.9949226379394531, + "learning_rate": 5e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.685791015625, + "num_tokens": 98854782.0, + "step": 3824 + }, + { + "epoch": 0.42005271249725457, + "grad_norm": 1.86625075340271, + "learning_rate": 5e-06, + "loss": 1.1284, + "mean_token_accuracy": 0.6702574491500854, + "num_tokens": 98886875.0, + "step": 3825 + }, + { + "epoch": 0.4201625301998682, + "grad_norm": 2.0524215698242188, + "learning_rate": 5e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6967815160751343, + "num_tokens": 98908588.0, + "step": 3826 + }, + { + "epoch": 0.42027234790248186, + "grad_norm": 1.7780967950820923, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7172062397003174, + "num_tokens": 98936714.0, + "step": 3827 + }, + { + "epoch": 0.42038216560509556, + "grad_norm": 1.7317904233932495, + "learning_rate": 5e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6848467588424683, + "num_tokens": 98970421.0, + "step": 3828 + }, + { + "epoch": 0.4204919833077092, + "grad_norm": 1.6837190389633179, + "learning_rate": 5e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7082924842834473, + "num_tokens": 99001718.0, + "step": 3829 + }, + { + "epoch": 0.42060180101032285, + "grad_norm": 2.162137031555176, + "learning_rate": 5e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6988881826400757, + "num_tokens": 99023958.0, + "step": 3830 + }, + { + "epoch": 0.42071161871293655, + "grad_norm": 2.019376039505005, + "learning_rate": 5e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.692121148109436, + "num_tokens": 99048100.0, + "step": 3831 + }, + { + "epoch": 0.4208214364155502, + "grad_norm": 1.7692179679870605, + "learning_rate": 5e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7122000455856323, + "num_tokens": 99077787.0, + "step": 3832 + }, + { + "epoch": 0.42093125411816384, + "grad_norm": 1.9240188598632812, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6959591507911682, + "num_tokens": 99104565.0, + "step": 3833 + }, + { + "epoch": 0.4210410718207775, + "grad_norm": 2.0246164798736572, + "learning_rate": 5e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7123621106147766, + "num_tokens": 99129025.0, + "step": 3834 + }, + { + "epoch": 0.4211508895233912, + "grad_norm": 2.0820958614349365, + "learning_rate": 5e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7037518620491028, + "num_tokens": 99151692.0, + "step": 3835 + }, + { + "epoch": 0.42126070722600484, + "grad_norm": 2.2375879287719727, + "learning_rate": 5e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7086284160614014, + "num_tokens": 99173454.0, + "step": 3836 + }, + { + "epoch": 0.4213705249286185, + "grad_norm": 2.48310923576355, + "learning_rate": 5e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7265874743461609, + "num_tokens": 99191977.0, + "step": 3837 + }, + { + "epoch": 0.42148034263123213, + "grad_norm": 1.9220499992370605, + "learning_rate": 5e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6900359392166138, + "num_tokens": 99219718.0, + "step": 3838 + }, + { + "epoch": 0.42159016033384583, + "grad_norm": 1.8601465225219727, + "learning_rate": 5e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.701697826385498, + "num_tokens": 99251069.0, + "step": 3839 + }, + { + "epoch": 0.4216999780364595, + "grad_norm": 1.8996171951293945, + "learning_rate": 5e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7159678339958191, + "num_tokens": 99276423.0, + "step": 3840 + }, + { + "epoch": 0.4218097957390731, + "grad_norm": 2.0497448444366455, + "learning_rate": 5e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.712886393070221, + "num_tokens": 99299466.0, + "step": 3841 + }, + { + "epoch": 0.4219196134416868, + "grad_norm": 2.1414239406585693, + "learning_rate": 5e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7133437395095825, + "num_tokens": 99320391.0, + "step": 3842 + }, + { + "epoch": 0.42202943114430047, + "grad_norm": 2.0319199562072754, + "learning_rate": 5e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6946298480033875, + "num_tokens": 99346067.0, + "step": 3843 + }, + { + "epoch": 0.4221392488469141, + "grad_norm": 1.8080288171768188, + "learning_rate": 5e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6787741184234619, + "num_tokens": 99377654.0, + "step": 3844 + }, + { + "epoch": 0.42224906654952776, + "grad_norm": 1.9847116470336914, + "learning_rate": 5e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7122015357017517, + "num_tokens": 99402912.0, + "step": 3845 + }, + { + "epoch": 0.42235888425214146, + "grad_norm": 1.9918133020401, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7234129309654236, + "num_tokens": 99423704.0, + "step": 3846 + }, + { + "epoch": 0.4224687019547551, + "grad_norm": 1.8357340097427368, + "learning_rate": 5e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7323169112205505, + "num_tokens": 99449856.0, + "step": 3847 + }, + { + "epoch": 0.42257851965736876, + "grad_norm": 1.7952656745910645, + "learning_rate": 5e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6872671842575073, + "num_tokens": 99482367.0, + "step": 3848 + }, + { + "epoch": 0.4226883373599824, + "grad_norm": 1.9154242277145386, + "learning_rate": 5e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6967850923538208, + "num_tokens": 99511160.0, + "step": 3849 + }, + { + "epoch": 0.4227981550625961, + "grad_norm": 2.2543892860412598, + "learning_rate": 5e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7071908712387085, + "num_tokens": 99532060.0, + "step": 3850 + }, + { + "epoch": 0.42290797276520975, + "grad_norm": 1.9748733043670654, + "learning_rate": 5e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7098413705825806, + "num_tokens": 99556955.0, + "step": 3851 + }, + { + "epoch": 0.4230177904678234, + "grad_norm": 1.763373613357544, + "learning_rate": 5e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6807280778884888, + "num_tokens": 99590587.0, + "step": 3852 + }, + { + "epoch": 0.4231276081704371, + "grad_norm": 1.8696255683898926, + "learning_rate": 5e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6960023641586304, + "num_tokens": 99620598.0, + "step": 3853 + }, + { + "epoch": 0.42323742587305074, + "grad_norm": 1.8556360006332397, + "learning_rate": 5e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6923744678497314, + "num_tokens": 99650721.0, + "step": 3854 + }, + { + "epoch": 0.4233472435756644, + "grad_norm": 2.170499086380005, + "learning_rate": 5e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7151401042938232, + "num_tokens": 99671578.0, + "step": 3855 + }, + { + "epoch": 0.42345706127827804, + "grad_norm": 1.780742883682251, + "learning_rate": 5e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6791353821754456, + "num_tokens": 99705527.0, + "step": 3856 + }, + { + "epoch": 0.42356687898089174, + "grad_norm": 1.7848166227340698, + "learning_rate": 5e-06, + "loss": 1.037, + "mean_token_accuracy": 0.698224663734436, + "num_tokens": 99735730.0, + "step": 3857 + }, + { + "epoch": 0.4236766966835054, + "grad_norm": 1.8966163396835327, + "learning_rate": 5e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6765247583389282, + "num_tokens": 99763811.0, + "step": 3858 + }, + { + "epoch": 0.42378651438611903, + "grad_norm": 1.9680379629135132, + "learning_rate": 5e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6954211592674255, + "num_tokens": 99791204.0, + "step": 3859 + }, + { + "epoch": 0.42389633208873273, + "grad_norm": 1.8585760593414307, + "learning_rate": 5e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6803963780403137, + "num_tokens": 99823864.0, + "step": 3860 + }, + { + "epoch": 0.4240061497913464, + "grad_norm": 1.9213483333587646, + "learning_rate": 5e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7038087248802185, + "num_tokens": 99850816.0, + "step": 3861 + }, + { + "epoch": 0.42411596749396, + "grad_norm": 2.2221381664276123, + "learning_rate": 5e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7133602499961853, + "num_tokens": 99870394.0, + "step": 3862 + }, + { + "epoch": 0.42422578519657367, + "grad_norm": 2.146305561065674, + "learning_rate": 5e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7048300504684448, + "num_tokens": 99892917.0, + "step": 3863 + }, + { + "epoch": 0.42433560289918737, + "grad_norm": 2.133650064468384, + "learning_rate": 5e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7023547887802124, + "num_tokens": 99917418.0, + "step": 3864 + }, + { + "epoch": 0.424445420601801, + "grad_norm": 1.9555227756500244, + "learning_rate": 5e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6993181705474854, + "num_tokens": 99944885.0, + "step": 3865 + }, + { + "epoch": 0.42455523830441466, + "grad_norm": 1.9912469387054443, + "learning_rate": 5e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7056576013565063, + "num_tokens": 99971198.0, + "step": 3866 + }, + { + "epoch": 0.4246650560070283, + "grad_norm": 1.7849080562591553, + "learning_rate": 5e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.69386225938797, + "num_tokens": 100003785.0, + "step": 3867 + }, + { + "epoch": 0.424774873709642, + "grad_norm": 2.0654056072235107, + "learning_rate": 5e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6962441205978394, + "num_tokens": 100027191.0, + "step": 3868 + }, + { + "epoch": 0.42488469141225566, + "grad_norm": 1.7388544082641602, + "learning_rate": 5e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6934837102890015, + "num_tokens": 100061270.0, + "step": 3869 + }, + { + "epoch": 0.4249945091148693, + "grad_norm": 1.9558883905410767, + "learning_rate": 5e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.7005964517593384, + "num_tokens": 100087748.0, + "step": 3870 + }, + { + "epoch": 0.425104326817483, + "grad_norm": 1.7595683336257935, + "learning_rate": 5e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7020437717437744, + "num_tokens": 100119784.0, + "step": 3871 + }, + { + "epoch": 0.42521414452009665, + "grad_norm": 2.004990577697754, + "learning_rate": 5e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7091265916824341, + "num_tokens": 100144851.0, + "step": 3872 + }, + { + "epoch": 0.4253239622227103, + "grad_norm": 1.8207191228866577, + "learning_rate": 5e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7132321000099182, + "num_tokens": 100175950.0, + "step": 3873 + }, + { + "epoch": 0.42543377992532394, + "grad_norm": 2.15262770652771, + "learning_rate": 5e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7251452207565308, + "num_tokens": 100197431.0, + "step": 3874 + }, + { + "epoch": 0.42554359762793764, + "grad_norm": 2.0870769023895264, + "learning_rate": 5e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7073330283164978, + "num_tokens": 100218686.0, + "step": 3875 + }, + { + "epoch": 0.4256534153305513, + "grad_norm": 2.0615274906158447, + "learning_rate": 5e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7065079212188721, + "num_tokens": 100240205.0, + "step": 3876 + }, + { + "epoch": 0.42576323303316493, + "grad_norm": 1.9640904664993286, + "learning_rate": 5e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7094250917434692, + "num_tokens": 100265642.0, + "step": 3877 + }, + { + "epoch": 0.4258730507357786, + "grad_norm": 2.0188608169555664, + "learning_rate": 5e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.724604606628418, + "num_tokens": 100288011.0, + "step": 3878 + }, + { + "epoch": 0.4259828684383923, + "grad_norm": 1.968388557434082, + "learning_rate": 5e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6940547227859497, + "num_tokens": 100314815.0, + "step": 3879 + }, + { + "epoch": 0.4260926861410059, + "grad_norm": 2.071140766143799, + "learning_rate": 5e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6965428590774536, + "num_tokens": 100341016.0, + "step": 3880 + }, + { + "epoch": 0.4262025038436196, + "grad_norm": 1.9575828313827515, + "learning_rate": 5e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7212255597114563, + "num_tokens": 100366688.0, + "step": 3881 + }, + { + "epoch": 0.4263123215462333, + "grad_norm": 1.9248764514923096, + "learning_rate": 5e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7024539709091187, + "num_tokens": 100393567.0, + "step": 3882 + }, + { + "epoch": 0.4264221392488469, + "grad_norm": 1.9974132776260376, + "learning_rate": 5e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6851711273193359, + "num_tokens": 100417907.0, + "step": 3883 + }, + { + "epoch": 0.42653195695146057, + "grad_norm": 2.1856906414031982, + "learning_rate": 5e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7055373191833496, + "num_tokens": 100439570.0, + "step": 3884 + }, + { + "epoch": 0.4266417746540742, + "grad_norm": 1.8676892518997192, + "learning_rate": 5e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7162994146347046, + "num_tokens": 100465060.0, + "step": 3885 + }, + { + "epoch": 0.4267515923566879, + "grad_norm": 1.8060905933380127, + "learning_rate": 5e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7213636636734009, + "num_tokens": 100492176.0, + "step": 3886 + }, + { + "epoch": 0.42686141005930156, + "grad_norm": 1.9034779071807861, + "learning_rate": 5e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6956679821014404, + "num_tokens": 100519671.0, + "step": 3887 + }, + { + "epoch": 0.4269712277619152, + "grad_norm": 1.8870660066604614, + "learning_rate": 5e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.6979407072067261, + "num_tokens": 100546756.0, + "step": 3888 + }, + { + "epoch": 0.4270810454645289, + "grad_norm": 1.897226095199585, + "learning_rate": 5e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7047514319419861, + "num_tokens": 100578056.0, + "step": 3889 + }, + { + "epoch": 0.42719086316714255, + "grad_norm": 1.9651423692703247, + "learning_rate": 5e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7184559106826782, + "num_tokens": 100602411.0, + "step": 3890 + }, + { + "epoch": 0.4273006808697562, + "grad_norm": 1.7313498258590698, + "learning_rate": 5e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7131228446960449, + "num_tokens": 100632675.0, + "step": 3891 + }, + { + "epoch": 0.42741049857236985, + "grad_norm": 2.112753391265869, + "learning_rate": 5e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6992321014404297, + "num_tokens": 100655571.0, + "step": 3892 + }, + { + "epoch": 0.42752031627498355, + "grad_norm": 1.918487310409546, + "learning_rate": 5e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7048739194869995, + "num_tokens": 100682169.0, + "step": 3893 + }, + { + "epoch": 0.4276301339775972, + "grad_norm": 1.893276333808899, + "learning_rate": 5e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7056841850280762, + "num_tokens": 100706276.0, + "step": 3894 + }, + { + "epoch": 0.42773995168021084, + "grad_norm": 1.9446886777877808, + "learning_rate": 5e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6962928771972656, + "num_tokens": 100735624.0, + "step": 3895 + }, + { + "epoch": 0.4278497693828245, + "grad_norm": 2.026200294494629, + "learning_rate": 5e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7025647163391113, + "num_tokens": 100759946.0, + "step": 3896 + }, + { + "epoch": 0.4279595870854382, + "grad_norm": 1.7916532754898071, + "learning_rate": 5e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6932480335235596, + "num_tokens": 100790702.0, + "step": 3897 + }, + { + "epoch": 0.42806940478805183, + "grad_norm": 2.108704090118408, + "learning_rate": 5e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7081494331359863, + "num_tokens": 100811540.0, + "step": 3898 + }, + { + "epoch": 0.4281792224906655, + "grad_norm": 1.8711652755737305, + "learning_rate": 5e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6914925575256348, + "num_tokens": 100845714.0, + "step": 3899 + }, + { + "epoch": 0.4282890401932792, + "grad_norm": 2.0241382122039795, + "learning_rate": 5e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7195839881896973, + "num_tokens": 100868886.0, + "step": 3900 + }, + { + "epoch": 0.4283988578958928, + "grad_norm": 2.1805055141448975, + "learning_rate": 5e-06, + "loss": 1.0956, + "mean_token_accuracy": 0.6874992847442627, + "num_tokens": 100893263.0, + "step": 3901 + }, + { + "epoch": 0.42850867559850647, + "grad_norm": 1.8995096683502197, + "learning_rate": 5e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7206445336341858, + "num_tokens": 100917972.0, + "step": 3902 + }, + { + "epoch": 0.4286184933011201, + "grad_norm": 2.201758861541748, + "learning_rate": 5e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7263615131378174, + "num_tokens": 100937686.0, + "step": 3903 + }, + { + "epoch": 0.4287283110037338, + "grad_norm": 2.0626707077026367, + "learning_rate": 5e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7350145578384399, + "num_tokens": 100961732.0, + "step": 3904 + }, + { + "epoch": 0.42883812870634747, + "grad_norm": 1.9227298498153687, + "learning_rate": 5e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7117193341255188, + "num_tokens": 100989324.0, + "step": 3905 + }, + { + "epoch": 0.4289479464089611, + "grad_norm": 1.7266019582748413, + "learning_rate": 5e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7085262537002563, + "num_tokens": 101020466.0, + "step": 3906 + }, + { + "epoch": 0.4290577641115748, + "grad_norm": 1.8993622064590454, + "learning_rate": 5e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.676455020904541, + "num_tokens": 101049154.0, + "step": 3907 + }, + { + "epoch": 0.42916758181418846, + "grad_norm": 1.911379098892212, + "learning_rate": 5e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7209010720252991, + "num_tokens": 101073122.0, + "step": 3908 + }, + { + "epoch": 0.4292773995168021, + "grad_norm": 1.673473834991455, + "learning_rate": 5e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7034109830856323, + "num_tokens": 101105581.0, + "step": 3909 + }, + { + "epoch": 0.42938721721941575, + "grad_norm": 2.3788609504699707, + "learning_rate": 5e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7074616551399231, + "num_tokens": 101124449.0, + "step": 3910 + }, + { + "epoch": 0.42949703492202945, + "grad_norm": 1.8391135931015015, + "learning_rate": 5e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7094380855560303, + "num_tokens": 101150905.0, + "step": 3911 + }, + { + "epoch": 0.4296068526246431, + "grad_norm": 2.00590443611145, + "learning_rate": 5e-06, + "loss": 1.0628, + "mean_token_accuracy": 0.6883001327514648, + "num_tokens": 101175466.0, + "step": 3912 + }, + { + "epoch": 0.42971667032725674, + "grad_norm": 1.7362946271896362, + "learning_rate": 5e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6928563117980957, + "num_tokens": 101208839.0, + "step": 3913 + }, + { + "epoch": 0.4298264880298704, + "grad_norm": 1.7211066484451294, + "learning_rate": 5e-06, + "loss": 1.1237, + "mean_token_accuracy": 0.6695321202278137, + "num_tokens": 101241376.0, + "step": 3914 + }, + { + "epoch": 0.4299363057324841, + "grad_norm": 1.9308561086654663, + "learning_rate": 5e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.6991598606109619, + "num_tokens": 101267395.0, + "step": 3915 + }, + { + "epoch": 0.43004612343509774, + "grad_norm": 1.9169621467590332, + "learning_rate": 5e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6874505281448364, + "num_tokens": 101292142.0, + "step": 3916 + }, + { + "epoch": 0.4301559411377114, + "grad_norm": 1.954326868057251, + "learning_rate": 5e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6930121779441833, + "num_tokens": 101320383.0, + "step": 3917 + }, + { + "epoch": 0.4302657588403251, + "grad_norm": 1.802148699760437, + "learning_rate": 5e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6906863451004028, + "num_tokens": 101348593.0, + "step": 3918 + }, + { + "epoch": 0.43037557654293873, + "grad_norm": 1.8103432655334473, + "learning_rate": 5e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7135549187660217, + "num_tokens": 101375412.0, + "step": 3919 + }, + { + "epoch": 0.4304853942455524, + "grad_norm": 1.9924979209899902, + "learning_rate": 5e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.719490647315979, + "num_tokens": 101396866.0, + "step": 3920 + }, + { + "epoch": 0.430595211948166, + "grad_norm": 2.136687755584717, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.718163013458252, + "num_tokens": 101419406.0, + "step": 3921 + }, + { + "epoch": 0.4307050296507797, + "grad_norm": 1.8016570806503296, + "learning_rate": 5e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7135006189346313, + "num_tokens": 101447360.0, + "step": 3922 + }, + { + "epoch": 0.43081484735339337, + "grad_norm": 1.7195786237716675, + "learning_rate": 5e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7099177837371826, + "num_tokens": 101482877.0, + "step": 3923 + }, + { + "epoch": 0.430924665056007, + "grad_norm": 1.9781420230865479, + "learning_rate": 5e-06, + "loss": 1.0777, + "mean_token_accuracy": 0.6776705980300903, + "num_tokens": 101512548.0, + "step": 3924 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 1.7895233631134033, + "learning_rate": 5e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6945875883102417, + "num_tokens": 101543903.0, + "step": 3925 + }, + { + "epoch": 0.43114430046123436, + "grad_norm": 2.007814407348633, + "learning_rate": 5e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7053995132446289, + "num_tokens": 101568170.0, + "step": 3926 + }, + { + "epoch": 0.431254118163848, + "grad_norm": 2.041099786758423, + "learning_rate": 5e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6869181394577026, + "num_tokens": 101593104.0, + "step": 3927 + }, + { + "epoch": 0.43136393586646166, + "grad_norm": 1.9615037441253662, + "learning_rate": 5e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.6997716426849365, + "num_tokens": 101618884.0, + "step": 3928 + }, + { + "epoch": 0.43147375356907536, + "grad_norm": 2.0979597568511963, + "learning_rate": 5e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6915963888168335, + "num_tokens": 101643658.0, + "step": 3929 + }, + { + "epoch": 0.431583571271689, + "grad_norm": 1.8251922130584717, + "learning_rate": 5e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6918790936470032, + "num_tokens": 101672656.0, + "step": 3930 + }, + { + "epoch": 0.43169338897430265, + "grad_norm": 1.9021469354629517, + "learning_rate": 5e-06, + "loss": 1.1214, + "mean_token_accuracy": 0.6643630266189575, + "num_tokens": 101700810.0, + "step": 3931 + }, + { + "epoch": 0.4318032066769163, + "grad_norm": 2.061955213546753, + "learning_rate": 5e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.711258053779602, + "num_tokens": 101724021.0, + "step": 3932 + }, + { + "epoch": 0.43191302437953, + "grad_norm": 1.8400226831436157, + "learning_rate": 5e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7041422128677368, + "num_tokens": 101751884.0, + "step": 3933 + }, + { + "epoch": 0.43202284208214364, + "grad_norm": 1.759745478630066, + "learning_rate": 5e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6864815950393677, + "num_tokens": 101781897.0, + "step": 3934 + }, + { + "epoch": 0.4321326597847573, + "grad_norm": 2.175271987915039, + "learning_rate": 5e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7101691961288452, + "num_tokens": 101802922.0, + "step": 3935 + }, + { + "epoch": 0.432242477487371, + "grad_norm": 2.196321725845337, + "learning_rate": 5e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7165119647979736, + "num_tokens": 101823395.0, + "step": 3936 + }, + { + "epoch": 0.43235229518998464, + "grad_norm": 2.1676182746887207, + "learning_rate": 5e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6994136571884155, + "num_tokens": 101844660.0, + "step": 3937 + }, + { + "epoch": 0.4324621128925983, + "grad_norm": 2.06390643119812, + "learning_rate": 5e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7212523221969604, + "num_tokens": 101865904.0, + "step": 3938 + }, + { + "epoch": 0.43257193059521193, + "grad_norm": 1.9371910095214844, + "learning_rate": 5e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6895408034324646, + "num_tokens": 101891094.0, + "step": 3939 + }, + { + "epoch": 0.43268174829782563, + "grad_norm": 1.9888372421264648, + "learning_rate": 5e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6929476261138916, + "num_tokens": 101917491.0, + "step": 3940 + }, + { + "epoch": 0.4327915660004393, + "grad_norm": 2.064177989959717, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7260682582855225, + "num_tokens": 101936491.0, + "step": 3941 + }, + { + "epoch": 0.4329013837030529, + "grad_norm": 2.088296413421631, + "learning_rate": 5e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7072886228561401, + "num_tokens": 101960618.0, + "step": 3942 + }, + { + "epoch": 0.43301120140566657, + "grad_norm": 1.6571028232574463, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.696489691734314, + "num_tokens": 101993308.0, + "step": 3943 + }, + { + "epoch": 0.43312101910828027, + "grad_norm": 2.1523916721343994, + "learning_rate": 5e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7058778405189514, + "num_tokens": 102016502.0, + "step": 3944 + }, + { + "epoch": 0.4332308368108939, + "grad_norm": 2.1045608520507812, + "learning_rate": 5e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.7004804611206055, + "num_tokens": 102038980.0, + "step": 3945 + }, + { + "epoch": 0.43334065451350756, + "grad_norm": 1.9591566324234009, + "learning_rate": 5e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6894124150276184, + "num_tokens": 102064079.0, + "step": 3946 + }, + { + "epoch": 0.43345047221612126, + "grad_norm": 1.9927000999450684, + "learning_rate": 5e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.6938231587409973, + "num_tokens": 102088999.0, + "step": 3947 + }, + { + "epoch": 0.4335602899187349, + "grad_norm": 1.7664271593093872, + "learning_rate": 5e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6978912949562073, + "num_tokens": 102118032.0, + "step": 3948 + }, + { + "epoch": 0.43367010762134856, + "grad_norm": 1.7408779859542847, + "learning_rate": 5e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6943168640136719, + "num_tokens": 102148368.0, + "step": 3949 + }, + { + "epoch": 0.4337799253239622, + "grad_norm": 1.99765944480896, + "learning_rate": 5e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7188506126403809, + "num_tokens": 102171189.0, + "step": 3950 + }, + { + "epoch": 0.4338897430265759, + "grad_norm": 1.7060482501983643, + "learning_rate": 5e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6944324970245361, + "num_tokens": 102199911.0, + "step": 3951 + }, + { + "epoch": 0.43399956072918955, + "grad_norm": 2.228806495666504, + "learning_rate": 5e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7023597359657288, + "num_tokens": 102218896.0, + "step": 3952 + }, + { + "epoch": 0.4341093784318032, + "grad_norm": 1.8263816833496094, + "learning_rate": 5e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7030547261238098, + "num_tokens": 102247125.0, + "step": 3953 + }, + { + "epoch": 0.43421919613441684, + "grad_norm": 2.0298216342926025, + "learning_rate": 5e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6943133473396301, + "num_tokens": 102273288.0, + "step": 3954 + }, + { + "epoch": 0.43432901383703054, + "grad_norm": 1.9918971061706543, + "learning_rate": 5e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6927766799926758, + "num_tokens": 102298580.0, + "step": 3955 + }, + { + "epoch": 0.4344388315396442, + "grad_norm": 1.7231833934783936, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7082518339157104, + "num_tokens": 102326526.0, + "step": 3956 + }, + { + "epoch": 0.43454864924225783, + "grad_norm": 1.692734718322754, + "learning_rate": 5e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7077391147613525, + "num_tokens": 102357207.0, + "step": 3957 + }, + { + "epoch": 0.43465846694487154, + "grad_norm": 1.8409165143966675, + "learning_rate": 5e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7261502742767334, + "num_tokens": 102383801.0, + "step": 3958 + }, + { + "epoch": 0.4347682846474852, + "grad_norm": 2.2286508083343506, + "learning_rate": 5e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7351133823394775, + "num_tokens": 102401406.0, + "step": 3959 + }, + { + "epoch": 0.4348781023500988, + "grad_norm": 1.6844321489334106, + "learning_rate": 5e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7026774883270264, + "num_tokens": 102432086.0, + "step": 3960 + }, + { + "epoch": 0.4349879200527125, + "grad_norm": 2.056710958480835, + "learning_rate": 5e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.6873794198036194, + "num_tokens": 102456778.0, + "step": 3961 + }, + { + "epoch": 0.4350977377553262, + "grad_norm": 1.9245028495788574, + "learning_rate": 5e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6910958290100098, + "num_tokens": 102483636.0, + "step": 3962 + }, + { + "epoch": 0.4352075554579398, + "grad_norm": 2.0976054668426514, + "learning_rate": 5e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6965999007225037, + "num_tokens": 102505569.0, + "step": 3963 + }, + { + "epoch": 0.43531737316055347, + "grad_norm": 2.066422462463379, + "learning_rate": 5e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7094283699989319, + "num_tokens": 102528357.0, + "step": 3964 + }, + { + "epoch": 0.43542719086316717, + "grad_norm": 1.808274745941162, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7105657458305359, + "num_tokens": 102556603.0, + "step": 3965 + }, + { + "epoch": 0.4355370085657808, + "grad_norm": 1.9393904209136963, + "learning_rate": 5e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7045207023620605, + "num_tokens": 102582236.0, + "step": 3966 + }, + { + "epoch": 0.43564682626839446, + "grad_norm": 1.7143046855926514, + "learning_rate": 5e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6908210515975952, + "num_tokens": 102612978.0, + "step": 3967 + }, + { + "epoch": 0.4357566439710081, + "grad_norm": 2.0230681896209717, + "learning_rate": 5e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7078221440315247, + "num_tokens": 102635943.0, + "step": 3968 + }, + { + "epoch": 0.4358664616736218, + "grad_norm": 1.826360821723938, + "learning_rate": 5e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6936321258544922, + "num_tokens": 102663166.0, + "step": 3969 + }, + { + "epoch": 0.43597627937623545, + "grad_norm": 1.8946431875228882, + "learning_rate": 5e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.6996082663536072, + "num_tokens": 102689279.0, + "step": 3970 + }, + { + "epoch": 0.4360860970788491, + "grad_norm": 2.220917224884033, + "learning_rate": 5e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.721927285194397, + "num_tokens": 102708231.0, + "step": 3971 + }, + { + "epoch": 0.43619591478146275, + "grad_norm": 2.2380406856536865, + "learning_rate": 5e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7125802636146545, + "num_tokens": 102736169.0, + "step": 3972 + }, + { + "epoch": 0.43630573248407645, + "grad_norm": 1.9267617464065552, + "learning_rate": 5e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7082613110542297, + "num_tokens": 102761378.0, + "step": 3973 + }, + { + "epoch": 0.4364155501866901, + "grad_norm": 1.9812071323394775, + "learning_rate": 5e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6949629187583923, + "num_tokens": 102786168.0, + "step": 3974 + }, + { + "epoch": 0.43652536788930374, + "grad_norm": 2.056899070739746, + "learning_rate": 5e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7141931056976318, + "num_tokens": 102808167.0, + "step": 3975 + }, + { + "epoch": 0.43663518559191744, + "grad_norm": 1.8380628824234009, + "learning_rate": 5e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7111059427261353, + "num_tokens": 102836887.0, + "step": 3976 + }, + { + "epoch": 0.4367450032945311, + "grad_norm": 1.8676183223724365, + "learning_rate": 5e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7149251699447632, + "num_tokens": 102863031.0, + "step": 3977 + }, + { + "epoch": 0.43685482099714473, + "grad_norm": 1.794487476348877, + "learning_rate": 5e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7163117527961731, + "num_tokens": 102889427.0, + "step": 3978 + }, + { + "epoch": 0.4369646386997584, + "grad_norm": 1.8060922622680664, + "learning_rate": 5e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7108234167098999, + "num_tokens": 102918050.0, + "step": 3979 + }, + { + "epoch": 0.4370744564023721, + "grad_norm": 1.821020245552063, + "learning_rate": 5e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6985533237457275, + "num_tokens": 102946720.0, + "step": 3980 + }, + { + "epoch": 0.4371842741049857, + "grad_norm": 2.0458130836486816, + "learning_rate": 5e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6796380877494812, + "num_tokens": 102971401.0, + "step": 3981 + }, + { + "epoch": 0.43729409180759937, + "grad_norm": 1.818835735321045, + "learning_rate": 5e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.676033616065979, + "num_tokens": 103000278.0, + "step": 3982 + }, + { + "epoch": 0.4374039095102131, + "grad_norm": 1.863350749015808, + "learning_rate": 5e-06, + "loss": 0.976, + "mean_token_accuracy": 0.6997073888778687, + "num_tokens": 103027077.0, + "step": 3983 + }, + { + "epoch": 0.4375137272128267, + "grad_norm": 1.6017382144927979, + "learning_rate": 5e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7072746753692627, + "num_tokens": 103060426.0, + "step": 3984 + }, + { + "epoch": 0.43762354491544037, + "grad_norm": 1.8721927404403687, + "learning_rate": 5e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7056933641433716, + "num_tokens": 103092269.0, + "step": 3985 + }, + { + "epoch": 0.437733362618054, + "grad_norm": 1.7572546005249023, + "learning_rate": 5e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.697756290435791, + "num_tokens": 103122237.0, + "step": 3986 + }, + { + "epoch": 0.4378431803206677, + "grad_norm": 1.8938484191894531, + "learning_rate": 5e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7037559747695923, + "num_tokens": 103150673.0, + "step": 3987 + }, + { + "epoch": 0.43795299802328136, + "grad_norm": 1.9962767362594604, + "learning_rate": 5e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6900984644889832, + "num_tokens": 103176927.0, + "step": 3988 + }, + { + "epoch": 0.438062815725895, + "grad_norm": 1.9725589752197266, + "learning_rate": 5e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7085869908332825, + "num_tokens": 103200122.0, + "step": 3989 + }, + { + "epoch": 0.43817263342850865, + "grad_norm": 2.1679091453552246, + "learning_rate": 5e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.680912971496582, + "num_tokens": 103222550.0, + "step": 3990 + }, + { + "epoch": 0.43828245113112235, + "grad_norm": 1.999011754989624, + "learning_rate": 5e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.699565052986145, + "num_tokens": 103248188.0, + "step": 3991 + }, + { + "epoch": 0.438392268833736, + "grad_norm": 2.0709388256073, + "learning_rate": 5e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7031365633010864, + "num_tokens": 103269683.0, + "step": 3992 + }, + { + "epoch": 0.43850208653634964, + "grad_norm": 1.8537235260009766, + "learning_rate": 5e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.724426805973053, + "num_tokens": 103297118.0, + "step": 3993 + }, + { + "epoch": 0.43861190423896335, + "grad_norm": 2.118805408477783, + "learning_rate": 5e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7127088308334351, + "num_tokens": 103319295.0, + "step": 3994 + }, + { + "epoch": 0.438721721941577, + "grad_norm": 1.7012478113174438, + "learning_rate": 5e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7281192541122437, + "num_tokens": 103351374.0, + "step": 3995 + }, + { + "epoch": 0.43883153964419064, + "grad_norm": 1.872854232788086, + "learning_rate": 5e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.7018487453460693, + "num_tokens": 103380979.0, + "step": 3996 + }, + { + "epoch": 0.4389413573468043, + "grad_norm": 1.8513911962509155, + "learning_rate": 5e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6857377886772156, + "num_tokens": 103410308.0, + "step": 3997 + }, + { + "epoch": 0.439051175049418, + "grad_norm": 2.1057984828948975, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7063994407653809, + "num_tokens": 103432473.0, + "step": 3998 + }, + { + "epoch": 0.43916099275203163, + "grad_norm": 2.0195960998535156, + "learning_rate": 5e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.6965851783752441, + "num_tokens": 103456832.0, + "step": 3999 + }, + { + "epoch": 0.4392708104546453, + "grad_norm": 2.002790689468384, + "learning_rate": 5e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7060288786888123, + "num_tokens": 103480891.0, + "step": 4000 + }, + { + "epoch": 0.4393806281572589, + "grad_norm": 1.9142130613327026, + "learning_rate": 5e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.6947957277297974, + "num_tokens": 103506514.0, + "step": 4001 + }, + { + "epoch": 0.4394904458598726, + "grad_norm": 1.7829395532608032, + "learning_rate": 5e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.696480393409729, + "num_tokens": 103532566.0, + "step": 4002 + }, + { + "epoch": 0.43960026356248627, + "grad_norm": 1.9509583711624146, + "learning_rate": 5e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6963920593261719, + "num_tokens": 103557782.0, + "step": 4003 + }, + { + "epoch": 0.4397100812650999, + "grad_norm": 2.0559043884277344, + "learning_rate": 5e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.71296226978302, + "num_tokens": 103580416.0, + "step": 4004 + }, + { + "epoch": 0.4398198989677136, + "grad_norm": 1.821999430656433, + "learning_rate": 5e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.711886465549469, + "num_tokens": 103608359.0, + "step": 4005 + }, + { + "epoch": 0.43992971667032726, + "grad_norm": 1.950402855873108, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7172862887382507, + "num_tokens": 103631987.0, + "step": 4006 + }, + { + "epoch": 0.4400395343729409, + "grad_norm": 2.0393426418304443, + "learning_rate": 5e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6980862021446228, + "num_tokens": 103656245.0, + "step": 4007 + }, + { + "epoch": 0.44014935207555456, + "grad_norm": 1.8361581563949585, + "learning_rate": 5e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6978147029876709, + "num_tokens": 103688079.0, + "step": 4008 + }, + { + "epoch": 0.44025916977816826, + "grad_norm": 1.6578940153121948, + "learning_rate": 5e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7010778188705444, + "num_tokens": 103719981.0, + "step": 4009 + }, + { + "epoch": 0.4403689874807819, + "grad_norm": 1.9202150106430054, + "learning_rate": 5e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7007542252540588, + "num_tokens": 103743761.0, + "step": 4010 + }, + { + "epoch": 0.44047880518339555, + "grad_norm": 1.920334815979004, + "learning_rate": 5e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7116034030914307, + "num_tokens": 103769529.0, + "step": 4011 + }, + { + "epoch": 0.44058862288600925, + "grad_norm": 1.8857767581939697, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7193981409072876, + "num_tokens": 103794467.0, + "step": 4012 + }, + { + "epoch": 0.4406984405886229, + "grad_norm": 1.9441733360290527, + "learning_rate": 5e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6877629160881042, + "num_tokens": 103821245.0, + "step": 4013 + }, + { + "epoch": 0.44080825829123654, + "grad_norm": 1.9950543642044067, + "learning_rate": 5e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6886505484580994, + "num_tokens": 103846621.0, + "step": 4014 + }, + { + "epoch": 0.4409180759938502, + "grad_norm": 2.7121152877807617, + "learning_rate": 5e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7038000822067261, + "num_tokens": 103869584.0, + "step": 4015 + }, + { + "epoch": 0.4410278936964639, + "grad_norm": 2.0442426204681396, + "learning_rate": 5e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7018109560012817, + "num_tokens": 103896368.0, + "step": 4016 + }, + { + "epoch": 0.44113771139907754, + "grad_norm": 4.518025875091553, + "learning_rate": 5e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.70078045129776, + "num_tokens": 103922456.0, + "step": 4017 + }, + { + "epoch": 0.4412475291016912, + "grad_norm": 2.0924575328826904, + "learning_rate": 5e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7221446633338928, + "num_tokens": 103946183.0, + "step": 4018 + }, + { + "epoch": 0.44135734680430483, + "grad_norm": 2.019289016723633, + "learning_rate": 5e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7191385626792908, + "num_tokens": 103970489.0, + "step": 4019 + }, + { + "epoch": 0.44146716450691853, + "grad_norm": 1.9554197788238525, + "learning_rate": 5e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7098884582519531, + "num_tokens": 103994462.0, + "step": 4020 + }, + { + "epoch": 0.4415769822095322, + "grad_norm": 1.7887159585952759, + "learning_rate": 5e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6856757402420044, + "num_tokens": 104021513.0, + "step": 4021 + }, + { + "epoch": 0.4416867999121458, + "grad_norm": 1.86651611328125, + "learning_rate": 5e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7122811079025269, + "num_tokens": 104049136.0, + "step": 4022 + }, + { + "epoch": 0.4417966176147595, + "grad_norm": 2.0999128818511963, + "learning_rate": 5e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6908116340637207, + "num_tokens": 104073514.0, + "step": 4023 + }, + { + "epoch": 0.44190643531737317, + "grad_norm": 1.9323813915252686, + "learning_rate": 5e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6821398735046387, + "num_tokens": 104099915.0, + "step": 4024 + }, + { + "epoch": 0.4420162530199868, + "grad_norm": 1.841973066329956, + "learning_rate": 5e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7202939391136169, + "num_tokens": 104125295.0, + "step": 4025 + }, + { + "epoch": 0.44212607072260046, + "grad_norm": 1.625004529953003, + "learning_rate": 5e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6919214725494385, + "num_tokens": 104162428.0, + "step": 4026 + }, + { + "epoch": 0.44223588842521416, + "grad_norm": 2.0454492568969727, + "learning_rate": 5e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7076772451400757, + "num_tokens": 104184714.0, + "step": 4027 + }, + { + "epoch": 0.4423457061278278, + "grad_norm": 1.9202719926834106, + "learning_rate": 5e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.6696711778640747, + "num_tokens": 104214232.0, + "step": 4028 + }, + { + "epoch": 0.44245552383044146, + "grad_norm": 2.204768180847168, + "learning_rate": 5e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7159836292266846, + "num_tokens": 104236982.0, + "step": 4029 + }, + { + "epoch": 0.4425653415330551, + "grad_norm": 1.9225517511367798, + "learning_rate": 5e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7570313215255737, + "num_tokens": 104258467.0, + "step": 4030 + }, + { + "epoch": 0.4426751592356688, + "grad_norm": 1.7409507036209106, + "learning_rate": 5e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6936998963356018, + "num_tokens": 104288875.0, + "step": 4031 + }, + { + "epoch": 0.44278497693828245, + "grad_norm": 1.8024520874023438, + "learning_rate": 5e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6941861510276794, + "num_tokens": 104318424.0, + "step": 4032 + }, + { + "epoch": 0.4428947946408961, + "grad_norm": 1.9572608470916748, + "learning_rate": 5e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6979261636734009, + "num_tokens": 104343720.0, + "step": 4033 + }, + { + "epoch": 0.4430046123435098, + "grad_norm": 1.932750940322876, + "learning_rate": 5e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7106261253356934, + "num_tokens": 104370432.0, + "step": 4034 + }, + { + "epoch": 0.44311443004612344, + "grad_norm": 1.935870885848999, + "learning_rate": 5e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.683972954750061, + "num_tokens": 104396400.0, + "step": 4035 + }, + { + "epoch": 0.4432242477487371, + "grad_norm": 1.9512028694152832, + "learning_rate": 5e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6876912117004395, + "num_tokens": 104421584.0, + "step": 4036 + }, + { + "epoch": 0.44333406545135073, + "grad_norm": 1.9768925905227661, + "learning_rate": 5e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7092212438583374, + "num_tokens": 104446339.0, + "step": 4037 + }, + { + "epoch": 0.44344388315396444, + "grad_norm": 1.919503092765808, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7184082865715027, + "num_tokens": 104469699.0, + "step": 4038 + }, + { + "epoch": 0.4435537008565781, + "grad_norm": 1.925748586654663, + "learning_rate": 5e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.6736140251159668, + "num_tokens": 104496078.0, + "step": 4039 + }, + { + "epoch": 0.4436635185591917, + "grad_norm": 1.8712568283081055, + "learning_rate": 5e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7348552942276001, + "num_tokens": 104523848.0, + "step": 4040 + }, + { + "epoch": 0.44377333626180543, + "grad_norm": 1.7177027463912964, + "learning_rate": 5e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7106122374534607, + "num_tokens": 104556398.0, + "step": 4041 + }, + { + "epoch": 0.4438831539644191, + "grad_norm": 1.8420673608779907, + "learning_rate": 5e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7103720903396606, + "num_tokens": 104583121.0, + "step": 4042 + }, + { + "epoch": 0.4439929716670327, + "grad_norm": 1.8655130863189697, + "learning_rate": 5e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6876513957977295, + "num_tokens": 104612496.0, + "step": 4043 + }, + { + "epoch": 0.44410278936964637, + "grad_norm": 1.885434865951538, + "learning_rate": 5e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6856964826583862, + "num_tokens": 104640425.0, + "step": 4044 + }, + { + "epoch": 0.44421260707226007, + "grad_norm": 1.9141076803207397, + "learning_rate": 5e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7181949615478516, + "num_tokens": 104665029.0, + "step": 4045 + }, + { + "epoch": 0.4443224247748737, + "grad_norm": 2.0831222534179688, + "learning_rate": 5e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.697921097278595, + "num_tokens": 104688250.0, + "step": 4046 + }, + { + "epoch": 0.44443224247748736, + "grad_norm": 2.0086851119995117, + "learning_rate": 5e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7237329483032227, + "num_tokens": 104715060.0, + "step": 4047 + }, + { + "epoch": 0.444542060180101, + "grad_norm": 1.9194703102111816, + "learning_rate": 5e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.70953369140625, + "num_tokens": 104743917.0, + "step": 4048 + }, + { + "epoch": 0.4446518778827147, + "grad_norm": 2.1395652294158936, + "learning_rate": 5e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7310787439346313, + "num_tokens": 104763849.0, + "step": 4049 + }, + { + "epoch": 0.44476169558532835, + "grad_norm": 1.969133973121643, + "learning_rate": 5e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7065402269363403, + "num_tokens": 104788284.0, + "step": 4050 + }, + { + "epoch": 0.444871513287942, + "grad_norm": 2.0115623474121094, + "learning_rate": 5e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6935338973999023, + "num_tokens": 104813038.0, + "step": 4051 + }, + { + "epoch": 0.4449813309905557, + "grad_norm": 2.041412115097046, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7189571857452393, + "num_tokens": 104834993.0, + "step": 4052 + }, + { + "epoch": 0.44509114869316935, + "grad_norm": 1.8102984428405762, + "learning_rate": 5e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6851779818534851, + "num_tokens": 104866325.0, + "step": 4053 + }, + { + "epoch": 0.445200966395783, + "grad_norm": 1.8685883283615112, + "learning_rate": 5e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7037556767463684, + "num_tokens": 104892215.0, + "step": 4054 + }, + { + "epoch": 0.44531078409839664, + "grad_norm": 1.7089288234710693, + "learning_rate": 5e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6908316612243652, + "num_tokens": 104922495.0, + "step": 4055 + }, + { + "epoch": 0.44542060180101034, + "grad_norm": 1.9722366333007812, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.718543291091919, + "num_tokens": 104946653.0, + "step": 4056 + }, + { + "epoch": 0.445530419503624, + "grad_norm": 2.156264543533325, + "learning_rate": 5e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7002186179161072, + "num_tokens": 104969117.0, + "step": 4057 + }, + { + "epoch": 0.44564023720623763, + "grad_norm": 1.982948899269104, + "learning_rate": 5e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.6850752830505371, + "num_tokens": 104994886.0, + "step": 4058 + }, + { + "epoch": 0.44575005490885133, + "grad_norm": 1.8237431049346924, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7238032817840576, + "num_tokens": 105022690.0, + "step": 4059 + }, + { + "epoch": 0.445859872611465, + "grad_norm": 1.9789204597473145, + "learning_rate": 5e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7088108062744141, + "num_tokens": 105046372.0, + "step": 4060 + }, + { + "epoch": 0.4459696903140786, + "grad_norm": 2.041999578475952, + "learning_rate": 5e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7362014055252075, + "num_tokens": 105068776.0, + "step": 4061 + }, + { + "epoch": 0.44607950801669227, + "grad_norm": 1.9859539270401, + "learning_rate": 5e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6851739883422852, + "num_tokens": 105093196.0, + "step": 4062 + }, + { + "epoch": 0.446189325719306, + "grad_norm": 2.1081349849700928, + "learning_rate": 5e-06, + "loss": 1.0659, + "mean_token_accuracy": 0.6770923137664795, + "num_tokens": 105119644.0, + "step": 4063 + }, + { + "epoch": 0.4462991434219196, + "grad_norm": 1.8235775232315063, + "learning_rate": 5e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7042226195335388, + "num_tokens": 105146836.0, + "step": 4064 + }, + { + "epoch": 0.44640896112453327, + "grad_norm": 1.999161958694458, + "learning_rate": 5e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7006126046180725, + "num_tokens": 105170148.0, + "step": 4065 + }, + { + "epoch": 0.4465187788271469, + "grad_norm": 1.905159831047058, + "learning_rate": 5e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7034614086151123, + "num_tokens": 105195665.0, + "step": 4066 + }, + { + "epoch": 0.4466285965297606, + "grad_norm": 1.8185068368911743, + "learning_rate": 5e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6967105269432068, + "num_tokens": 105223242.0, + "step": 4067 + }, + { + "epoch": 0.44673841423237426, + "grad_norm": 1.7795071601867676, + "learning_rate": 5e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6975491642951965, + "num_tokens": 105255691.0, + "step": 4068 + }, + { + "epoch": 0.4468482319349879, + "grad_norm": 1.9016259908676147, + "learning_rate": 5e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6789798140525818, + "num_tokens": 105283327.0, + "step": 4069 + }, + { + "epoch": 0.4469580496376016, + "grad_norm": 1.87152099609375, + "learning_rate": 5e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7057156562805176, + "num_tokens": 105308813.0, + "step": 4070 + }, + { + "epoch": 0.44706786734021525, + "grad_norm": 1.9689414501190186, + "learning_rate": 5e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6936085224151611, + "num_tokens": 105332742.0, + "step": 4071 + }, + { + "epoch": 0.4471776850428289, + "grad_norm": 1.9872184991836548, + "learning_rate": 5e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.689830482006073, + "num_tokens": 105358322.0, + "step": 4072 + }, + { + "epoch": 0.44728750274544254, + "grad_norm": 2.0815868377685547, + "learning_rate": 5e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6950440406799316, + "num_tokens": 105379973.0, + "step": 4073 + }, + { + "epoch": 0.44739732044805625, + "grad_norm": 1.9217197895050049, + "learning_rate": 5e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7070025205612183, + "num_tokens": 105405032.0, + "step": 4074 + }, + { + "epoch": 0.4475071381506699, + "grad_norm": 2.0256383419036865, + "learning_rate": 5e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.705773115158081, + "num_tokens": 105427672.0, + "step": 4075 + }, + { + "epoch": 0.44761695585328354, + "grad_norm": 2.312880754470825, + "learning_rate": 5e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7272151708602905, + "num_tokens": 105445983.0, + "step": 4076 + }, + { + "epoch": 0.4477267735558972, + "grad_norm": 1.689944863319397, + "learning_rate": 5e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7059404850006104, + "num_tokens": 105476232.0, + "step": 4077 + }, + { + "epoch": 0.4478365912585109, + "grad_norm": 1.7907507419586182, + "learning_rate": 5e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6904484033584595, + "num_tokens": 105506139.0, + "step": 4078 + }, + { + "epoch": 0.44794640896112453, + "grad_norm": 2.2012722492218018, + "learning_rate": 5e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.693952202796936, + "num_tokens": 105527095.0, + "step": 4079 + }, + { + "epoch": 0.4480562266637382, + "grad_norm": 1.9336167573928833, + "learning_rate": 5e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7093725204467773, + "num_tokens": 105552302.0, + "step": 4080 + }, + { + "epoch": 0.4481660443663519, + "grad_norm": 2.212183713912964, + "learning_rate": 5e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.6970518231391907, + "num_tokens": 105572600.0, + "step": 4081 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 2.0454442501068115, + "learning_rate": 5e-06, + "loss": 1.0769, + "mean_token_accuracy": 0.6754294633865356, + "num_tokens": 105596835.0, + "step": 4082 + }, + { + "epoch": 0.44838567977157917, + "grad_norm": 1.7667038440704346, + "learning_rate": 5e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6902449131011963, + "num_tokens": 105631265.0, + "step": 4083 + }, + { + "epoch": 0.4484954974741928, + "grad_norm": 2.270988941192627, + "learning_rate": 5e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.7083949446678162, + "num_tokens": 105654198.0, + "step": 4084 + }, + { + "epoch": 0.4486053151768065, + "grad_norm": 1.960925817489624, + "learning_rate": 5e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7028655409812927, + "num_tokens": 105677209.0, + "step": 4085 + }, + { + "epoch": 0.44871513287942016, + "grad_norm": 2.1169750690460205, + "learning_rate": 5e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6999370455741882, + "num_tokens": 105699273.0, + "step": 4086 + }, + { + "epoch": 0.4488249505820338, + "grad_norm": 1.902841567993164, + "learning_rate": 5e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.704778790473938, + "num_tokens": 105724131.0, + "step": 4087 + }, + { + "epoch": 0.4489347682846475, + "grad_norm": 1.9048136472702026, + "learning_rate": 5e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6854944825172424, + "num_tokens": 105752725.0, + "step": 4088 + }, + { + "epoch": 0.44904458598726116, + "grad_norm": 1.9188674688339233, + "learning_rate": 5e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6982306241989136, + "num_tokens": 105781433.0, + "step": 4089 + }, + { + "epoch": 0.4491544036898748, + "grad_norm": 2.3162901401519775, + "learning_rate": 5e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6925349831581116, + "num_tokens": 105802423.0, + "step": 4090 + }, + { + "epoch": 0.44926422139248845, + "grad_norm": 2.0020580291748047, + "learning_rate": 5e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6935847997665405, + "num_tokens": 105829891.0, + "step": 4091 + }, + { + "epoch": 0.44937403909510215, + "grad_norm": 1.866777777671814, + "learning_rate": 5e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6844085454940796, + "num_tokens": 105857112.0, + "step": 4092 + }, + { + "epoch": 0.4494838567977158, + "grad_norm": 2.1311304569244385, + "learning_rate": 5e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6912317276000977, + "num_tokens": 105879728.0, + "step": 4093 + }, + { + "epoch": 0.44959367450032944, + "grad_norm": 1.7577669620513916, + "learning_rate": 5e-06, + "loss": 1.0661, + "mean_token_accuracy": 0.6767157316207886, + "num_tokens": 105910527.0, + "step": 4094 + }, + { + "epoch": 0.4497034922029431, + "grad_norm": 1.980668306350708, + "learning_rate": 5e-06, + "loss": 0.965, + "mean_token_accuracy": 0.702700138092041, + "num_tokens": 105934356.0, + "step": 4095 + }, + { + "epoch": 0.4498133099055568, + "grad_norm": 1.784298062324524, + "learning_rate": 5e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6881077885627747, + "num_tokens": 105963414.0, + "step": 4096 + }, + { + "epoch": 0.44992312760817044, + "grad_norm": 1.8350929021835327, + "learning_rate": 5e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7128546237945557, + "num_tokens": 105988331.0, + "step": 4097 + }, + { + "epoch": 0.4500329453107841, + "grad_norm": 1.927871823310852, + "learning_rate": 5e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7069148421287537, + "num_tokens": 106015110.0, + "step": 4098 + }, + { + "epoch": 0.4501427630133978, + "grad_norm": 1.7495275735855103, + "learning_rate": 5e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6755725145339966, + "num_tokens": 106047233.0, + "step": 4099 + }, + { + "epoch": 0.45025258071601143, + "grad_norm": 1.8751970529556274, + "learning_rate": 5e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.706879734992981, + "num_tokens": 106073003.0, + "step": 4100 + }, + { + "epoch": 0.4503623984186251, + "grad_norm": 1.8015174865722656, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7231708765029907, + "num_tokens": 106098628.0, + "step": 4101 + }, + { + "epoch": 0.4504722161212387, + "grad_norm": 1.9978771209716797, + "learning_rate": 5e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7035908699035645, + "num_tokens": 106123184.0, + "step": 4102 + }, + { + "epoch": 0.4505820338238524, + "grad_norm": 1.892073631286621, + "learning_rate": 5e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6723883748054504, + "num_tokens": 106152585.0, + "step": 4103 + }, + { + "epoch": 0.45069185152646607, + "grad_norm": 2.2403922080993652, + "learning_rate": 5e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6914063692092896, + "num_tokens": 106175812.0, + "step": 4104 + }, + { + "epoch": 0.4508016692290797, + "grad_norm": 1.9378741979599, + "learning_rate": 5e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6802738904953003, + "num_tokens": 106204422.0, + "step": 4105 + }, + { + "epoch": 0.45091148693169336, + "grad_norm": 2.134492874145508, + "learning_rate": 5e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7021424174308777, + "num_tokens": 106225789.0, + "step": 4106 + }, + { + "epoch": 0.45102130463430706, + "grad_norm": 1.9013255834579468, + "learning_rate": 5e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7181665301322937, + "num_tokens": 106249911.0, + "step": 4107 + }, + { + "epoch": 0.4511311223369207, + "grad_norm": 1.840586543083191, + "learning_rate": 5e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.680656909942627, + "num_tokens": 106277352.0, + "step": 4108 + }, + { + "epoch": 0.45124094003953436, + "grad_norm": 1.9470982551574707, + "learning_rate": 5e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.6807568669319153, + "num_tokens": 106302769.0, + "step": 4109 + }, + { + "epoch": 0.45135075774214806, + "grad_norm": 1.9372947216033936, + "learning_rate": 5e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7123606204986572, + "num_tokens": 106325698.0, + "step": 4110 + }, + { + "epoch": 0.4514605754447617, + "grad_norm": 1.9161481857299805, + "learning_rate": 5e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6948897838592529, + "num_tokens": 106349094.0, + "step": 4111 + }, + { + "epoch": 0.45157039314737535, + "grad_norm": 2.3529365062713623, + "learning_rate": 5e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7120020389556885, + "num_tokens": 106368736.0, + "step": 4112 + }, + { + "epoch": 0.451680210849989, + "grad_norm": 2.1775407791137695, + "learning_rate": 5e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7180723547935486, + "num_tokens": 106389084.0, + "step": 4113 + }, + { + "epoch": 0.4517900285526027, + "grad_norm": 2.096889019012451, + "learning_rate": 5e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6864967346191406, + "num_tokens": 106414881.0, + "step": 4114 + }, + { + "epoch": 0.45189984625521634, + "grad_norm": 1.8519933223724365, + "learning_rate": 5e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7160179615020752, + "num_tokens": 106441320.0, + "step": 4115 + }, + { + "epoch": 0.45200966395783, + "grad_norm": 1.7903491258621216, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7456400990486145, + "num_tokens": 106467830.0, + "step": 4116 + }, + { + "epoch": 0.4521194816604437, + "grad_norm": 2.0412981510162354, + "learning_rate": 5e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7193589210510254, + "num_tokens": 106491317.0, + "step": 4117 + }, + { + "epoch": 0.45222929936305734, + "grad_norm": 1.9062731266021729, + "learning_rate": 5e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7035601735115051, + "num_tokens": 106519465.0, + "step": 4118 + }, + { + "epoch": 0.452339117065671, + "grad_norm": 1.906276822090149, + "learning_rate": 5e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.6798346042633057, + "num_tokens": 106548959.0, + "step": 4119 + }, + { + "epoch": 0.4524489347682846, + "grad_norm": 2.007502317428589, + "learning_rate": 5e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6917815208435059, + "num_tokens": 106572369.0, + "step": 4120 + }, + { + "epoch": 0.45255875247089833, + "grad_norm": 2.0458762645721436, + "learning_rate": 5e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.698494553565979, + "num_tokens": 106595928.0, + "step": 4121 + }, + { + "epoch": 0.452668570173512, + "grad_norm": 2.078799247741699, + "learning_rate": 5e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.697331428527832, + "num_tokens": 106619735.0, + "step": 4122 + }, + { + "epoch": 0.4527783878761256, + "grad_norm": 1.9856730699539185, + "learning_rate": 5e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7008934020996094, + "num_tokens": 106643465.0, + "step": 4123 + }, + { + "epoch": 0.45288820557873927, + "grad_norm": 1.915998935699463, + "learning_rate": 5e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7096706628799438, + "num_tokens": 106668790.0, + "step": 4124 + }, + { + "epoch": 0.45299802328135297, + "grad_norm": 1.8371825218200684, + "learning_rate": 5e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7003001570701599, + "num_tokens": 106696351.0, + "step": 4125 + }, + { + "epoch": 0.4531078409839666, + "grad_norm": 2.007983922958374, + "learning_rate": 5e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.700736403465271, + "num_tokens": 106718922.0, + "step": 4126 + }, + { + "epoch": 0.45321765868658026, + "grad_norm": 2.0195565223693848, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.719953179359436, + "num_tokens": 106740890.0, + "step": 4127 + }, + { + "epoch": 0.45332747638919396, + "grad_norm": 1.9203490018844604, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7147597670555115, + "num_tokens": 106769771.0, + "step": 4128 + }, + { + "epoch": 0.4534372940918076, + "grad_norm": 2.1298036575317383, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7293713092803955, + "num_tokens": 106788996.0, + "step": 4129 + }, + { + "epoch": 0.45354711179442125, + "grad_norm": 1.726337194442749, + "learning_rate": 5e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7146607041358948, + "num_tokens": 106822123.0, + "step": 4130 + }, + { + "epoch": 0.4536569294970349, + "grad_norm": 2.0226969718933105, + "learning_rate": 5e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.6993066072463989, + "num_tokens": 106846813.0, + "step": 4131 + }, + { + "epoch": 0.4537667471996486, + "grad_norm": 1.7964863777160645, + "learning_rate": 5e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6831183433532715, + "num_tokens": 106875985.0, + "step": 4132 + }, + { + "epoch": 0.45387656490226225, + "grad_norm": 2.0644640922546387, + "learning_rate": 5e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7131290435791016, + "num_tokens": 106898626.0, + "step": 4133 + }, + { + "epoch": 0.4539863826048759, + "grad_norm": 2.0907702445983887, + "learning_rate": 5e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6840262413024902, + "num_tokens": 106924375.0, + "step": 4134 + }, + { + "epoch": 0.4540962003074896, + "grad_norm": 2.1166038513183594, + "learning_rate": 5e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7081091403961182, + "num_tokens": 106945507.0, + "step": 4135 + }, + { + "epoch": 0.45420601801010324, + "grad_norm": 1.9715811014175415, + "learning_rate": 5e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6911954879760742, + "num_tokens": 106970424.0, + "step": 4136 + }, + { + "epoch": 0.4543158357127169, + "grad_norm": 1.7905399799346924, + "learning_rate": 5e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7110903263092041, + "num_tokens": 106998132.0, + "step": 4137 + }, + { + "epoch": 0.45442565341533053, + "grad_norm": 2.0874722003936768, + "learning_rate": 5e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7211333513259888, + "num_tokens": 107020342.0, + "step": 4138 + }, + { + "epoch": 0.45453547111794423, + "grad_norm": 2.1797077655792236, + "learning_rate": 5e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7044249773025513, + "num_tokens": 107040979.0, + "step": 4139 + }, + { + "epoch": 0.4546452888205579, + "grad_norm": 1.9789328575134277, + "learning_rate": 5e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7264686822891235, + "num_tokens": 107065543.0, + "step": 4140 + }, + { + "epoch": 0.4547551065231715, + "grad_norm": 1.8455253839492798, + "learning_rate": 5e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7077780961990356, + "num_tokens": 107091235.0, + "step": 4141 + }, + { + "epoch": 0.45486492422578517, + "grad_norm": 1.9213447570800781, + "learning_rate": 5e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7160005569458008, + "num_tokens": 107115633.0, + "step": 4142 + }, + { + "epoch": 0.4549747419283989, + "grad_norm": 1.8993251323699951, + "learning_rate": 5e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6886295080184937, + "num_tokens": 107143754.0, + "step": 4143 + }, + { + "epoch": 0.4550845596310125, + "grad_norm": 2.0893073081970215, + "learning_rate": 5e-06, + "loss": 1.15, + "mean_token_accuracy": 0.6757117509841919, + "num_tokens": 107170029.0, + "step": 4144 + }, + { + "epoch": 0.45519437733362617, + "grad_norm": 1.8733189105987549, + "learning_rate": 5e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7045813798904419, + "num_tokens": 107196605.0, + "step": 4145 + }, + { + "epoch": 0.45530419503623987, + "grad_norm": 1.6934194564819336, + "learning_rate": 5e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7178218960762024, + "num_tokens": 107226698.0, + "step": 4146 + }, + { + "epoch": 0.4554140127388535, + "grad_norm": 1.885201334953308, + "learning_rate": 5e-06, + "loss": 1.069, + "mean_token_accuracy": 0.6805329322814941, + "num_tokens": 107253646.0, + "step": 4147 + }, + { + "epoch": 0.45552383044146716, + "grad_norm": 1.9847015142440796, + "learning_rate": 5e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7182780504226685, + "num_tokens": 107275965.0, + "step": 4148 + }, + { + "epoch": 0.4556336481440808, + "grad_norm": 1.8720389604568481, + "learning_rate": 5e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6973186731338501, + "num_tokens": 107301925.0, + "step": 4149 + }, + { + "epoch": 0.4557434658466945, + "grad_norm": 1.9295812845230103, + "learning_rate": 5e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6923722624778748, + "num_tokens": 107326859.0, + "step": 4150 + }, + { + "epoch": 0.45585328354930815, + "grad_norm": 2.379427194595337, + "learning_rate": 5e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6921977996826172, + "num_tokens": 107346803.0, + "step": 4151 + }, + { + "epoch": 0.4559631012519218, + "grad_norm": 2.1128787994384766, + "learning_rate": 5e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6900424957275391, + "num_tokens": 107369689.0, + "step": 4152 + }, + { + "epoch": 0.45607291895453544, + "grad_norm": 1.9064342975616455, + "learning_rate": 5e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7147035598754883, + "num_tokens": 107397017.0, + "step": 4153 + }, + { + "epoch": 0.45618273665714915, + "grad_norm": 1.9542039632797241, + "learning_rate": 5e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7076994180679321, + "num_tokens": 107422252.0, + "step": 4154 + }, + { + "epoch": 0.4562925543597628, + "grad_norm": 1.6830812692642212, + "learning_rate": 5e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7000441551208496, + "num_tokens": 107452185.0, + "step": 4155 + }, + { + "epoch": 0.45640237206237644, + "grad_norm": 1.973787784576416, + "learning_rate": 5e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7118844985961914, + "num_tokens": 107476107.0, + "step": 4156 + }, + { + "epoch": 0.45651218976499014, + "grad_norm": 1.8772752285003662, + "learning_rate": 5e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7033633589744568, + "num_tokens": 107504088.0, + "step": 4157 + }, + { + "epoch": 0.4566220074676038, + "grad_norm": 2.299790382385254, + "learning_rate": 5e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.693401575088501, + "num_tokens": 107524795.0, + "step": 4158 + }, + { + "epoch": 0.45673182517021743, + "grad_norm": 1.8648985624313354, + "learning_rate": 5e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7199997305870056, + "num_tokens": 107550059.0, + "step": 4159 + }, + { + "epoch": 0.4568416428728311, + "grad_norm": 1.7729474306106567, + "learning_rate": 5e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.726998507976532, + "num_tokens": 107576140.0, + "step": 4160 + }, + { + "epoch": 0.4569514605754448, + "grad_norm": 1.980231761932373, + "learning_rate": 5e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6856001615524292, + "num_tokens": 107604161.0, + "step": 4161 + }, + { + "epoch": 0.4570612782780584, + "grad_norm": 1.870469331741333, + "learning_rate": 5e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6833384037017822, + "num_tokens": 107632009.0, + "step": 4162 + }, + { + "epoch": 0.45717109598067207, + "grad_norm": 1.935984492301941, + "learning_rate": 5e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.694174587726593, + "num_tokens": 107660317.0, + "step": 4163 + }, + { + "epoch": 0.4572809136832858, + "grad_norm": 1.9718468189239502, + "learning_rate": 5e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7081059217453003, + "num_tokens": 107686258.0, + "step": 4164 + }, + { + "epoch": 0.4573907313858994, + "grad_norm": 2.039257287979126, + "learning_rate": 5e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7087362408638, + "num_tokens": 107710429.0, + "step": 4165 + }, + { + "epoch": 0.45750054908851306, + "grad_norm": 1.6674803495407104, + "learning_rate": 5e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7167775630950928, + "num_tokens": 107740088.0, + "step": 4166 + }, + { + "epoch": 0.4576103667911267, + "grad_norm": 1.833262324333191, + "learning_rate": 5e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6947203874588013, + "num_tokens": 107769428.0, + "step": 4167 + }, + { + "epoch": 0.4577201844937404, + "grad_norm": 1.7257719039916992, + "learning_rate": 5e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.7019000053405762, + "num_tokens": 107802252.0, + "step": 4168 + }, + { + "epoch": 0.45783000219635406, + "grad_norm": 1.881089687347412, + "learning_rate": 5e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6943549513816833, + "num_tokens": 107827614.0, + "step": 4169 + }, + { + "epoch": 0.4579398198989677, + "grad_norm": 1.9227317571640015, + "learning_rate": 5e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.714805543422699, + "num_tokens": 107853019.0, + "step": 4170 + }, + { + "epoch": 0.45804963760158135, + "grad_norm": 2.0925517082214355, + "learning_rate": 5e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6854932904243469, + "num_tokens": 107877005.0, + "step": 4171 + }, + { + "epoch": 0.45815945530419505, + "grad_norm": 1.9181692600250244, + "learning_rate": 5e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6861501932144165, + "num_tokens": 107903956.0, + "step": 4172 + }, + { + "epoch": 0.4582692730068087, + "grad_norm": 1.7378462553024292, + "learning_rate": 5e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7180261611938477, + "num_tokens": 107931780.0, + "step": 4173 + }, + { + "epoch": 0.45837909070942234, + "grad_norm": 1.7812424898147583, + "learning_rate": 5e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7102825045585632, + "num_tokens": 107961869.0, + "step": 4174 + }, + { + "epoch": 0.45848890841203604, + "grad_norm": 2.0544795989990234, + "learning_rate": 5e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7056329846382141, + "num_tokens": 107985102.0, + "step": 4175 + }, + { + "epoch": 0.4585987261146497, + "grad_norm": 2.0133984088897705, + "learning_rate": 5e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.701346755027771, + "num_tokens": 108009067.0, + "step": 4176 + }, + { + "epoch": 0.45870854381726334, + "grad_norm": 1.9329777956008911, + "learning_rate": 5e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.687584400177002, + "num_tokens": 108034786.0, + "step": 4177 + }, + { + "epoch": 0.458818361519877, + "grad_norm": 1.7850618362426758, + "learning_rate": 5e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6973816156387329, + "num_tokens": 108064647.0, + "step": 4178 + }, + { + "epoch": 0.4589281792224907, + "grad_norm": 1.7637691497802734, + "learning_rate": 5e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6945163607597351, + "num_tokens": 108095292.0, + "step": 4179 + }, + { + "epoch": 0.45903799692510433, + "grad_norm": 2.1502835750579834, + "learning_rate": 5e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7180137634277344, + "num_tokens": 108114559.0, + "step": 4180 + }, + { + "epoch": 0.459147814627718, + "grad_norm": 2.0509510040283203, + "learning_rate": 5e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7257292866706848, + "num_tokens": 108135559.0, + "step": 4181 + }, + { + "epoch": 0.4592576323303316, + "grad_norm": 2.1722464561462402, + "learning_rate": 5e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7019791603088379, + "num_tokens": 108156867.0, + "step": 4182 + }, + { + "epoch": 0.4593674500329453, + "grad_norm": 1.5485445261001587, + "learning_rate": 5e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7065445184707642, + "num_tokens": 108196105.0, + "step": 4183 + }, + { + "epoch": 0.45947726773555897, + "grad_norm": 1.8137918710708618, + "learning_rate": 5e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7312265634536743, + "num_tokens": 108222688.0, + "step": 4184 + }, + { + "epoch": 0.4595870854381726, + "grad_norm": 2.016913652420044, + "learning_rate": 5e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.692412793636322, + "num_tokens": 108246174.0, + "step": 4185 + }, + { + "epoch": 0.4596969031407863, + "grad_norm": 1.8668172359466553, + "learning_rate": 5e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7164645195007324, + "num_tokens": 108273108.0, + "step": 4186 + }, + { + "epoch": 0.45980672084339996, + "grad_norm": 1.937644124031067, + "learning_rate": 5e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6950949430465698, + "num_tokens": 108300851.0, + "step": 4187 + }, + { + "epoch": 0.4599165385460136, + "grad_norm": 1.909652590751648, + "learning_rate": 5e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6812556982040405, + "num_tokens": 108328131.0, + "step": 4188 + }, + { + "epoch": 0.46002635624862726, + "grad_norm": 1.8326374292373657, + "learning_rate": 5e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7027320861816406, + "num_tokens": 108356878.0, + "step": 4189 + }, + { + "epoch": 0.46013617395124096, + "grad_norm": 2.299260377883911, + "learning_rate": 5e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6951208710670471, + "num_tokens": 108378931.0, + "step": 4190 + }, + { + "epoch": 0.4602459916538546, + "grad_norm": 2.036160945892334, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7080050706863403, + "num_tokens": 108401380.0, + "step": 4191 + }, + { + "epoch": 0.46035580935646825, + "grad_norm": 1.7015613317489624, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7135800123214722, + "num_tokens": 108434633.0, + "step": 4192 + }, + { + "epoch": 0.46046562705908195, + "grad_norm": 1.8447113037109375, + "learning_rate": 5e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7135297060012817, + "num_tokens": 108460900.0, + "step": 4193 + }, + { + "epoch": 0.4605754447616956, + "grad_norm": 1.7103039026260376, + "learning_rate": 5e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.6965503096580505, + "num_tokens": 108491812.0, + "step": 4194 + }, + { + "epoch": 0.46068526246430924, + "grad_norm": 1.9814130067825317, + "learning_rate": 5e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7111557722091675, + "num_tokens": 108513594.0, + "step": 4195 + }, + { + "epoch": 0.4607950801669229, + "grad_norm": 1.8966758251190186, + "learning_rate": 5e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6945474147796631, + "num_tokens": 108540126.0, + "step": 4196 + }, + { + "epoch": 0.4609048978695366, + "grad_norm": 1.7075639963150024, + "learning_rate": 5e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6800713539123535, + "num_tokens": 108573514.0, + "step": 4197 + }, + { + "epoch": 0.46101471557215024, + "grad_norm": 1.7242130041122437, + "learning_rate": 5e-06, + "loss": 1.0795, + "mean_token_accuracy": 0.6789867877960205, + "num_tokens": 108610001.0, + "step": 4198 + }, + { + "epoch": 0.4611245332747639, + "grad_norm": 1.821714162826538, + "learning_rate": 5e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7108910083770752, + "num_tokens": 108639551.0, + "step": 4199 + }, + { + "epoch": 0.4612343509773775, + "grad_norm": 1.9370275735855103, + "learning_rate": 5e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7103209495544434, + "num_tokens": 108663088.0, + "step": 4200 + }, + { + "epoch": 0.46134416867999123, + "grad_norm": 1.7309956550598145, + "learning_rate": 5e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6954285502433777, + "num_tokens": 108694000.0, + "step": 4201 + }, + { + "epoch": 0.4614539863826049, + "grad_norm": 1.7864689826965332, + "learning_rate": 5e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6941876411437988, + "num_tokens": 108723283.0, + "step": 4202 + }, + { + "epoch": 0.4615638040852185, + "grad_norm": 2.0723302364349365, + "learning_rate": 5e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7127294540405273, + "num_tokens": 108744318.0, + "step": 4203 + }, + { + "epoch": 0.4616736217878322, + "grad_norm": 2.065215587615967, + "learning_rate": 5e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.706213116645813, + "num_tokens": 108766419.0, + "step": 4204 + }, + { + "epoch": 0.46178343949044587, + "grad_norm": 1.987654685974121, + "learning_rate": 5e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6879385709762573, + "num_tokens": 108793546.0, + "step": 4205 + }, + { + "epoch": 0.4618932571930595, + "grad_norm": 1.8237582445144653, + "learning_rate": 5e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7134805917739868, + "num_tokens": 108819062.0, + "step": 4206 + }, + { + "epoch": 0.46200307489567316, + "grad_norm": 1.7245638370513916, + "learning_rate": 5e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7030317783355713, + "num_tokens": 108850638.0, + "step": 4207 + }, + { + "epoch": 0.46211289259828686, + "grad_norm": 1.8701897859573364, + "learning_rate": 5e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.6746472120285034, + "num_tokens": 108878862.0, + "step": 4208 + }, + { + "epoch": 0.4622227103009005, + "grad_norm": 2.5283660888671875, + "learning_rate": 5e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7099621295928955, + "num_tokens": 108895488.0, + "step": 4209 + }, + { + "epoch": 0.46233252800351415, + "grad_norm": 2.2131264209747314, + "learning_rate": 5e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6960436105728149, + "num_tokens": 108915229.0, + "step": 4210 + }, + { + "epoch": 0.46244234570612786, + "grad_norm": 1.771954894065857, + "learning_rate": 5e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7011992931365967, + "num_tokens": 108946168.0, + "step": 4211 + }, + { + "epoch": 0.4625521634087415, + "grad_norm": 1.9411993026733398, + "learning_rate": 5e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6942603588104248, + "num_tokens": 108972377.0, + "step": 4212 + }, + { + "epoch": 0.46266198111135515, + "grad_norm": 1.8746229410171509, + "learning_rate": 5e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7135173082351685, + "num_tokens": 108997989.0, + "step": 4213 + }, + { + "epoch": 0.4627717988139688, + "grad_norm": 1.9161134958267212, + "learning_rate": 5e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7102072238922119, + "num_tokens": 109022806.0, + "step": 4214 + }, + { + "epoch": 0.4628816165165825, + "grad_norm": 1.9618644714355469, + "learning_rate": 5e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6993594169616699, + "num_tokens": 109046736.0, + "step": 4215 + }, + { + "epoch": 0.46299143421919614, + "grad_norm": 2.2974112033843994, + "learning_rate": 5e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.708804726600647, + "num_tokens": 109064299.0, + "step": 4216 + }, + { + "epoch": 0.4631012519218098, + "grad_norm": 1.6727241277694702, + "learning_rate": 5e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7206211090087891, + "num_tokens": 109095478.0, + "step": 4217 + }, + { + "epoch": 0.46321106962442343, + "grad_norm": 1.7552645206451416, + "learning_rate": 5e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6999082565307617, + "num_tokens": 109126682.0, + "step": 4218 + }, + { + "epoch": 0.46332088732703713, + "grad_norm": 1.7984447479248047, + "learning_rate": 5e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6992689371109009, + "num_tokens": 109154477.0, + "step": 4219 + }, + { + "epoch": 0.4634307050296508, + "grad_norm": 1.7710602283477783, + "learning_rate": 5e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7243620157241821, + "num_tokens": 109183485.0, + "step": 4220 + }, + { + "epoch": 0.4635405227322644, + "grad_norm": 1.8200335502624512, + "learning_rate": 5e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6951457262039185, + "num_tokens": 109213673.0, + "step": 4221 + }, + { + "epoch": 0.4636503404348781, + "grad_norm": 2.128474473953247, + "learning_rate": 5e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6945582628250122, + "num_tokens": 109236571.0, + "step": 4222 + }, + { + "epoch": 0.4637601581374918, + "grad_norm": 1.9506049156188965, + "learning_rate": 5e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6900557279586792, + "num_tokens": 109262681.0, + "step": 4223 + }, + { + "epoch": 0.4638699758401054, + "grad_norm": 1.9263997077941895, + "learning_rate": 5e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.700358510017395, + "num_tokens": 109287222.0, + "step": 4224 + }, + { + "epoch": 0.46397979354271907, + "grad_norm": 2.2246904373168945, + "learning_rate": 5e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6951746940612793, + "num_tokens": 109310530.0, + "step": 4225 + }, + { + "epoch": 0.46408961124533277, + "grad_norm": 1.8326393365859985, + "learning_rate": 5e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7130266427993774, + "num_tokens": 109337027.0, + "step": 4226 + }, + { + "epoch": 0.4641994289479464, + "grad_norm": 2.1485483646392822, + "learning_rate": 5e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6978685259819031, + "num_tokens": 109358458.0, + "step": 4227 + }, + { + "epoch": 0.46430924665056006, + "grad_norm": 1.9826836585998535, + "learning_rate": 5e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6937589645385742, + "num_tokens": 109382847.0, + "step": 4228 + }, + { + "epoch": 0.4644190643531737, + "grad_norm": 1.9475212097167969, + "learning_rate": 5e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7088833451271057, + "num_tokens": 109407165.0, + "step": 4229 + }, + { + "epoch": 0.4645288820557874, + "grad_norm": 1.886530876159668, + "learning_rate": 5e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7012887001037598, + "num_tokens": 109432650.0, + "step": 4230 + }, + { + "epoch": 0.46463869975840105, + "grad_norm": 2.1733145713806152, + "learning_rate": 5e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7241613268852234, + "num_tokens": 109452973.0, + "step": 4231 + }, + { + "epoch": 0.4647485174610147, + "grad_norm": 2.1455888748168945, + "learning_rate": 5e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7190683484077454, + "num_tokens": 109473641.0, + "step": 4232 + }, + { + "epoch": 0.4648583351636284, + "grad_norm": 1.8409477472305298, + "learning_rate": 5e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6869582533836365, + "num_tokens": 109501746.0, + "step": 4233 + }, + { + "epoch": 0.46496815286624205, + "grad_norm": 2.101433038711548, + "learning_rate": 5e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7083594799041748, + "num_tokens": 109524478.0, + "step": 4234 + }, + { + "epoch": 0.4650779705688557, + "grad_norm": 1.989664912223816, + "learning_rate": 5e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7026594877243042, + "num_tokens": 109549605.0, + "step": 4235 + }, + { + "epoch": 0.46518778827146934, + "grad_norm": 1.6983003616333008, + "learning_rate": 5e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6910576820373535, + "num_tokens": 109585707.0, + "step": 4236 + }, + { + "epoch": 0.46529760597408304, + "grad_norm": 1.7061628103256226, + "learning_rate": 5e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7001743912696838, + "num_tokens": 109617725.0, + "step": 4237 + }, + { + "epoch": 0.4654074236766967, + "grad_norm": 1.7894980907440186, + "learning_rate": 5e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.6998885273933411, + "num_tokens": 109646593.0, + "step": 4238 + }, + { + "epoch": 0.46551724137931033, + "grad_norm": 2.1542441844940186, + "learning_rate": 5e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7233424186706543, + "num_tokens": 109666778.0, + "step": 4239 + }, + { + "epoch": 0.46562705908192403, + "grad_norm": 2.0098471641540527, + "learning_rate": 5e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7016961574554443, + "num_tokens": 109693157.0, + "step": 4240 + }, + { + "epoch": 0.4657368767845377, + "grad_norm": 2.028101682662964, + "learning_rate": 5e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7147974967956543, + "num_tokens": 109715311.0, + "step": 4241 + }, + { + "epoch": 0.4658466944871513, + "grad_norm": 2.0949673652648926, + "learning_rate": 5e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6961054801940918, + "num_tokens": 109736551.0, + "step": 4242 + }, + { + "epoch": 0.46595651218976497, + "grad_norm": 1.8430417776107788, + "learning_rate": 5e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7001414895057678, + "num_tokens": 109761438.0, + "step": 4243 + }, + { + "epoch": 0.4660663298923787, + "grad_norm": 2.0380449295043945, + "learning_rate": 5e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7171134352684021, + "num_tokens": 109783958.0, + "step": 4244 + }, + { + "epoch": 0.4661761475949923, + "grad_norm": 1.8929951190948486, + "learning_rate": 5e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6924397945404053, + "num_tokens": 109810221.0, + "step": 4245 + }, + { + "epoch": 0.46628596529760596, + "grad_norm": 2.0197479724884033, + "learning_rate": 5e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.702808141708374, + "num_tokens": 109833296.0, + "step": 4246 + }, + { + "epoch": 0.4663957830002196, + "grad_norm": 1.8121978044509888, + "learning_rate": 5e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7026993036270142, + "num_tokens": 109860093.0, + "step": 4247 + }, + { + "epoch": 0.4665056007028333, + "grad_norm": 1.8258216381072998, + "learning_rate": 5e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6937072277069092, + "num_tokens": 109890339.0, + "step": 4248 + }, + { + "epoch": 0.46661541840544696, + "grad_norm": 2.114546775817871, + "learning_rate": 5e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7188855409622192, + "num_tokens": 109912298.0, + "step": 4249 + }, + { + "epoch": 0.4667252361080606, + "grad_norm": 1.9194411039352417, + "learning_rate": 5e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6965556144714355, + "num_tokens": 109937947.0, + "step": 4250 + }, + { + "epoch": 0.4668350538106743, + "grad_norm": 1.8891116380691528, + "learning_rate": 5e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6973608136177063, + "num_tokens": 109964249.0, + "step": 4251 + }, + { + "epoch": 0.46694487151328795, + "grad_norm": 1.7714539766311646, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7272727489471436, + "num_tokens": 109989554.0, + "step": 4252 + }, + { + "epoch": 0.4670546892159016, + "grad_norm": 1.9010350704193115, + "learning_rate": 5e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6896121501922607, + "num_tokens": 110016061.0, + "step": 4253 + }, + { + "epoch": 0.46716450691851524, + "grad_norm": 1.9296448230743408, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7183120250701904, + "num_tokens": 110040427.0, + "step": 4254 + }, + { + "epoch": 0.46727432462112894, + "grad_norm": 1.6567977666854858, + "learning_rate": 5e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7035930156707764, + "num_tokens": 110072887.0, + "step": 4255 + }, + { + "epoch": 0.4673841423237426, + "grad_norm": 1.8838459253311157, + "learning_rate": 5e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7099149823188782, + "num_tokens": 110099662.0, + "step": 4256 + }, + { + "epoch": 0.46749396002635624, + "grad_norm": 2.034308671951294, + "learning_rate": 5e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.704028844833374, + "num_tokens": 110122331.0, + "step": 4257 + }, + { + "epoch": 0.4676037777289699, + "grad_norm": 1.9739289283752441, + "learning_rate": 5e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.697760820388794, + "num_tokens": 110147487.0, + "step": 4258 + }, + { + "epoch": 0.4677135954315836, + "grad_norm": 1.985849380493164, + "learning_rate": 5e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7099435329437256, + "num_tokens": 110170426.0, + "step": 4259 + }, + { + "epoch": 0.46782341313419723, + "grad_norm": 1.8041881322860718, + "learning_rate": 5e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6849257946014404, + "num_tokens": 110198371.0, + "step": 4260 + }, + { + "epoch": 0.4679332308368109, + "grad_norm": 1.8347254991531372, + "learning_rate": 5e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6888154745101929, + "num_tokens": 110226544.0, + "step": 4261 + }, + { + "epoch": 0.4680430485394246, + "grad_norm": 1.9308258295059204, + "learning_rate": 5e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7113564014434814, + "num_tokens": 110253915.0, + "step": 4262 + }, + { + "epoch": 0.4681528662420382, + "grad_norm": 1.8900724649429321, + "learning_rate": 5e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7162147164344788, + "num_tokens": 110280564.0, + "step": 4263 + }, + { + "epoch": 0.46826268394465187, + "grad_norm": 2.033116102218628, + "learning_rate": 5e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7067301273345947, + "num_tokens": 110302434.0, + "step": 4264 + }, + { + "epoch": 0.4683725016472655, + "grad_norm": 1.8756763935089111, + "learning_rate": 5e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7005443572998047, + "num_tokens": 110329984.0, + "step": 4265 + }, + { + "epoch": 0.4684823193498792, + "grad_norm": 1.6922115087509155, + "learning_rate": 5e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7061300873756409, + "num_tokens": 110361309.0, + "step": 4266 + }, + { + "epoch": 0.46859213705249286, + "grad_norm": 1.8446818590164185, + "learning_rate": 5e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6877005100250244, + "num_tokens": 110389857.0, + "step": 4267 + }, + { + "epoch": 0.4687019547551065, + "grad_norm": 1.9977145195007324, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7167419195175171, + "num_tokens": 110411252.0, + "step": 4268 + }, + { + "epoch": 0.4688117724577202, + "grad_norm": 1.9062743186950684, + "learning_rate": 5e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6845123171806335, + "num_tokens": 110439318.0, + "step": 4269 + }, + { + "epoch": 0.46892159016033386, + "grad_norm": 2.081181049346924, + "learning_rate": 5e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7070083618164062, + "num_tokens": 110460323.0, + "step": 4270 + }, + { + "epoch": 0.4690314078629475, + "grad_norm": 1.8489700555801392, + "learning_rate": 5e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6963653564453125, + "num_tokens": 110487815.0, + "step": 4271 + }, + { + "epoch": 0.46914122556556115, + "grad_norm": 1.9984875917434692, + "learning_rate": 5e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7090939283370972, + "num_tokens": 110511759.0, + "step": 4272 + }, + { + "epoch": 0.46925104326817485, + "grad_norm": 1.800225019454956, + "learning_rate": 5e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7141395211219788, + "num_tokens": 110541531.0, + "step": 4273 + }, + { + "epoch": 0.4693608609707885, + "grad_norm": 2.1492459774017334, + "learning_rate": 5e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7228125929832458, + "num_tokens": 110562660.0, + "step": 4274 + }, + { + "epoch": 0.46947067867340214, + "grad_norm": 1.676888108253479, + "learning_rate": 5e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6903361082077026, + "num_tokens": 110598036.0, + "step": 4275 + }, + { + "epoch": 0.4695804963760158, + "grad_norm": 2.0135018825531006, + "learning_rate": 5e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7057482004165649, + "num_tokens": 110619699.0, + "step": 4276 + }, + { + "epoch": 0.4696903140786295, + "grad_norm": 2.1912341117858887, + "learning_rate": 5e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7148569822311401, + "num_tokens": 110639875.0, + "step": 4277 + }, + { + "epoch": 0.46980013178124314, + "grad_norm": 1.740411639213562, + "learning_rate": 5e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7277691960334778, + "num_tokens": 110669785.0, + "step": 4278 + }, + { + "epoch": 0.4699099494838568, + "grad_norm": 1.9769566059112549, + "learning_rate": 5e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6718361377716064, + "num_tokens": 110695286.0, + "step": 4279 + }, + { + "epoch": 0.4700197671864705, + "grad_norm": 1.9443306922912598, + "learning_rate": 5e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7056760787963867, + "num_tokens": 110720628.0, + "step": 4280 + }, + { + "epoch": 0.47012958488908413, + "grad_norm": 2.0192794799804688, + "learning_rate": 5e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7032828330993652, + "num_tokens": 110744474.0, + "step": 4281 + }, + { + "epoch": 0.4702394025916978, + "grad_norm": 2.002175807952881, + "learning_rate": 5e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6876834630966187, + "num_tokens": 110769554.0, + "step": 4282 + }, + { + "epoch": 0.4703492202943114, + "grad_norm": 1.8079988956451416, + "learning_rate": 5e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7049858570098877, + "num_tokens": 110795841.0, + "step": 4283 + }, + { + "epoch": 0.4704590379969251, + "grad_norm": 1.9338335990905762, + "learning_rate": 5e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6971083879470825, + "num_tokens": 110821312.0, + "step": 4284 + }, + { + "epoch": 0.47056885569953877, + "grad_norm": 1.8768616914749146, + "learning_rate": 5e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6967674493789673, + "num_tokens": 110851718.0, + "step": 4285 + }, + { + "epoch": 0.4706786734021524, + "grad_norm": 2.0681192874908447, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7343955039978027, + "num_tokens": 110873968.0, + "step": 4286 + }, + { + "epoch": 0.4707884911047661, + "grad_norm": 1.883378505706787, + "learning_rate": 5e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6935890913009644, + "num_tokens": 110903857.0, + "step": 4287 + }, + { + "epoch": 0.47089830880737976, + "grad_norm": 1.9386156797409058, + "learning_rate": 5e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7067021131515503, + "num_tokens": 110927626.0, + "step": 4288 + }, + { + "epoch": 0.4710081265099934, + "grad_norm": 1.8208156824111938, + "learning_rate": 5e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6964747905731201, + "num_tokens": 110957235.0, + "step": 4289 + }, + { + "epoch": 0.47111794421260705, + "grad_norm": 1.9006239175796509, + "learning_rate": 5e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7090030312538147, + "num_tokens": 110982552.0, + "step": 4290 + }, + { + "epoch": 0.47122776191522076, + "grad_norm": 2.0300397872924805, + "learning_rate": 5e-06, + "loss": 0.909, + "mean_token_accuracy": 0.716144323348999, + "num_tokens": 111004218.0, + "step": 4291 + }, + { + "epoch": 0.4713375796178344, + "grad_norm": 1.6965652704238892, + "learning_rate": 5e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6837010383605957, + "num_tokens": 111035273.0, + "step": 4292 + }, + { + "epoch": 0.47144739732044805, + "grad_norm": 1.7692676782608032, + "learning_rate": 5e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7090290784835815, + "num_tokens": 111064856.0, + "step": 4293 + }, + { + "epoch": 0.4715572150230617, + "grad_norm": 1.700690507888794, + "learning_rate": 5e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7224594950675964, + "num_tokens": 111094355.0, + "step": 4294 + }, + { + "epoch": 0.4716670327256754, + "grad_norm": 2.015376567840576, + "learning_rate": 5e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7084441184997559, + "num_tokens": 111118672.0, + "step": 4295 + }, + { + "epoch": 0.47177685042828904, + "grad_norm": 2.202019453048706, + "learning_rate": 5e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7159570455551147, + "num_tokens": 111139056.0, + "step": 4296 + }, + { + "epoch": 0.4718866681309027, + "grad_norm": 1.8805339336395264, + "learning_rate": 5e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7009482383728027, + "num_tokens": 111164246.0, + "step": 4297 + }, + { + "epoch": 0.4719964858335164, + "grad_norm": 1.8990806341171265, + "learning_rate": 5e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7155818343162537, + "num_tokens": 111189400.0, + "step": 4298 + }, + { + "epoch": 0.47210630353613003, + "grad_norm": 1.7680017948150635, + "learning_rate": 5e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6814412474632263, + "num_tokens": 111224293.0, + "step": 4299 + }, + { + "epoch": 0.4722161212387437, + "grad_norm": 1.7264810800552368, + "learning_rate": 5e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6891785264015198, + "num_tokens": 111255772.0, + "step": 4300 + }, + { + "epoch": 0.4723259389413573, + "grad_norm": 1.9530339241027832, + "learning_rate": 5e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6845874786376953, + "num_tokens": 111283205.0, + "step": 4301 + }, + { + "epoch": 0.472435756643971, + "grad_norm": 1.7480465173721313, + "learning_rate": 5e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6804342269897461, + "num_tokens": 111318346.0, + "step": 4302 + }, + { + "epoch": 0.4725455743465847, + "grad_norm": 1.93483567237854, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7122443318367004, + "num_tokens": 111343897.0, + "step": 4303 + }, + { + "epoch": 0.4726553920491983, + "grad_norm": 1.8327418565750122, + "learning_rate": 5e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6795938014984131, + "num_tokens": 111371879.0, + "step": 4304 + }, + { + "epoch": 0.47276520975181197, + "grad_norm": 1.866716742515564, + "learning_rate": 5e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6782274842262268, + "num_tokens": 111399029.0, + "step": 4305 + }, + { + "epoch": 0.47287502745442567, + "grad_norm": 1.9457591772079468, + "learning_rate": 5e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6891300678253174, + "num_tokens": 111424232.0, + "step": 4306 + }, + { + "epoch": 0.4729848451570393, + "grad_norm": 2.0505340099334717, + "learning_rate": 5e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7126002907752991, + "num_tokens": 111446599.0, + "step": 4307 + }, + { + "epoch": 0.47309466285965296, + "grad_norm": 1.835980772972107, + "learning_rate": 5e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7192684412002563, + "num_tokens": 111472219.0, + "step": 4308 + }, + { + "epoch": 0.47320448056226666, + "grad_norm": 2.20609712600708, + "learning_rate": 5e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7351879477500916, + "num_tokens": 111492365.0, + "step": 4309 + }, + { + "epoch": 0.4733142982648803, + "grad_norm": 1.8986046314239502, + "learning_rate": 5e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.703107476234436, + "num_tokens": 111519380.0, + "step": 4310 + }, + { + "epoch": 0.47342411596749395, + "grad_norm": 1.7721203565597534, + "learning_rate": 5e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6918030977249146, + "num_tokens": 111550731.0, + "step": 4311 + }, + { + "epoch": 0.4735339336701076, + "grad_norm": 2.187778949737549, + "learning_rate": 5e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7152976989746094, + "num_tokens": 111572863.0, + "step": 4312 + }, + { + "epoch": 0.4736437513727213, + "grad_norm": 2.211820125579834, + "learning_rate": 5e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7180145382881165, + "num_tokens": 111593482.0, + "step": 4313 + }, + { + "epoch": 0.47375356907533495, + "grad_norm": 2.038761615753174, + "learning_rate": 5e-06, + "loss": 1.078, + "mean_token_accuracy": 0.6754952669143677, + "num_tokens": 111618848.0, + "step": 4314 + }, + { + "epoch": 0.4738633867779486, + "grad_norm": 1.805034875869751, + "learning_rate": 5e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7080124616622925, + "num_tokens": 111646917.0, + "step": 4315 + }, + { + "epoch": 0.4739732044805623, + "grad_norm": 1.8417747020721436, + "learning_rate": 5e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6930362582206726, + "num_tokens": 111675267.0, + "step": 4316 + }, + { + "epoch": 0.47408302218317594, + "grad_norm": 1.840399146080017, + "learning_rate": 5e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6930789947509766, + "num_tokens": 111703973.0, + "step": 4317 + }, + { + "epoch": 0.4741928398857896, + "grad_norm": 1.8329286575317383, + "learning_rate": 5e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7005196809768677, + "num_tokens": 111731353.0, + "step": 4318 + }, + { + "epoch": 0.47430265758840323, + "grad_norm": 1.8073983192443848, + "learning_rate": 5e-06, + "loss": 1.0731, + "mean_token_accuracy": 0.680543839931488, + "num_tokens": 111761652.0, + "step": 4319 + }, + { + "epoch": 0.47441247529101693, + "grad_norm": 2.0334272384643555, + "learning_rate": 5e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7058612108230591, + "num_tokens": 111784069.0, + "step": 4320 + }, + { + "epoch": 0.4745222929936306, + "grad_norm": 2.1438822746276855, + "learning_rate": 5e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7201905250549316, + "num_tokens": 111805748.0, + "step": 4321 + }, + { + "epoch": 0.4746321106962442, + "grad_norm": 1.6772561073303223, + "learning_rate": 5e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7316180467605591, + "num_tokens": 111833379.0, + "step": 4322 + }, + { + "epoch": 0.47474192839885787, + "grad_norm": 1.9127607345581055, + "learning_rate": 5e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7252973318099976, + "num_tokens": 111859499.0, + "step": 4323 + }, + { + "epoch": 0.4748517461014716, + "grad_norm": 1.834344744682312, + "learning_rate": 5e-06, + "loss": 0.968, + "mean_token_accuracy": 0.6966577768325806, + "num_tokens": 111885613.0, + "step": 4324 + }, + { + "epoch": 0.4749615638040852, + "grad_norm": 1.749263882637024, + "learning_rate": 5e-06, + "loss": 1.1053, + "mean_token_accuracy": 0.6735604405403137, + "num_tokens": 111919802.0, + "step": 4325 + }, + { + "epoch": 0.47507138150669886, + "grad_norm": 1.9548375606536865, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7163481712341309, + "num_tokens": 111943501.0, + "step": 4326 + }, + { + "epoch": 0.47518119920931257, + "grad_norm": 1.8252288103103638, + "learning_rate": 5e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6923848986625671, + "num_tokens": 111970185.0, + "step": 4327 + }, + { + "epoch": 0.4752910169119262, + "grad_norm": 2.0767710208892822, + "learning_rate": 5e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7196465730667114, + "num_tokens": 111990928.0, + "step": 4328 + }, + { + "epoch": 0.47540083461453986, + "grad_norm": 2.17433762550354, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7012646794319153, + "num_tokens": 112010536.0, + "step": 4329 + }, + { + "epoch": 0.4755106523171535, + "grad_norm": 1.8996827602386475, + "learning_rate": 5e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6861131191253662, + "num_tokens": 112040059.0, + "step": 4330 + }, + { + "epoch": 0.4756204700197672, + "grad_norm": 2.0018906593322754, + "learning_rate": 5e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7249583005905151, + "num_tokens": 112060812.0, + "step": 4331 + }, + { + "epoch": 0.47573028772238085, + "grad_norm": 1.8511154651641846, + "learning_rate": 5e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6966047286987305, + "num_tokens": 112086091.0, + "step": 4332 + }, + { + "epoch": 0.4758401054249945, + "grad_norm": 1.895941138267517, + "learning_rate": 5e-06, + "loss": 1.014, + "mean_token_accuracy": 0.695345401763916, + "num_tokens": 112112061.0, + "step": 4333 + }, + { + "epoch": 0.47594992312760814, + "grad_norm": 2.0886826515197754, + "learning_rate": 5e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7070357799530029, + "num_tokens": 112134107.0, + "step": 4334 + }, + { + "epoch": 0.47605974083022184, + "grad_norm": 1.8438963890075684, + "learning_rate": 5e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.68851238489151, + "num_tokens": 112162319.0, + "step": 4335 + }, + { + "epoch": 0.4761695585328355, + "grad_norm": 1.9615896940231323, + "learning_rate": 5e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6926538944244385, + "num_tokens": 112186870.0, + "step": 4336 + }, + { + "epoch": 0.47627937623544914, + "grad_norm": 1.8962080478668213, + "learning_rate": 5e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.706631064414978, + "num_tokens": 112213932.0, + "step": 4337 + }, + { + "epoch": 0.47638919393806284, + "grad_norm": 2.3061041831970215, + "learning_rate": 5e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7121025323867798, + "num_tokens": 112232075.0, + "step": 4338 + }, + { + "epoch": 0.4764990116406765, + "grad_norm": 1.7488927841186523, + "learning_rate": 5e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7040696740150452, + "num_tokens": 112261545.0, + "step": 4339 + }, + { + "epoch": 0.47660882934329013, + "grad_norm": 1.9835044145584106, + "learning_rate": 5e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7018074989318848, + "num_tokens": 112284798.0, + "step": 4340 + }, + { + "epoch": 0.4767186470459038, + "grad_norm": 1.7402875423431396, + "learning_rate": 5e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.701696515083313, + "num_tokens": 112316990.0, + "step": 4341 + }, + { + "epoch": 0.4768284647485175, + "grad_norm": 1.7345857620239258, + "learning_rate": 5e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7094576358795166, + "num_tokens": 112347319.0, + "step": 4342 + }, + { + "epoch": 0.4769382824511311, + "grad_norm": 1.958680510520935, + "learning_rate": 5e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7070909738540649, + "num_tokens": 112371048.0, + "step": 4343 + }, + { + "epoch": 0.47704810015374477, + "grad_norm": 1.9847463369369507, + "learning_rate": 5e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7219889163970947, + "num_tokens": 112394058.0, + "step": 4344 + }, + { + "epoch": 0.47715791785635847, + "grad_norm": 1.7705769538879395, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6950500011444092, + "num_tokens": 112423434.0, + "step": 4345 + }, + { + "epoch": 0.4772677355589721, + "grad_norm": 1.943137288093567, + "learning_rate": 5e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6936678886413574, + "num_tokens": 112447071.0, + "step": 4346 + }, + { + "epoch": 0.47737755326158576, + "grad_norm": 1.7705134153366089, + "learning_rate": 5e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6932461261749268, + "num_tokens": 112477767.0, + "step": 4347 + }, + { + "epoch": 0.4774873709641994, + "grad_norm": 1.8844090700149536, + "learning_rate": 5e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7049754858016968, + "num_tokens": 112503901.0, + "step": 4348 + }, + { + "epoch": 0.4775971886668131, + "grad_norm": 1.7767695188522339, + "learning_rate": 5e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6838694214820862, + "num_tokens": 112533348.0, + "step": 4349 + }, + { + "epoch": 0.47770700636942676, + "grad_norm": 1.9032551050186157, + "learning_rate": 5e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7082778215408325, + "num_tokens": 112559846.0, + "step": 4350 + }, + { + "epoch": 0.4778168240720404, + "grad_norm": 1.7028796672821045, + "learning_rate": 5e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6981586813926697, + "num_tokens": 112591316.0, + "step": 4351 + }, + { + "epoch": 0.47792664177465405, + "grad_norm": 1.920789122581482, + "learning_rate": 5e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7002285718917847, + "num_tokens": 112616585.0, + "step": 4352 + }, + { + "epoch": 0.47803645947726775, + "grad_norm": 1.8871053457260132, + "learning_rate": 5e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7166868448257446, + "num_tokens": 112642091.0, + "step": 4353 + }, + { + "epoch": 0.4781462771798814, + "grad_norm": 2.0354573726654053, + "learning_rate": 5e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7120222449302673, + "num_tokens": 112664466.0, + "step": 4354 + }, + { + "epoch": 0.47825609488249504, + "grad_norm": 2.025073289871216, + "learning_rate": 5e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6896097660064697, + "num_tokens": 112687187.0, + "step": 4355 + }, + { + "epoch": 0.47836591258510874, + "grad_norm": 1.8873076438903809, + "learning_rate": 5e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6867640018463135, + "num_tokens": 112715473.0, + "step": 4356 + }, + { + "epoch": 0.4784757302877224, + "grad_norm": 1.999215841293335, + "learning_rate": 5e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6887658834457397, + "num_tokens": 112740874.0, + "step": 4357 + }, + { + "epoch": 0.47858554799033604, + "grad_norm": 1.976894497871399, + "learning_rate": 5e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7078191041946411, + "num_tokens": 112762963.0, + "step": 4358 + }, + { + "epoch": 0.4786953656929497, + "grad_norm": 1.9866782426834106, + "learning_rate": 5e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7033776044845581, + "num_tokens": 112785618.0, + "step": 4359 + }, + { + "epoch": 0.4788051833955634, + "grad_norm": 1.999705195426941, + "learning_rate": 5e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7126898169517517, + "num_tokens": 112809745.0, + "step": 4360 + }, + { + "epoch": 0.47891500109817703, + "grad_norm": 1.7145369052886963, + "learning_rate": 5e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6941183805465698, + "num_tokens": 112841861.0, + "step": 4361 + }, + { + "epoch": 0.4790248188007907, + "grad_norm": 1.9125831127166748, + "learning_rate": 5e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7152220010757446, + "num_tokens": 112864672.0, + "step": 4362 + }, + { + "epoch": 0.4791346365034044, + "grad_norm": 1.7805938720703125, + "learning_rate": 5e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7194070816040039, + "num_tokens": 112891830.0, + "step": 4363 + }, + { + "epoch": 0.479244454206018, + "grad_norm": 2.001988410949707, + "learning_rate": 5e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7212037444114685, + "num_tokens": 112915123.0, + "step": 4364 + }, + { + "epoch": 0.47935427190863167, + "grad_norm": 1.7481634616851807, + "learning_rate": 5e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6961137652397156, + "num_tokens": 112945804.0, + "step": 4365 + }, + { + "epoch": 0.4794640896112453, + "grad_norm": 2.0129518508911133, + "learning_rate": 5e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7175184488296509, + "num_tokens": 112970054.0, + "step": 4366 + }, + { + "epoch": 0.479573907313859, + "grad_norm": 2.2077078819274902, + "learning_rate": 5e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6954584717750549, + "num_tokens": 112992874.0, + "step": 4367 + }, + { + "epoch": 0.47968372501647266, + "grad_norm": 2.3522748947143555, + "learning_rate": 5e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7138533592224121, + "num_tokens": 113009798.0, + "step": 4368 + }, + { + "epoch": 0.4797935427190863, + "grad_norm": 1.9125676155090332, + "learning_rate": 5e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7001500129699707, + "num_tokens": 113033933.0, + "step": 4369 + }, + { + "epoch": 0.47990336042169995, + "grad_norm": 1.8661845922470093, + "learning_rate": 5e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6795369386672974, + "num_tokens": 113061436.0, + "step": 4370 + }, + { + "epoch": 0.48001317812431366, + "grad_norm": 1.987860918045044, + "learning_rate": 5e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6959565877914429, + "num_tokens": 113086809.0, + "step": 4371 + }, + { + "epoch": 0.4801229958269273, + "grad_norm": 2.2826809883117676, + "learning_rate": 5e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7190941572189331, + "num_tokens": 113105486.0, + "step": 4372 + }, + { + "epoch": 0.48023281352954095, + "grad_norm": 1.8694132566452026, + "learning_rate": 5e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6939363479614258, + "num_tokens": 113135697.0, + "step": 4373 + }, + { + "epoch": 0.48034263123215465, + "grad_norm": 1.9296754598617554, + "learning_rate": 5e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6904315948486328, + "num_tokens": 113161331.0, + "step": 4374 + }, + { + "epoch": 0.4804524489347683, + "grad_norm": 1.9269747734069824, + "learning_rate": 5e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7118192911148071, + "num_tokens": 113185638.0, + "step": 4375 + }, + { + "epoch": 0.48056226663738194, + "grad_norm": 2.1660802364349365, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7236368656158447, + "num_tokens": 113208422.0, + "step": 4376 + }, + { + "epoch": 0.4806720843399956, + "grad_norm": 2.297717332839966, + "learning_rate": 5e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7101138234138489, + "num_tokens": 113226522.0, + "step": 4377 + }, + { + "epoch": 0.4807819020426093, + "grad_norm": 1.9622770547866821, + "learning_rate": 5e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7016853094100952, + "num_tokens": 113248969.0, + "step": 4378 + }, + { + "epoch": 0.48089171974522293, + "grad_norm": 1.9034596681594849, + "learning_rate": 5e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7150832414627075, + "num_tokens": 113272182.0, + "step": 4379 + }, + { + "epoch": 0.4810015374478366, + "grad_norm": 1.9816762208938599, + "learning_rate": 5e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6904336810112, + "num_tokens": 113295054.0, + "step": 4380 + }, + { + "epoch": 0.4811113551504502, + "grad_norm": 2.020768404006958, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7130686640739441, + "num_tokens": 113315026.0, + "step": 4381 + }, + { + "epoch": 0.4812211728530639, + "grad_norm": 1.7891381978988647, + "learning_rate": 5e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6817824840545654, + "num_tokens": 113343804.0, + "step": 4382 + }, + { + "epoch": 0.4813309905556776, + "grad_norm": 1.7488030195236206, + "learning_rate": 5e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7043423652648926, + "num_tokens": 113375631.0, + "step": 4383 + }, + { + "epoch": 0.4814408082582912, + "grad_norm": 1.6515413522720337, + "learning_rate": 5e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7131283283233643, + "num_tokens": 113410244.0, + "step": 4384 + }, + { + "epoch": 0.4815506259609049, + "grad_norm": 2.019460678100586, + "learning_rate": 5e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6856681108474731, + "num_tokens": 113434721.0, + "step": 4385 + }, + { + "epoch": 0.48166044366351857, + "grad_norm": 2.0862715244293213, + "learning_rate": 5e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.712059497833252, + "num_tokens": 113455878.0, + "step": 4386 + }, + { + "epoch": 0.4817702613661322, + "grad_norm": 1.815806269645691, + "learning_rate": 5e-06, + "loss": 1.0789, + "mean_token_accuracy": 0.6784214377403259, + "num_tokens": 113486482.0, + "step": 4387 + }, + { + "epoch": 0.48188007906874586, + "grad_norm": 2.328752279281616, + "learning_rate": 5e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.6983717083930969, + "num_tokens": 113505032.0, + "step": 4388 + }, + { + "epoch": 0.48198989677135956, + "grad_norm": 1.9229485988616943, + "learning_rate": 5e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7034276127815247, + "num_tokens": 113529600.0, + "step": 4389 + }, + { + "epoch": 0.4820997144739732, + "grad_norm": 1.9428822994232178, + "learning_rate": 5e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.708200216293335, + "num_tokens": 113553821.0, + "step": 4390 + }, + { + "epoch": 0.48220953217658685, + "grad_norm": 2.1077165603637695, + "learning_rate": 5e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6950507164001465, + "num_tokens": 113574413.0, + "step": 4391 + }, + { + "epoch": 0.48231934987920055, + "grad_norm": 1.9915872812271118, + "learning_rate": 5e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.676384687423706, + "num_tokens": 113600466.0, + "step": 4392 + }, + { + "epoch": 0.4824291675818142, + "grad_norm": 1.957463026046753, + "learning_rate": 5e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7306463718414307, + "num_tokens": 113626011.0, + "step": 4393 + }, + { + "epoch": 0.48253898528442785, + "grad_norm": 1.6693922281265259, + "learning_rate": 5e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7123169898986816, + "num_tokens": 113657780.0, + "step": 4394 + }, + { + "epoch": 0.4826488029870415, + "grad_norm": 1.7549148797988892, + "learning_rate": 5e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7093859910964966, + "num_tokens": 113685328.0, + "step": 4395 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 2.0725841522216797, + "learning_rate": 5e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6991356015205383, + "num_tokens": 113708078.0, + "step": 4396 + }, + { + "epoch": 0.48286843839226884, + "grad_norm": 1.6239655017852783, + "learning_rate": 5e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7158893346786499, + "num_tokens": 113741068.0, + "step": 4397 + }, + { + "epoch": 0.4829782560948825, + "grad_norm": 2.141406297683716, + "learning_rate": 5e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7042388916015625, + "num_tokens": 113762074.0, + "step": 4398 + }, + { + "epoch": 0.48308807379749613, + "grad_norm": 1.952521800994873, + "learning_rate": 5e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7039152383804321, + "num_tokens": 113786594.0, + "step": 4399 + }, + { + "epoch": 0.48319789150010983, + "grad_norm": 2.211527109146118, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7199247479438782, + "num_tokens": 113805215.0, + "step": 4400 + }, + { + "epoch": 0.4833077092027235, + "grad_norm": 1.7734029293060303, + "learning_rate": 5e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6864495277404785, + "num_tokens": 113835631.0, + "step": 4401 + }, + { + "epoch": 0.4834175269053371, + "grad_norm": 1.9068571329116821, + "learning_rate": 5e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6750662326812744, + "num_tokens": 113865186.0, + "step": 4402 + }, + { + "epoch": 0.4835273446079508, + "grad_norm": 1.8548346757888794, + "learning_rate": 5e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6933416128158569, + "num_tokens": 113893169.0, + "step": 4403 + }, + { + "epoch": 0.4836371623105645, + "grad_norm": 1.9147800207138062, + "learning_rate": 5e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7163476943969727, + "num_tokens": 113919044.0, + "step": 4404 + }, + { + "epoch": 0.4837469800131781, + "grad_norm": 2.0387537479400635, + "learning_rate": 5e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7018592953681946, + "num_tokens": 113942413.0, + "step": 4405 + }, + { + "epoch": 0.48385679771579176, + "grad_norm": 1.8225853443145752, + "learning_rate": 5e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.685121476650238, + "num_tokens": 113970586.0, + "step": 4406 + }, + { + "epoch": 0.48396661541840547, + "grad_norm": 1.8814144134521484, + "learning_rate": 5e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6916183233261108, + "num_tokens": 113997581.0, + "step": 4407 + }, + { + "epoch": 0.4840764331210191, + "grad_norm": 1.7783979177474976, + "learning_rate": 5e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.69983971118927, + "num_tokens": 114025485.0, + "step": 4408 + }, + { + "epoch": 0.48418625082363276, + "grad_norm": 1.8282458782196045, + "learning_rate": 5e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7191638946533203, + "num_tokens": 114051471.0, + "step": 4409 + }, + { + "epoch": 0.4842960685262464, + "grad_norm": 1.9650967121124268, + "learning_rate": 5e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.7048031091690063, + "num_tokens": 114079713.0, + "step": 4410 + }, + { + "epoch": 0.4844058862288601, + "grad_norm": 1.9164637327194214, + "learning_rate": 5e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7027988433837891, + "num_tokens": 114105097.0, + "step": 4411 + }, + { + "epoch": 0.48451570393147375, + "grad_norm": 2.091259717941284, + "learning_rate": 5e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6922480463981628, + "num_tokens": 114129367.0, + "step": 4412 + }, + { + "epoch": 0.4846255216340874, + "grad_norm": 1.9335438013076782, + "learning_rate": 5e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.6760358810424805, + "num_tokens": 114156131.0, + "step": 4413 + }, + { + "epoch": 0.4847353393367011, + "grad_norm": 1.8056827783584595, + "learning_rate": 5e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6999930143356323, + "num_tokens": 114183855.0, + "step": 4414 + }, + { + "epoch": 0.48484515703931474, + "grad_norm": 2.064342498779297, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.6979657411575317, + "num_tokens": 114204719.0, + "step": 4415 + }, + { + "epoch": 0.4849549747419284, + "grad_norm": 2.084371566772461, + "learning_rate": 5e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7156845331192017, + "num_tokens": 114224914.0, + "step": 4416 + }, + { + "epoch": 0.48506479244454204, + "grad_norm": 1.8999876976013184, + "learning_rate": 5e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6927659511566162, + "num_tokens": 114251131.0, + "step": 4417 + }, + { + "epoch": 0.48517461014715574, + "grad_norm": 1.8416204452514648, + "learning_rate": 5e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7061145305633545, + "num_tokens": 114277379.0, + "step": 4418 + }, + { + "epoch": 0.4852844278497694, + "grad_norm": 2.073260545730591, + "learning_rate": 5e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7155184149742126, + "num_tokens": 114301170.0, + "step": 4419 + }, + { + "epoch": 0.48539424555238303, + "grad_norm": 1.8090356588363647, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.704121470451355, + "num_tokens": 114327062.0, + "step": 4420 + }, + { + "epoch": 0.48550406325499673, + "grad_norm": 1.825614333152771, + "learning_rate": 5e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6952984929084778, + "num_tokens": 114355990.0, + "step": 4421 + }, + { + "epoch": 0.4856138809576104, + "grad_norm": 1.9152311086654663, + "learning_rate": 5e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7134791612625122, + "num_tokens": 114380899.0, + "step": 4422 + }, + { + "epoch": 0.485723698660224, + "grad_norm": 1.9455713033676147, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7245030403137207, + "num_tokens": 114402637.0, + "step": 4423 + }, + { + "epoch": 0.48583351636283767, + "grad_norm": 1.9490622282028198, + "learning_rate": 5e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7088853120803833, + "num_tokens": 114428613.0, + "step": 4424 + }, + { + "epoch": 0.48594333406545137, + "grad_norm": 2.073462724685669, + "learning_rate": 5e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6965863704681396, + "num_tokens": 114453567.0, + "step": 4425 + }, + { + "epoch": 0.486053151768065, + "grad_norm": 2.177126407623291, + "learning_rate": 5e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7202882766723633, + "num_tokens": 114473494.0, + "step": 4426 + }, + { + "epoch": 0.48616296947067866, + "grad_norm": 2.3107402324676514, + "learning_rate": 5e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.70671147108078, + "num_tokens": 114492685.0, + "step": 4427 + }, + { + "epoch": 0.4862727871732923, + "grad_norm": 1.803318738937378, + "learning_rate": 5e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6972370147705078, + "num_tokens": 114519811.0, + "step": 4428 + }, + { + "epoch": 0.486382604875906, + "grad_norm": 2.1140902042388916, + "learning_rate": 5e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7000890970230103, + "num_tokens": 114541540.0, + "step": 4429 + }, + { + "epoch": 0.48649242257851966, + "grad_norm": 2.2062952518463135, + "learning_rate": 5e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7170946598052979, + "num_tokens": 114561223.0, + "step": 4430 + }, + { + "epoch": 0.4866022402811333, + "grad_norm": 2.2212677001953125, + "learning_rate": 5e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7593221068382263, + "num_tokens": 114577284.0, + "step": 4431 + }, + { + "epoch": 0.486712057983747, + "grad_norm": 1.7085599899291992, + "learning_rate": 5e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6948263645172119, + "num_tokens": 114606381.0, + "step": 4432 + }, + { + "epoch": 0.48682187568636065, + "grad_norm": 2.023641347885132, + "learning_rate": 5e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.71373051404953, + "num_tokens": 114628892.0, + "step": 4433 + }, + { + "epoch": 0.4869316933889743, + "grad_norm": 1.7560621500015259, + "learning_rate": 5e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7075070142745972, + "num_tokens": 114657374.0, + "step": 4434 + }, + { + "epoch": 0.48704151109158794, + "grad_norm": 1.888758897781372, + "learning_rate": 5e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6863301992416382, + "num_tokens": 114687656.0, + "step": 4435 + }, + { + "epoch": 0.48715132879420164, + "grad_norm": 2.000112533569336, + "learning_rate": 5e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6807557344436646, + "num_tokens": 114713428.0, + "step": 4436 + }, + { + "epoch": 0.4872611464968153, + "grad_norm": 1.8506618738174438, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7254998683929443, + "num_tokens": 114737785.0, + "step": 4437 + }, + { + "epoch": 0.48737096419942894, + "grad_norm": 1.9972378015518188, + "learning_rate": 5e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7058290839195251, + "num_tokens": 114763603.0, + "step": 4438 + }, + { + "epoch": 0.48748078190204264, + "grad_norm": 1.9423110485076904, + "learning_rate": 5e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6914887428283691, + "num_tokens": 114791763.0, + "step": 4439 + }, + { + "epoch": 0.4875905996046563, + "grad_norm": 2.0977654457092285, + "learning_rate": 5e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7072837352752686, + "num_tokens": 114815426.0, + "step": 4440 + }, + { + "epoch": 0.48770041730726993, + "grad_norm": 2.0866827964782715, + "learning_rate": 5e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6898091435432434, + "num_tokens": 114836878.0, + "step": 4441 + }, + { + "epoch": 0.4878102350098836, + "grad_norm": 1.909518837928772, + "learning_rate": 5e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6949398517608643, + "num_tokens": 114865834.0, + "step": 4442 + }, + { + "epoch": 0.4879200527124973, + "grad_norm": 1.8555337190628052, + "learning_rate": 5e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.721141517162323, + "num_tokens": 114892051.0, + "step": 4443 + }, + { + "epoch": 0.4880298704151109, + "grad_norm": 2.0158438682556152, + "learning_rate": 5e-06, + "loss": 1.001, + "mean_token_accuracy": 0.69721919298172, + "num_tokens": 114915725.0, + "step": 4444 + }, + { + "epoch": 0.48813968811772457, + "grad_norm": 1.9719349145889282, + "learning_rate": 5e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7089431285858154, + "num_tokens": 114939408.0, + "step": 4445 + }, + { + "epoch": 0.4882495058203382, + "grad_norm": 1.8959081172943115, + "learning_rate": 5e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.702190637588501, + "num_tokens": 114965299.0, + "step": 4446 + }, + { + "epoch": 0.4883593235229519, + "grad_norm": 1.8027979135513306, + "learning_rate": 5e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7045505046844482, + "num_tokens": 114993227.0, + "step": 4447 + }, + { + "epoch": 0.48846914122556556, + "grad_norm": 1.8679797649383545, + "learning_rate": 5e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6977484226226807, + "num_tokens": 115021603.0, + "step": 4448 + }, + { + "epoch": 0.4885789589281792, + "grad_norm": 1.8094576597213745, + "learning_rate": 5e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7054252624511719, + "num_tokens": 115050574.0, + "step": 4449 + }, + { + "epoch": 0.4886887766307929, + "grad_norm": 1.8211541175842285, + "learning_rate": 5e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7054969668388367, + "num_tokens": 115078460.0, + "step": 4450 + }, + { + "epoch": 0.48879859433340656, + "grad_norm": 2.2023675441741943, + "learning_rate": 5e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7007589340209961, + "num_tokens": 115099359.0, + "step": 4451 + }, + { + "epoch": 0.4889084120360202, + "grad_norm": 2.00030517578125, + "learning_rate": 5e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6986433863639832, + "num_tokens": 115124939.0, + "step": 4452 + }, + { + "epoch": 0.48901822973863385, + "grad_norm": 2.0304203033447266, + "learning_rate": 5e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7135369777679443, + "num_tokens": 115148259.0, + "step": 4453 + }, + { + "epoch": 0.48912804744124755, + "grad_norm": 1.9755507707595825, + "learning_rate": 5e-06, + "loss": 1.0762, + "mean_token_accuracy": 0.6773296594619751, + "num_tokens": 115173934.0, + "step": 4454 + }, + { + "epoch": 0.4892378651438612, + "grad_norm": 2.1852166652679443, + "learning_rate": 5e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7018146514892578, + "num_tokens": 115194483.0, + "step": 4455 + }, + { + "epoch": 0.48934768284647484, + "grad_norm": 1.7434179782867432, + "learning_rate": 5e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6816517114639282, + "num_tokens": 115225267.0, + "step": 4456 + }, + { + "epoch": 0.4894575005490885, + "grad_norm": 2.105949878692627, + "learning_rate": 5e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7127190232276917, + "num_tokens": 115245619.0, + "step": 4457 + }, + { + "epoch": 0.4895673182517022, + "grad_norm": 1.9357537031173706, + "learning_rate": 5e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7366122603416443, + "num_tokens": 115269917.0, + "step": 4458 + }, + { + "epoch": 0.48967713595431583, + "grad_norm": 1.8999254703521729, + "learning_rate": 5e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7042638063430786, + "num_tokens": 115295902.0, + "step": 4459 + }, + { + "epoch": 0.4897869536569295, + "grad_norm": 1.9949759244918823, + "learning_rate": 5e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.706936240196228, + "num_tokens": 115319843.0, + "step": 4460 + }, + { + "epoch": 0.4898967713595432, + "grad_norm": 1.6570316553115845, + "learning_rate": 5e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6857982277870178, + "num_tokens": 115351509.0, + "step": 4461 + }, + { + "epoch": 0.4900065890621568, + "grad_norm": 1.9012730121612549, + "learning_rate": 5e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.706424355506897, + "num_tokens": 115376347.0, + "step": 4462 + }, + { + "epoch": 0.4901164067647705, + "grad_norm": 1.9180915355682373, + "learning_rate": 5e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7226352691650391, + "num_tokens": 115401220.0, + "step": 4463 + }, + { + "epoch": 0.4902262244673841, + "grad_norm": 2.2063815593719482, + "learning_rate": 5e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6921581029891968, + "num_tokens": 115427417.0, + "step": 4464 + }, + { + "epoch": 0.4903360421699978, + "grad_norm": 1.8932744264602661, + "learning_rate": 5e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7053602933883667, + "num_tokens": 115452459.0, + "step": 4465 + }, + { + "epoch": 0.49044585987261147, + "grad_norm": 1.8155442476272583, + "learning_rate": 5e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.7017145156860352, + "num_tokens": 115480311.0, + "step": 4466 + }, + { + "epoch": 0.4905556775752251, + "grad_norm": 1.7293105125427246, + "learning_rate": 5e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6985012888908386, + "num_tokens": 115513002.0, + "step": 4467 + }, + { + "epoch": 0.4906654952778388, + "grad_norm": 1.8413456678390503, + "learning_rate": 5e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.6959807276725769, + "num_tokens": 115543761.0, + "step": 4468 + }, + { + "epoch": 0.49077531298045246, + "grad_norm": 1.930206298828125, + "learning_rate": 5e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6924495697021484, + "num_tokens": 115569954.0, + "step": 4469 + }, + { + "epoch": 0.4908851306830661, + "grad_norm": 2.2090859413146973, + "learning_rate": 5e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7246097922325134, + "num_tokens": 115589615.0, + "step": 4470 + }, + { + "epoch": 0.49099494838567975, + "grad_norm": 1.6477831602096558, + "learning_rate": 5e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7017937898635864, + "num_tokens": 115620857.0, + "step": 4471 + }, + { + "epoch": 0.49110476608829345, + "grad_norm": 1.7897217273712158, + "learning_rate": 5e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6929912567138672, + "num_tokens": 115650287.0, + "step": 4472 + }, + { + "epoch": 0.4912145837909071, + "grad_norm": 1.7488501071929932, + "learning_rate": 5e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7212511897087097, + "num_tokens": 115676106.0, + "step": 4473 + }, + { + "epoch": 0.49132440149352075, + "grad_norm": 1.6791706085205078, + "learning_rate": 5e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6793158650398254, + "num_tokens": 115707805.0, + "step": 4474 + }, + { + "epoch": 0.4914342191961344, + "grad_norm": 1.833993911743164, + "learning_rate": 5e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6996074318885803, + "num_tokens": 115736878.0, + "step": 4475 + }, + { + "epoch": 0.4915440368987481, + "grad_norm": 2.092862844467163, + "learning_rate": 5e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7018855810165405, + "num_tokens": 115758694.0, + "step": 4476 + }, + { + "epoch": 0.49165385460136174, + "grad_norm": 1.7575936317443848, + "learning_rate": 5e-06, + "loss": 1.0879, + "mean_token_accuracy": 0.6704537868499756, + "num_tokens": 115788481.0, + "step": 4477 + }, + { + "epoch": 0.4917636723039754, + "grad_norm": 1.7751837968826294, + "learning_rate": 5e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6887924671173096, + "num_tokens": 115818200.0, + "step": 4478 + }, + { + "epoch": 0.4918734900065891, + "grad_norm": 1.8963786363601685, + "learning_rate": 5e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6940466165542603, + "num_tokens": 115842082.0, + "step": 4479 + }, + { + "epoch": 0.49198330770920273, + "grad_norm": 1.8737865686416626, + "learning_rate": 5e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7010400295257568, + "num_tokens": 115868301.0, + "step": 4480 + }, + { + "epoch": 0.4920931254118164, + "grad_norm": 1.9399045705795288, + "learning_rate": 5e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6930557489395142, + "num_tokens": 115895054.0, + "step": 4481 + }, + { + "epoch": 0.49220294311443, + "grad_norm": 1.7607783079147339, + "learning_rate": 5e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6981296539306641, + "num_tokens": 115927359.0, + "step": 4482 + }, + { + "epoch": 0.4923127608170437, + "grad_norm": 1.7520116567611694, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7212392091751099, + "num_tokens": 115954752.0, + "step": 4483 + }, + { + "epoch": 0.4924225785196574, + "grad_norm": 2.218149185180664, + "learning_rate": 5e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7097046971321106, + "num_tokens": 115973957.0, + "step": 4484 + }, + { + "epoch": 0.492532396222271, + "grad_norm": 1.8902288675308228, + "learning_rate": 5e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7208962440490723, + "num_tokens": 115999196.0, + "step": 4485 + }, + { + "epoch": 0.49264221392488466, + "grad_norm": 1.8069835901260376, + "learning_rate": 5e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7184295058250427, + "num_tokens": 116027001.0, + "step": 4486 + }, + { + "epoch": 0.49275203162749837, + "grad_norm": 1.9704556465148926, + "learning_rate": 5e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6870441436767578, + "num_tokens": 116050038.0, + "step": 4487 + }, + { + "epoch": 0.492861849330112, + "grad_norm": 1.949434757232666, + "learning_rate": 5e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6899174451828003, + "num_tokens": 116075722.0, + "step": 4488 + }, + { + "epoch": 0.49297166703272566, + "grad_norm": 1.935672640800476, + "learning_rate": 5e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7085264921188354, + "num_tokens": 116099960.0, + "step": 4489 + }, + { + "epoch": 0.49308148473533936, + "grad_norm": 1.8019623756408691, + "learning_rate": 5e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6895787715911865, + "num_tokens": 116130699.0, + "step": 4490 + }, + { + "epoch": 0.493191302437953, + "grad_norm": 1.7268208265304565, + "learning_rate": 5e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7076253890991211, + "num_tokens": 116160482.0, + "step": 4491 + }, + { + "epoch": 0.49330112014056665, + "grad_norm": 2.204401731491089, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7261180877685547, + "num_tokens": 116178659.0, + "step": 4492 + }, + { + "epoch": 0.4934109378431803, + "grad_norm": 1.8756301403045654, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7211010456085205, + "num_tokens": 116205314.0, + "step": 4493 + }, + { + "epoch": 0.493520755545794, + "grad_norm": 1.7811038494110107, + "learning_rate": 5e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7053654193878174, + "num_tokens": 116236705.0, + "step": 4494 + }, + { + "epoch": 0.49363057324840764, + "grad_norm": 2.208271026611328, + "learning_rate": 5e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6916650533676147, + "num_tokens": 116256062.0, + "step": 4495 + }, + { + "epoch": 0.4937403909510213, + "grad_norm": 1.9773566722869873, + "learning_rate": 5e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7227659225463867, + "num_tokens": 116278418.0, + "step": 4496 + }, + { + "epoch": 0.493850208653635, + "grad_norm": 1.8052641153335571, + "learning_rate": 5e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6856105327606201, + "num_tokens": 116305522.0, + "step": 4497 + }, + { + "epoch": 0.49396002635624864, + "grad_norm": 1.991042971611023, + "learning_rate": 5e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7032850980758667, + "num_tokens": 116329554.0, + "step": 4498 + }, + { + "epoch": 0.4940698440588623, + "grad_norm": 2.0275790691375732, + "learning_rate": 5e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.72557133436203, + "num_tokens": 116351386.0, + "step": 4499 + }, + { + "epoch": 0.49417966176147593, + "grad_norm": 1.8751140832901, + "learning_rate": 5e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7092975378036499, + "num_tokens": 116377585.0, + "step": 4500 + }, + { + "epoch": 0.49428947946408963, + "grad_norm": 1.8781194686889648, + "learning_rate": 5e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7263755798339844, + "num_tokens": 116401925.0, + "step": 4501 + }, + { + "epoch": 0.4943992971667033, + "grad_norm": 2.004580497741699, + "learning_rate": 5e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7021975517272949, + "num_tokens": 116424357.0, + "step": 4502 + }, + { + "epoch": 0.4945091148693169, + "grad_norm": 1.9803168773651123, + "learning_rate": 5e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7009081840515137, + "num_tokens": 116449223.0, + "step": 4503 + }, + { + "epoch": 0.49461893257193057, + "grad_norm": 1.7504526376724243, + "learning_rate": 5e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.69367516040802, + "num_tokens": 116478255.0, + "step": 4504 + }, + { + "epoch": 0.49472875027454427, + "grad_norm": 1.9661608934402466, + "learning_rate": 5e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7098402976989746, + "num_tokens": 116502569.0, + "step": 4505 + }, + { + "epoch": 0.4948385679771579, + "grad_norm": 1.9188768863677979, + "learning_rate": 5e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6832635402679443, + "num_tokens": 116528388.0, + "step": 4506 + }, + { + "epoch": 0.49494838567977156, + "grad_norm": 1.957013487815857, + "learning_rate": 5e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7021581530570984, + "num_tokens": 116550008.0, + "step": 4507 + }, + { + "epoch": 0.49505820338238526, + "grad_norm": 1.921576738357544, + "learning_rate": 5e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6976159811019897, + "num_tokens": 116578445.0, + "step": 4508 + }, + { + "epoch": 0.4951680210849989, + "grad_norm": 1.9732297658920288, + "learning_rate": 5e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6896477937698364, + "num_tokens": 116604482.0, + "step": 4509 + }, + { + "epoch": 0.49527783878761256, + "grad_norm": 1.8478147983551025, + "learning_rate": 5e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.6957302093505859, + "num_tokens": 116631827.0, + "step": 4510 + }, + { + "epoch": 0.4953876564902262, + "grad_norm": 1.9977494478225708, + "learning_rate": 5e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6937992572784424, + "num_tokens": 116656702.0, + "step": 4511 + }, + { + "epoch": 0.4954974741928399, + "grad_norm": 1.9210020303726196, + "learning_rate": 5e-06, + "loss": 1.0618, + "mean_token_accuracy": 0.6764373779296875, + "num_tokens": 116681770.0, + "step": 4512 + }, + { + "epoch": 0.49560729189545355, + "grad_norm": 2.143404245376587, + "learning_rate": 5e-06, + "loss": 1.041, + "mean_token_accuracy": 0.7011117935180664, + "num_tokens": 116704395.0, + "step": 4513 + }, + { + "epoch": 0.4957171095980672, + "grad_norm": 2.064845561981201, + "learning_rate": 5e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.71683669090271, + "num_tokens": 116729152.0, + "step": 4514 + }, + { + "epoch": 0.4958269273006809, + "grad_norm": 2.196582078933716, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7271251082420349, + "num_tokens": 116749195.0, + "step": 4515 + }, + { + "epoch": 0.49593674500329454, + "grad_norm": 1.8088730573654175, + "learning_rate": 5e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6809638738632202, + "num_tokens": 116779767.0, + "step": 4516 + }, + { + "epoch": 0.4960465627059082, + "grad_norm": 2.1465065479278564, + "learning_rate": 5e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.715822696685791, + "num_tokens": 116799174.0, + "step": 4517 + }, + { + "epoch": 0.49615638040852184, + "grad_norm": 1.9816581010818481, + "learning_rate": 5e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7049501538276672, + "num_tokens": 116824045.0, + "step": 4518 + }, + { + "epoch": 0.49626619811113554, + "grad_norm": 1.9779186248779297, + "learning_rate": 5e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7053418159484863, + "num_tokens": 116849883.0, + "step": 4519 + }, + { + "epoch": 0.4963760158137492, + "grad_norm": 2.207913398742676, + "learning_rate": 5e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7175226807594299, + "num_tokens": 116869506.0, + "step": 4520 + }, + { + "epoch": 0.49648583351636283, + "grad_norm": 2.263399600982666, + "learning_rate": 5e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7071540355682373, + "num_tokens": 116887576.0, + "step": 4521 + }, + { + "epoch": 0.4965956512189765, + "grad_norm": 1.9066585302352905, + "learning_rate": 5e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.687545895576477, + "num_tokens": 116917288.0, + "step": 4522 + }, + { + "epoch": 0.4967054689215902, + "grad_norm": 1.9328479766845703, + "learning_rate": 5e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7218703031539917, + "num_tokens": 116942427.0, + "step": 4523 + }, + { + "epoch": 0.4968152866242038, + "grad_norm": 2.3948051929473877, + "learning_rate": 5e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.693848729133606, + "num_tokens": 116959123.0, + "step": 4524 + }, + { + "epoch": 0.49692510432681747, + "grad_norm": 2.112314462661743, + "learning_rate": 5e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7077351212501526, + "num_tokens": 116980672.0, + "step": 4525 + }, + { + "epoch": 0.49703492202943117, + "grad_norm": 1.9528168439865112, + "learning_rate": 5e-06, + "loss": 1.0694, + "mean_token_accuracy": 0.6740367412567139, + "num_tokens": 117009132.0, + "step": 4526 + }, + { + "epoch": 0.4971447397320448, + "grad_norm": 2.013519763946533, + "learning_rate": 5e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7087279558181763, + "num_tokens": 117031682.0, + "step": 4527 + }, + { + "epoch": 0.49725455743465846, + "grad_norm": 1.9695708751678467, + "learning_rate": 5e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7124285697937012, + "num_tokens": 117054940.0, + "step": 4528 + }, + { + "epoch": 0.4973643751372721, + "grad_norm": 1.7979891300201416, + "learning_rate": 5e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6787729263305664, + "num_tokens": 117083413.0, + "step": 4529 + }, + { + "epoch": 0.4974741928398858, + "grad_norm": 2.1540191173553467, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7197911739349365, + "num_tokens": 117104911.0, + "step": 4530 + }, + { + "epoch": 0.49758401054249946, + "grad_norm": 1.9341685771942139, + "learning_rate": 5e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6882444620132446, + "num_tokens": 117132629.0, + "step": 4531 + }, + { + "epoch": 0.4976938282451131, + "grad_norm": 1.8821536302566528, + "learning_rate": 5e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7179135084152222, + "num_tokens": 117159580.0, + "step": 4532 + }, + { + "epoch": 0.49780364594772675, + "grad_norm": 1.982181191444397, + "learning_rate": 5e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6958566904067993, + "num_tokens": 117184511.0, + "step": 4533 + }, + { + "epoch": 0.49791346365034045, + "grad_norm": 1.8111841678619385, + "learning_rate": 5e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7227848172187805, + "num_tokens": 117210796.0, + "step": 4534 + }, + { + "epoch": 0.4980232813529541, + "grad_norm": 1.8705904483795166, + "learning_rate": 5e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7070969343185425, + "num_tokens": 117235694.0, + "step": 4535 + }, + { + "epoch": 0.49813309905556774, + "grad_norm": 1.8674315214157104, + "learning_rate": 5e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.696036159992218, + "num_tokens": 117262339.0, + "step": 4536 + }, + { + "epoch": 0.49824291675818144, + "grad_norm": 1.936774730682373, + "learning_rate": 5e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7104597091674805, + "num_tokens": 117287144.0, + "step": 4537 + }, + { + "epoch": 0.4983527344607951, + "grad_norm": 1.8458826541900635, + "learning_rate": 5e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6835216283798218, + "num_tokens": 117317379.0, + "step": 4538 + }, + { + "epoch": 0.49846255216340873, + "grad_norm": 1.8393996953964233, + "learning_rate": 5e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7062265872955322, + "num_tokens": 117344647.0, + "step": 4539 + }, + { + "epoch": 0.4985723698660224, + "grad_norm": 2.109624147415161, + "learning_rate": 5e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7024684548377991, + "num_tokens": 117365479.0, + "step": 4540 + }, + { + "epoch": 0.4986821875686361, + "grad_norm": 1.8207310438156128, + "learning_rate": 5e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7086387872695923, + "num_tokens": 117394332.0, + "step": 4541 + }, + { + "epoch": 0.4987920052712497, + "grad_norm": 1.7850595712661743, + "learning_rate": 5e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7119210362434387, + "num_tokens": 117420311.0, + "step": 4542 + }, + { + "epoch": 0.4989018229738634, + "grad_norm": 1.7622244358062744, + "learning_rate": 5e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.7034869194030762, + "num_tokens": 117450627.0, + "step": 4543 + }, + { + "epoch": 0.4990116406764771, + "grad_norm": 1.866737961769104, + "learning_rate": 5e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6960141658782959, + "num_tokens": 117477299.0, + "step": 4544 + }, + { + "epoch": 0.4991214583790907, + "grad_norm": 1.7757691144943237, + "learning_rate": 5e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6956815719604492, + "num_tokens": 117507415.0, + "step": 4545 + }, + { + "epoch": 0.49923127608170437, + "grad_norm": 2.18713116645813, + "learning_rate": 5e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7184358835220337, + "num_tokens": 117524999.0, + "step": 4546 + }, + { + "epoch": 0.499341093784318, + "grad_norm": 1.6859469413757324, + "learning_rate": 5e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6917058229446411, + "num_tokens": 117559947.0, + "step": 4547 + }, + { + "epoch": 0.4994509114869317, + "grad_norm": 1.9934940338134766, + "learning_rate": 5e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.680487334728241, + "num_tokens": 117585811.0, + "step": 4548 + }, + { + "epoch": 0.49956072918954536, + "grad_norm": 2.2348546981811523, + "learning_rate": 5e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7114149332046509, + "num_tokens": 117606229.0, + "step": 4549 + }, + { + "epoch": 0.499670546892159, + "grad_norm": 2.1141014099121094, + "learning_rate": 5e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6966361999511719, + "num_tokens": 117629437.0, + "step": 4550 + }, + { + "epoch": 0.49978036459477265, + "grad_norm": 1.867135763168335, + "learning_rate": 5e-06, + "loss": 0.921, + "mean_token_accuracy": 0.722797691822052, + "num_tokens": 117653152.0, + "step": 4551 + }, + { + "epoch": 0.49989018229738635, + "grad_norm": 2.0443668365478516, + "learning_rate": 5e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6827017068862915, + "num_tokens": 117678703.0, + "step": 4552 + }, + { + "epoch": 0.5, + "grad_norm": 1.9901103973388672, + "learning_rate": 5e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.7006624937057495, + "num_tokens": 117703958.0, + "step": 4553 + }, + { + "epoch": 0.5001098177026136, + "grad_norm": 1.994289755821228, + "learning_rate": 5e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7184053659439087, + "num_tokens": 117728753.0, + "step": 4554 + }, + { + "epoch": 0.5002196354052273, + "grad_norm": 1.9803417921066284, + "learning_rate": 5e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6945348381996155, + "num_tokens": 117754024.0, + "step": 4555 + }, + { + "epoch": 0.5003294531078409, + "grad_norm": 1.7482421398162842, + "learning_rate": 5e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7003298997879028, + "num_tokens": 117784628.0, + "step": 4556 + }, + { + "epoch": 0.5004392708104547, + "grad_norm": 2.1544015407562256, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7229174375534058, + "num_tokens": 117803794.0, + "step": 4557 + }, + { + "epoch": 0.5005490885130683, + "grad_norm": 1.9404135942459106, + "learning_rate": 5e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7093181014060974, + "num_tokens": 117828313.0, + "step": 4558 + }, + { + "epoch": 0.500658906215682, + "grad_norm": 1.9700889587402344, + "learning_rate": 5e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7114336490631104, + "num_tokens": 117850502.0, + "step": 4559 + }, + { + "epoch": 0.5007687239182956, + "grad_norm": 1.7193772792816162, + "learning_rate": 5e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6886028051376343, + "num_tokens": 117885331.0, + "step": 4560 + }, + { + "epoch": 0.5008785416209093, + "grad_norm": 1.9114919900894165, + "learning_rate": 5e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7046597599983215, + "num_tokens": 117914290.0, + "step": 4561 + }, + { + "epoch": 0.5009883593235229, + "grad_norm": 2.3497602939605713, + "learning_rate": 5e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7001708745956421, + "num_tokens": 117938097.0, + "step": 4562 + }, + { + "epoch": 0.5010981770261366, + "grad_norm": 1.7186167240142822, + "learning_rate": 5e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6935046315193176, + "num_tokens": 117967691.0, + "step": 4563 + }, + { + "epoch": 0.5012079947287503, + "grad_norm": 1.893152117729187, + "learning_rate": 5e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6794271469116211, + "num_tokens": 117995605.0, + "step": 4564 + }, + { + "epoch": 0.501317812431364, + "grad_norm": 1.943447470664978, + "learning_rate": 5e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7080235481262207, + "num_tokens": 118020899.0, + "step": 4565 + }, + { + "epoch": 0.5014276301339776, + "grad_norm": 2.023312568664551, + "learning_rate": 5e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6977946758270264, + "num_tokens": 118045066.0, + "step": 4566 + }, + { + "epoch": 0.5015374478365913, + "grad_norm": 2.118072032928467, + "learning_rate": 5e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7154953479766846, + "num_tokens": 118064977.0, + "step": 4567 + }, + { + "epoch": 0.5016472655392049, + "grad_norm": 1.67069411277771, + "learning_rate": 5e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7024809122085571, + "num_tokens": 118095212.0, + "step": 4568 + }, + { + "epoch": 0.5017570832418186, + "grad_norm": 1.9435330629348755, + "learning_rate": 5e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.6986235976219177, + "num_tokens": 118119713.0, + "step": 4569 + }, + { + "epoch": 0.5018669009444322, + "grad_norm": 1.9423012733459473, + "learning_rate": 5e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6853633522987366, + "num_tokens": 118145060.0, + "step": 4570 + }, + { + "epoch": 0.5019767186470458, + "grad_norm": 2.018925905227661, + "learning_rate": 5e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6882114410400391, + "num_tokens": 118169976.0, + "step": 4571 + }, + { + "epoch": 0.5020865363496596, + "grad_norm": 1.921971321105957, + "learning_rate": 5e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6926165223121643, + "num_tokens": 118196194.0, + "step": 4572 + }, + { + "epoch": 0.5021963540522733, + "grad_norm": 1.9276915788650513, + "learning_rate": 5e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7034417986869812, + "num_tokens": 118219967.0, + "step": 4573 + }, + { + "epoch": 0.5023061717548869, + "grad_norm": 1.680284023284912, + "learning_rate": 5e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6874446272850037, + "num_tokens": 118252613.0, + "step": 4574 + }, + { + "epoch": 0.5024159894575005, + "grad_norm": 1.9543505907058716, + "learning_rate": 5e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.697491466999054, + "num_tokens": 118279452.0, + "step": 4575 + }, + { + "epoch": 0.5025258071601142, + "grad_norm": 1.7731642723083496, + "learning_rate": 5e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7070966362953186, + "num_tokens": 118307149.0, + "step": 4576 + }, + { + "epoch": 0.5026356248627278, + "grad_norm": 1.977574348449707, + "learning_rate": 5e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7045100927352905, + "num_tokens": 118331881.0, + "step": 4577 + }, + { + "epoch": 0.5027454425653415, + "grad_norm": 2.1507952213287354, + "learning_rate": 5e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6932862401008606, + "num_tokens": 118354352.0, + "step": 4578 + }, + { + "epoch": 0.5028552602679552, + "grad_norm": 1.958535075187683, + "learning_rate": 5e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7183504104614258, + "num_tokens": 118376968.0, + "step": 4579 + }, + { + "epoch": 0.5029650779705689, + "grad_norm": 1.7263506650924683, + "learning_rate": 5e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.72315913438797, + "num_tokens": 118407950.0, + "step": 4580 + }, + { + "epoch": 0.5030748956731825, + "grad_norm": 1.8513551950454712, + "learning_rate": 5e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6941810250282288, + "num_tokens": 118436267.0, + "step": 4581 + }, + { + "epoch": 0.5031847133757962, + "grad_norm": 1.8366889953613281, + "learning_rate": 5e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7167935371398926, + "num_tokens": 118462527.0, + "step": 4582 + }, + { + "epoch": 0.5032945310784098, + "grad_norm": 1.926645278930664, + "learning_rate": 5e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7146103382110596, + "num_tokens": 118486868.0, + "step": 4583 + }, + { + "epoch": 0.5034043487810235, + "grad_norm": 2.133723258972168, + "learning_rate": 5e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7118074297904968, + "num_tokens": 118507842.0, + "step": 4584 + }, + { + "epoch": 0.5035141664836371, + "grad_norm": 1.9608383178710938, + "learning_rate": 5e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7148540019989014, + "num_tokens": 118529322.0, + "step": 4585 + }, + { + "epoch": 0.5036239841862509, + "grad_norm": 2.204390048980713, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7276648879051208, + "num_tokens": 118548060.0, + "step": 4586 + }, + { + "epoch": 0.5037338018888645, + "grad_norm": 1.9632868766784668, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6992326974868774, + "num_tokens": 118572054.0, + "step": 4587 + }, + { + "epoch": 0.5038436195914782, + "grad_norm": 1.873411774635315, + "learning_rate": 5e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6944432854652405, + "num_tokens": 118599710.0, + "step": 4588 + }, + { + "epoch": 0.5039534372940918, + "grad_norm": 1.9618419408798218, + "learning_rate": 5e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.682917594909668, + "num_tokens": 118624364.0, + "step": 4589 + }, + { + "epoch": 0.5040632549967055, + "grad_norm": 1.9875903129577637, + "learning_rate": 5e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7040412425994873, + "num_tokens": 118648761.0, + "step": 4590 + }, + { + "epoch": 0.5041730726993191, + "grad_norm": 1.8039149045944214, + "learning_rate": 5e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7011666297912598, + "num_tokens": 118673940.0, + "step": 4591 + }, + { + "epoch": 0.5042828904019327, + "grad_norm": 2.023611307144165, + "learning_rate": 5e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7016139030456543, + "num_tokens": 118697262.0, + "step": 4592 + }, + { + "epoch": 0.5043927081045465, + "grad_norm": 1.8128025531768799, + "learning_rate": 5e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6904661655426025, + "num_tokens": 118725528.0, + "step": 4593 + }, + { + "epoch": 0.5045025258071602, + "grad_norm": 1.7128188610076904, + "learning_rate": 5e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6983675956726074, + "num_tokens": 118756678.0, + "step": 4594 + }, + { + "epoch": 0.5046123435097738, + "grad_norm": 1.7918661832809448, + "learning_rate": 5e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7105287313461304, + "num_tokens": 118782874.0, + "step": 4595 + }, + { + "epoch": 0.5047221612123874, + "grad_norm": 1.9399073123931885, + "learning_rate": 5e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7044444680213928, + "num_tokens": 118808900.0, + "step": 4596 + }, + { + "epoch": 0.5048319789150011, + "grad_norm": 1.6859745979309082, + "learning_rate": 5e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7009795904159546, + "num_tokens": 118842766.0, + "step": 4597 + }, + { + "epoch": 0.5049417966176147, + "grad_norm": 2.3490688800811768, + "learning_rate": 5e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7137883901596069, + "num_tokens": 118860605.0, + "step": 4598 + }, + { + "epoch": 0.5050516143202284, + "grad_norm": 1.900416612625122, + "learning_rate": 5e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.709718644618988, + "num_tokens": 118886033.0, + "step": 4599 + }, + { + "epoch": 0.505161432022842, + "grad_norm": 1.7989845275878906, + "learning_rate": 5e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6981798410415649, + "num_tokens": 118914971.0, + "step": 4600 + }, + { + "epoch": 0.5052712497254558, + "grad_norm": 1.6704310178756714, + "learning_rate": 5e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.693203866481781, + "num_tokens": 118947462.0, + "step": 4601 + }, + { + "epoch": 0.5053810674280694, + "grad_norm": 2.161616563796997, + "learning_rate": 5e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6854419112205505, + "num_tokens": 118968591.0, + "step": 4602 + }, + { + "epoch": 0.5054908851306831, + "grad_norm": 1.836708664894104, + "learning_rate": 5e-06, + "loss": 0.905, + "mean_token_accuracy": 0.716708779335022, + "num_tokens": 118993989.0, + "step": 4603 + }, + { + "epoch": 0.5056007028332967, + "grad_norm": 1.8811513185501099, + "learning_rate": 5e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7002002596855164, + "num_tokens": 119018440.0, + "step": 4604 + }, + { + "epoch": 0.5057105205359104, + "grad_norm": 1.8809840679168701, + "learning_rate": 5e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6805287003517151, + "num_tokens": 119045670.0, + "step": 4605 + }, + { + "epoch": 0.505820338238524, + "grad_norm": 1.8662704229354858, + "learning_rate": 5e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6926240921020508, + "num_tokens": 119071962.0, + "step": 4606 + }, + { + "epoch": 0.5059301559411377, + "grad_norm": 2.115001678466797, + "learning_rate": 5e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6981900930404663, + "num_tokens": 119093180.0, + "step": 4607 + }, + { + "epoch": 0.5060399736437514, + "grad_norm": 1.7580684423446655, + "learning_rate": 5e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.7015488147735596, + "num_tokens": 119124053.0, + "step": 4608 + }, + { + "epoch": 0.5061497913463651, + "grad_norm": 1.8724243640899658, + "learning_rate": 5e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7044592499732971, + "num_tokens": 119151831.0, + "step": 4609 + }, + { + "epoch": 0.5062596090489787, + "grad_norm": 1.814432144165039, + "learning_rate": 5e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6941791772842407, + "num_tokens": 119180218.0, + "step": 4610 + }, + { + "epoch": 0.5063694267515924, + "grad_norm": 1.8249764442443848, + "learning_rate": 5e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7102907299995422, + "num_tokens": 119209328.0, + "step": 4611 + }, + { + "epoch": 0.506479244454206, + "grad_norm": 1.809747338294983, + "learning_rate": 5e-06, + "loss": 1.1404, + "mean_token_accuracy": 0.6575155854225159, + "num_tokens": 119240367.0, + "step": 4612 + }, + { + "epoch": 0.5065890621568196, + "grad_norm": 1.6430761814117432, + "learning_rate": 5e-06, + "loss": 1.1044, + "mean_token_accuracy": 0.6706234812736511, + "num_tokens": 119275252.0, + "step": 4613 + }, + { + "epoch": 0.5066988798594333, + "grad_norm": 2.0430197715759277, + "learning_rate": 5e-06, + "loss": 0.947, + "mean_token_accuracy": 0.707960844039917, + "num_tokens": 119298395.0, + "step": 4614 + }, + { + "epoch": 0.506808697562047, + "grad_norm": 1.7229552268981934, + "learning_rate": 5e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6882069706916809, + "num_tokens": 119332303.0, + "step": 4615 + }, + { + "epoch": 0.5069185152646607, + "grad_norm": 1.8400053977966309, + "learning_rate": 5e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7101867198944092, + "num_tokens": 119358872.0, + "step": 4616 + }, + { + "epoch": 0.5070283329672743, + "grad_norm": 1.5698155164718628, + "learning_rate": 5e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6952732801437378, + "num_tokens": 119393132.0, + "step": 4617 + }, + { + "epoch": 0.507138150669888, + "grad_norm": 1.9185668230056763, + "learning_rate": 5e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.68427574634552, + "num_tokens": 119417973.0, + "step": 4618 + }, + { + "epoch": 0.5072479683725016, + "grad_norm": 2.060396432876587, + "learning_rate": 5e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.704739511013031, + "num_tokens": 119441085.0, + "step": 4619 + }, + { + "epoch": 0.5073577860751153, + "grad_norm": 2.0082521438598633, + "learning_rate": 5e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7414746284484863, + "num_tokens": 119461585.0, + "step": 4620 + }, + { + "epoch": 0.5074676037777289, + "grad_norm": 1.7336177825927734, + "learning_rate": 5e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6804629564285278, + "num_tokens": 119491398.0, + "step": 4621 + }, + { + "epoch": 0.5075774214803427, + "grad_norm": 1.8432466983795166, + "learning_rate": 5e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.709191620349884, + "num_tokens": 119519527.0, + "step": 4622 + }, + { + "epoch": 0.5076872391829563, + "grad_norm": 1.8893828392028809, + "learning_rate": 5e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7293010950088501, + "num_tokens": 119542040.0, + "step": 4623 + }, + { + "epoch": 0.50779705688557, + "grad_norm": 1.8104300498962402, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7060818672180176, + "num_tokens": 119569185.0, + "step": 4624 + }, + { + "epoch": 0.5079068745881836, + "grad_norm": 1.9669547080993652, + "learning_rate": 5e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6975857615470886, + "num_tokens": 119593291.0, + "step": 4625 + }, + { + "epoch": 0.5080166922907973, + "grad_norm": 1.9832000732421875, + "learning_rate": 5e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7156226634979248, + "num_tokens": 119618369.0, + "step": 4626 + }, + { + "epoch": 0.5081265099934109, + "grad_norm": 1.864312767982483, + "learning_rate": 5e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6966373324394226, + "num_tokens": 119646745.0, + "step": 4627 + }, + { + "epoch": 0.5082363276960246, + "grad_norm": 1.7238266468048096, + "learning_rate": 5e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7041596174240112, + "num_tokens": 119676253.0, + "step": 4628 + }, + { + "epoch": 0.5083461453986382, + "grad_norm": 1.8349417448043823, + "learning_rate": 5e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7074785232543945, + "num_tokens": 119704000.0, + "step": 4629 + }, + { + "epoch": 0.508455963101252, + "grad_norm": 1.7247159481048584, + "learning_rate": 5e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6974591016769409, + "num_tokens": 119733314.0, + "step": 4630 + }, + { + "epoch": 0.5085657808038656, + "grad_norm": 1.9544556140899658, + "learning_rate": 5e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7072902321815491, + "num_tokens": 119757598.0, + "step": 4631 + }, + { + "epoch": 0.5086755985064793, + "grad_norm": 1.9998984336853027, + "learning_rate": 5e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7248404026031494, + "num_tokens": 119779487.0, + "step": 4632 + }, + { + "epoch": 0.5087854162090929, + "grad_norm": 2.2019248008728027, + "learning_rate": 5e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6939406394958496, + "num_tokens": 119802388.0, + "step": 4633 + }, + { + "epoch": 0.5088952339117065, + "grad_norm": 1.8393361568450928, + "learning_rate": 5e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6895685791969299, + "num_tokens": 119831757.0, + "step": 4634 + }, + { + "epoch": 0.5090050516143202, + "grad_norm": 1.8363970518112183, + "learning_rate": 5e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6978840827941895, + "num_tokens": 119858913.0, + "step": 4635 + }, + { + "epoch": 0.5091148693169338, + "grad_norm": 1.8595807552337646, + "learning_rate": 5e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6759628653526306, + "num_tokens": 119886642.0, + "step": 4636 + }, + { + "epoch": 0.5092246870195476, + "grad_norm": 1.9511442184448242, + "learning_rate": 5e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6971741318702698, + "num_tokens": 119911327.0, + "step": 4637 + }, + { + "epoch": 0.5093345047221612, + "grad_norm": 2.0129597187042236, + "learning_rate": 5e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6954474449157715, + "num_tokens": 119936954.0, + "step": 4638 + }, + { + "epoch": 0.5094443224247749, + "grad_norm": 1.7626574039459229, + "learning_rate": 5e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7270072102546692, + "num_tokens": 119962554.0, + "step": 4639 + }, + { + "epoch": 0.5095541401273885, + "grad_norm": 1.9361047744750977, + "learning_rate": 5e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7272634506225586, + "num_tokens": 119987270.0, + "step": 4640 + }, + { + "epoch": 0.5096639578300022, + "grad_norm": 2.0777339935302734, + "learning_rate": 5e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7233127355575562, + "num_tokens": 120008454.0, + "step": 4641 + }, + { + "epoch": 0.5097737755326158, + "grad_norm": 1.939202070236206, + "learning_rate": 5e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6878408193588257, + "num_tokens": 120033987.0, + "step": 4642 + }, + { + "epoch": 0.5098835932352295, + "grad_norm": 1.745906114578247, + "learning_rate": 5e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.712365984916687, + "num_tokens": 120062095.0, + "step": 4643 + }, + { + "epoch": 0.5099934109378432, + "grad_norm": 1.8813598155975342, + "learning_rate": 5e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7054025530815125, + "num_tokens": 120088087.0, + "step": 4644 + }, + { + "epoch": 0.5101032286404569, + "grad_norm": 2.114187002182007, + "learning_rate": 5e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.680601954460144, + "num_tokens": 120111785.0, + "step": 4645 + }, + { + "epoch": 0.5102130463430705, + "grad_norm": 2.08552885055542, + "learning_rate": 5e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7112468481063843, + "num_tokens": 120131984.0, + "step": 4646 + }, + { + "epoch": 0.5103228640456842, + "grad_norm": 1.9779142141342163, + "learning_rate": 5e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6979407072067261, + "num_tokens": 120158177.0, + "step": 4647 + }, + { + "epoch": 0.5104326817482978, + "grad_norm": 2.0169858932495117, + "learning_rate": 5e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6980185508728027, + "num_tokens": 120180702.0, + "step": 4648 + }, + { + "epoch": 0.5105424994509115, + "grad_norm": 1.973949670791626, + "learning_rate": 5e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7137496471405029, + "num_tokens": 120205535.0, + "step": 4649 + }, + { + "epoch": 0.5106523171535251, + "grad_norm": 2.1836161613464355, + "learning_rate": 5e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6903051137924194, + "num_tokens": 120225225.0, + "step": 4650 + }, + { + "epoch": 0.5107621348561389, + "grad_norm": 1.8406387567520142, + "learning_rate": 5e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.720262885093689, + "num_tokens": 120252713.0, + "step": 4651 + }, + { + "epoch": 0.5108719525587525, + "grad_norm": 2.384107828140259, + "learning_rate": 5e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7138172388076782, + "num_tokens": 120269583.0, + "step": 4652 + }, + { + "epoch": 0.5109817702613662, + "grad_norm": 2.0655736923217773, + "learning_rate": 5e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7340309023857117, + "num_tokens": 120291281.0, + "step": 4653 + }, + { + "epoch": 0.5110915879639798, + "grad_norm": 1.9831607341766357, + "learning_rate": 5e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7040433287620544, + "num_tokens": 120316196.0, + "step": 4654 + }, + { + "epoch": 0.5112014056665934, + "grad_norm": 1.8304492235183716, + "learning_rate": 5e-06, + "loss": 1.0895, + "mean_token_accuracy": 0.6751195788383484, + "num_tokens": 120347643.0, + "step": 4655 + }, + { + "epoch": 0.5113112233692071, + "grad_norm": 1.8389136791229248, + "learning_rate": 5e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7028275728225708, + "num_tokens": 120377177.0, + "step": 4656 + }, + { + "epoch": 0.5114210410718207, + "grad_norm": 1.9732342958450317, + "learning_rate": 5e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7042254209518433, + "num_tokens": 120399156.0, + "step": 4657 + }, + { + "epoch": 0.5115308587744345, + "grad_norm": 1.8483827114105225, + "learning_rate": 5e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7065845727920532, + "num_tokens": 120424103.0, + "step": 4658 + }, + { + "epoch": 0.5116406764770481, + "grad_norm": 1.7977213859558105, + "learning_rate": 5e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6844035387039185, + "num_tokens": 120452475.0, + "step": 4659 + }, + { + "epoch": 0.5117504941796618, + "grad_norm": 1.8528677225112915, + "learning_rate": 5e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7048248052597046, + "num_tokens": 120477605.0, + "step": 4660 + }, + { + "epoch": 0.5118603118822754, + "grad_norm": 2.0177783966064453, + "learning_rate": 5e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.696890652179718, + "num_tokens": 120501141.0, + "step": 4661 + }, + { + "epoch": 0.5119701295848891, + "grad_norm": 1.922204613685608, + "learning_rate": 5e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7078564763069153, + "num_tokens": 120526080.0, + "step": 4662 + }, + { + "epoch": 0.5120799472875027, + "grad_norm": 1.6970300674438477, + "learning_rate": 5e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7000508308410645, + "num_tokens": 120555638.0, + "step": 4663 + }, + { + "epoch": 0.5121897649901164, + "grad_norm": 1.851665735244751, + "learning_rate": 5e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6842178106307983, + "num_tokens": 120581980.0, + "step": 4664 + }, + { + "epoch": 0.51229958269273, + "grad_norm": 1.8851310014724731, + "learning_rate": 5e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6955840587615967, + "num_tokens": 120607052.0, + "step": 4665 + }, + { + "epoch": 0.5124094003953438, + "grad_norm": 1.7444732189178467, + "learning_rate": 5e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7291296124458313, + "num_tokens": 120639244.0, + "step": 4666 + }, + { + "epoch": 0.5125192180979574, + "grad_norm": 1.8668830394744873, + "learning_rate": 5e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.698088526725769, + "num_tokens": 120664469.0, + "step": 4667 + }, + { + "epoch": 0.5126290358005711, + "grad_norm": 1.9319566488265991, + "learning_rate": 5e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6953848004341125, + "num_tokens": 120688352.0, + "step": 4668 + }, + { + "epoch": 0.5127388535031847, + "grad_norm": 2.0393640995025635, + "learning_rate": 5e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6854439377784729, + "num_tokens": 120711554.0, + "step": 4669 + }, + { + "epoch": 0.5128486712057984, + "grad_norm": 1.7463092803955078, + "learning_rate": 5e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7088932991027832, + "num_tokens": 120740866.0, + "step": 4670 + }, + { + "epoch": 0.512958488908412, + "grad_norm": 1.9032773971557617, + "learning_rate": 5e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6991142630577087, + "num_tokens": 120767074.0, + "step": 4671 + }, + { + "epoch": 0.5130683066110256, + "grad_norm": 1.7587918043136597, + "learning_rate": 5e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7249583005905151, + "num_tokens": 120794750.0, + "step": 4672 + }, + { + "epoch": 0.5131781243136394, + "grad_norm": 2.01251220703125, + "learning_rate": 5e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.6945793628692627, + "num_tokens": 120819343.0, + "step": 4673 + }, + { + "epoch": 0.513287942016253, + "grad_norm": 1.7615684270858765, + "learning_rate": 5e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7044906616210938, + "num_tokens": 120850815.0, + "step": 4674 + }, + { + "epoch": 0.5133977597188667, + "grad_norm": 1.965279221534729, + "learning_rate": 5e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6949460506439209, + "num_tokens": 120877294.0, + "step": 4675 + }, + { + "epoch": 0.5135075774214803, + "grad_norm": 2.048471212387085, + "learning_rate": 5e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7058349847793579, + "num_tokens": 120899119.0, + "step": 4676 + }, + { + "epoch": 0.513617395124094, + "grad_norm": 1.6835083961486816, + "learning_rate": 5e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.7016012668609619, + "num_tokens": 120930801.0, + "step": 4677 + }, + { + "epoch": 0.5137272128267076, + "grad_norm": 2.2259976863861084, + "learning_rate": 5e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7137719392776489, + "num_tokens": 120949457.0, + "step": 4678 + }, + { + "epoch": 0.5138370305293213, + "grad_norm": 1.5343841314315796, + "learning_rate": 5e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6836145520210266, + "num_tokens": 120989108.0, + "step": 4679 + }, + { + "epoch": 0.513946848231935, + "grad_norm": 1.7876508235931396, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7063971757888794, + "num_tokens": 121021251.0, + "step": 4680 + }, + { + "epoch": 0.5140566659345487, + "grad_norm": 1.954914927482605, + "learning_rate": 5e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7088005542755127, + "num_tokens": 121046209.0, + "step": 4681 + }, + { + "epoch": 0.5141664836371623, + "grad_norm": 2.1849722862243652, + "learning_rate": 5e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7259669303894043, + "num_tokens": 121065517.0, + "step": 4682 + }, + { + "epoch": 0.514276301339776, + "grad_norm": 1.7917118072509766, + "learning_rate": 5e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6818681955337524, + "num_tokens": 121093035.0, + "step": 4683 + }, + { + "epoch": 0.5143861190423896, + "grad_norm": 1.869580864906311, + "learning_rate": 5e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7125554084777832, + "num_tokens": 121119263.0, + "step": 4684 + }, + { + "epoch": 0.5144959367450033, + "grad_norm": 1.6667934656143188, + "learning_rate": 5e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7183286547660828, + "num_tokens": 121148242.0, + "step": 4685 + }, + { + "epoch": 0.5146057544476169, + "grad_norm": 2.1396710872650146, + "learning_rate": 5e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7092489004135132, + "num_tokens": 121170400.0, + "step": 4686 + }, + { + "epoch": 0.5147155721502307, + "grad_norm": 2.015213966369629, + "learning_rate": 5e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.717348039150238, + "num_tokens": 121192037.0, + "step": 4687 + }, + { + "epoch": 0.5148253898528443, + "grad_norm": 1.8266602754592896, + "learning_rate": 5e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6912245750427246, + "num_tokens": 121223883.0, + "step": 4688 + }, + { + "epoch": 0.514935207555458, + "grad_norm": 2.1529202461242676, + "learning_rate": 5e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6927587985992432, + "num_tokens": 121248434.0, + "step": 4689 + }, + { + "epoch": 0.5150450252580716, + "grad_norm": 2.0374186038970947, + "learning_rate": 5e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7177624106407166, + "num_tokens": 121271630.0, + "step": 4690 + }, + { + "epoch": 0.5151548429606853, + "grad_norm": 1.7336387634277344, + "learning_rate": 5e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7057400941848755, + "num_tokens": 121301459.0, + "step": 4691 + }, + { + "epoch": 0.5152646606632989, + "grad_norm": 1.8753670454025269, + "learning_rate": 5e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7255548238754272, + "num_tokens": 121326248.0, + "step": 4692 + }, + { + "epoch": 0.5153744783659125, + "grad_norm": 1.7998863458633423, + "learning_rate": 5e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7073830962181091, + "num_tokens": 121354874.0, + "step": 4693 + }, + { + "epoch": 0.5154842960685262, + "grad_norm": 1.8298161029815674, + "learning_rate": 5e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.726274847984314, + "num_tokens": 121383779.0, + "step": 4694 + }, + { + "epoch": 0.51559411377114, + "grad_norm": 1.9256484508514404, + "learning_rate": 5e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7020266652107239, + "num_tokens": 121409664.0, + "step": 4695 + }, + { + "epoch": 0.5157039314737536, + "grad_norm": 1.959999918937683, + "learning_rate": 5e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6908483505249023, + "num_tokens": 121434171.0, + "step": 4696 + }, + { + "epoch": 0.5158137491763672, + "grad_norm": 1.8607840538024902, + "learning_rate": 5e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7158075571060181, + "num_tokens": 121460710.0, + "step": 4697 + }, + { + "epoch": 0.5159235668789809, + "grad_norm": 1.8904203176498413, + "learning_rate": 5e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6910691857337952, + "num_tokens": 121486528.0, + "step": 4698 + }, + { + "epoch": 0.5160333845815945, + "grad_norm": 1.8670899868011475, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7098685503005981, + "num_tokens": 121513232.0, + "step": 4699 + }, + { + "epoch": 0.5161432022842082, + "grad_norm": 1.7407015562057495, + "learning_rate": 5e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7049660682678223, + "num_tokens": 121542645.0, + "step": 4700 + }, + { + "epoch": 0.5162530199868218, + "grad_norm": 1.778436541557312, + "learning_rate": 5e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7141131162643433, + "num_tokens": 121568651.0, + "step": 4701 + }, + { + "epoch": 0.5163628376894356, + "grad_norm": 1.9610604047775269, + "learning_rate": 5e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6854914426803589, + "num_tokens": 121592426.0, + "step": 4702 + }, + { + "epoch": 0.5164726553920492, + "grad_norm": 1.992747187614441, + "learning_rate": 5e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7026426196098328, + "num_tokens": 121615939.0, + "step": 4703 + }, + { + "epoch": 0.5165824730946629, + "grad_norm": 1.852726697921753, + "learning_rate": 5e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6797820329666138, + "num_tokens": 121644108.0, + "step": 4704 + }, + { + "epoch": 0.5166922907972765, + "grad_norm": 1.8047196865081787, + "learning_rate": 5e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6993305683135986, + "num_tokens": 121672524.0, + "step": 4705 + }, + { + "epoch": 0.5168021084998902, + "grad_norm": 1.8823925256729126, + "learning_rate": 5e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7040257453918457, + "num_tokens": 121696731.0, + "step": 4706 + }, + { + "epoch": 0.5169119262025038, + "grad_norm": 1.8638010025024414, + "learning_rate": 5e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.717571496963501, + "num_tokens": 121721872.0, + "step": 4707 + }, + { + "epoch": 0.5170217439051175, + "grad_norm": 1.919356346130371, + "learning_rate": 5e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7048542499542236, + "num_tokens": 121748661.0, + "step": 4708 + }, + { + "epoch": 0.5171315616077312, + "grad_norm": 2.072542428970337, + "learning_rate": 5e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7006151676177979, + "num_tokens": 121771226.0, + "step": 4709 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 2.026975154876709, + "learning_rate": 5e-06, + "loss": 1.046, + "mean_token_accuracy": 0.686686098575592, + "num_tokens": 121793929.0, + "step": 4710 + }, + { + "epoch": 0.5173511970129585, + "grad_norm": 2.044257879257202, + "learning_rate": 5e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7070400714874268, + "num_tokens": 121818520.0, + "step": 4711 + }, + { + "epoch": 0.5174610147155722, + "grad_norm": 1.7169793844223022, + "learning_rate": 5e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7047160863876343, + "num_tokens": 121847761.0, + "step": 4712 + }, + { + "epoch": 0.5175708324181858, + "grad_norm": 1.8144627809524536, + "learning_rate": 5e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6814587712287903, + "num_tokens": 121876414.0, + "step": 4713 + }, + { + "epoch": 0.5176806501207994, + "grad_norm": 1.8278108835220337, + "learning_rate": 5e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7224932909011841, + "num_tokens": 121901514.0, + "step": 4714 + }, + { + "epoch": 0.5177904678234131, + "grad_norm": 1.9135419130325317, + "learning_rate": 5e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7035865187644958, + "num_tokens": 121925162.0, + "step": 4715 + }, + { + "epoch": 0.5179002855260268, + "grad_norm": 1.9321353435516357, + "learning_rate": 5e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7086582183837891, + "num_tokens": 121949078.0, + "step": 4716 + }, + { + "epoch": 0.5180101032286405, + "grad_norm": 1.9380183219909668, + "learning_rate": 5e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7007730007171631, + "num_tokens": 121973058.0, + "step": 4717 + }, + { + "epoch": 0.5181199209312541, + "grad_norm": 1.8934005498886108, + "learning_rate": 5e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6890861988067627, + "num_tokens": 121998423.0, + "step": 4718 + }, + { + "epoch": 0.5182297386338678, + "grad_norm": 1.753299593925476, + "learning_rate": 5e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6936163902282715, + "num_tokens": 122028592.0, + "step": 4719 + }, + { + "epoch": 0.5183395563364814, + "grad_norm": 1.9678748846054077, + "learning_rate": 5e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6988074779510498, + "num_tokens": 122051612.0, + "step": 4720 + }, + { + "epoch": 0.5184493740390951, + "grad_norm": 2.0479214191436768, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7210038304328918, + "num_tokens": 122075790.0, + "step": 4721 + }, + { + "epoch": 0.5185591917417087, + "grad_norm": 1.9190741777420044, + "learning_rate": 5e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6925018429756165, + "num_tokens": 122101729.0, + "step": 4722 + }, + { + "epoch": 0.5186690094443224, + "grad_norm": 1.8894575834274292, + "learning_rate": 5e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7016725540161133, + "num_tokens": 122127176.0, + "step": 4723 + }, + { + "epoch": 0.5187788271469361, + "grad_norm": 1.9735419750213623, + "learning_rate": 5e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7309412956237793, + "num_tokens": 122148786.0, + "step": 4724 + }, + { + "epoch": 0.5188886448495498, + "grad_norm": 2.098341226577759, + "learning_rate": 5e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.721276044845581, + "num_tokens": 122168943.0, + "step": 4725 + }, + { + "epoch": 0.5189984625521634, + "grad_norm": 2.008852243423462, + "learning_rate": 5e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6967896223068237, + "num_tokens": 122194494.0, + "step": 4726 + }, + { + "epoch": 0.5191082802547771, + "grad_norm": 1.792802095413208, + "learning_rate": 5e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7046377658843994, + "num_tokens": 122220836.0, + "step": 4727 + }, + { + "epoch": 0.5192180979573907, + "grad_norm": 1.9615004062652588, + "learning_rate": 5e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6951481103897095, + "num_tokens": 122246688.0, + "step": 4728 + }, + { + "epoch": 0.5193279156600044, + "grad_norm": 2.018165111541748, + "learning_rate": 5e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6990170478820801, + "num_tokens": 122270161.0, + "step": 4729 + }, + { + "epoch": 0.519437733362618, + "grad_norm": 1.9432275295257568, + "learning_rate": 5e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6962551474571228, + "num_tokens": 122294741.0, + "step": 4730 + }, + { + "epoch": 0.5195475510652318, + "grad_norm": 2.1718435287475586, + "learning_rate": 5e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.6970725059509277, + "num_tokens": 122315489.0, + "step": 4731 + }, + { + "epoch": 0.5196573687678454, + "grad_norm": 2.0007810592651367, + "learning_rate": 5e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7160942554473877, + "num_tokens": 122337546.0, + "step": 4732 + }, + { + "epoch": 0.519767186470459, + "grad_norm": 2.0263752937316895, + "learning_rate": 5e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7018847465515137, + "num_tokens": 122360589.0, + "step": 4733 + }, + { + "epoch": 0.5198770041730727, + "grad_norm": 2.0201399326324463, + "learning_rate": 5e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7172316908836365, + "num_tokens": 122384733.0, + "step": 4734 + }, + { + "epoch": 0.5199868218756863, + "grad_norm": 2.109771490097046, + "learning_rate": 5e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.72100830078125, + "num_tokens": 122407513.0, + "step": 4735 + }, + { + "epoch": 0.5200966395783, + "grad_norm": 1.9688297510147095, + "learning_rate": 5e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6899777054786682, + "num_tokens": 122430847.0, + "step": 4736 + }, + { + "epoch": 0.5202064572809136, + "grad_norm": 1.9427696466445923, + "learning_rate": 5e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7216234803199768, + "num_tokens": 122452885.0, + "step": 4737 + }, + { + "epoch": 0.5203162749835274, + "grad_norm": 1.7371439933776855, + "learning_rate": 5e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6936696767807007, + "num_tokens": 122482533.0, + "step": 4738 + }, + { + "epoch": 0.520426092686141, + "grad_norm": 1.7973453998565674, + "learning_rate": 5e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.698707103729248, + "num_tokens": 122511524.0, + "step": 4739 + }, + { + "epoch": 0.5205359103887547, + "grad_norm": 1.7120935916900635, + "learning_rate": 5e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6868217587471008, + "num_tokens": 122543441.0, + "step": 4740 + }, + { + "epoch": 0.5206457280913683, + "grad_norm": 1.944929599761963, + "learning_rate": 5e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7109965085983276, + "num_tokens": 122566926.0, + "step": 4741 + }, + { + "epoch": 0.520755545793982, + "grad_norm": 1.9482852220535278, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7323322296142578, + "num_tokens": 122588992.0, + "step": 4742 + }, + { + "epoch": 0.5208653634965956, + "grad_norm": 1.8633294105529785, + "learning_rate": 5e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6958892345428467, + "num_tokens": 122616282.0, + "step": 4743 + }, + { + "epoch": 0.5209751811992093, + "grad_norm": 2.0460269451141357, + "learning_rate": 5e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7043494582176208, + "num_tokens": 122637695.0, + "step": 4744 + }, + { + "epoch": 0.521084998901823, + "grad_norm": 2.0944035053253174, + "learning_rate": 5e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.6964539289474487, + "num_tokens": 122659022.0, + "step": 4745 + }, + { + "epoch": 0.5211948166044367, + "grad_norm": 1.672939419746399, + "learning_rate": 5e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6848976612091064, + "num_tokens": 122689577.0, + "step": 4746 + }, + { + "epoch": 0.5213046343070503, + "grad_norm": 1.7858856916427612, + "learning_rate": 5e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7059842944145203, + "num_tokens": 122717236.0, + "step": 4747 + }, + { + "epoch": 0.521414452009664, + "grad_norm": 1.8525867462158203, + "learning_rate": 5e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6997289061546326, + "num_tokens": 122741849.0, + "step": 4748 + }, + { + "epoch": 0.5215242697122776, + "grad_norm": 1.710771918296814, + "learning_rate": 5e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7106770277023315, + "num_tokens": 122772158.0, + "step": 4749 + }, + { + "epoch": 0.5216340874148913, + "grad_norm": 2.002607583999634, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7120333313941956, + "num_tokens": 122793838.0, + "step": 4750 + }, + { + "epoch": 0.5217439051175049, + "grad_norm": 1.7391674518585205, + "learning_rate": 5e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7030259370803833, + "num_tokens": 122823998.0, + "step": 4751 + }, + { + "epoch": 0.5218537228201185, + "grad_norm": 1.9340314865112305, + "learning_rate": 5e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6921776533126831, + "num_tokens": 122849234.0, + "step": 4752 + }, + { + "epoch": 0.5219635405227323, + "grad_norm": 1.7354577779769897, + "learning_rate": 5e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7103187441825867, + "num_tokens": 122879816.0, + "step": 4753 + }, + { + "epoch": 0.522073358225346, + "grad_norm": 1.9136075973510742, + "learning_rate": 5e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7204636335372925, + "num_tokens": 122903900.0, + "step": 4754 + }, + { + "epoch": 0.5221831759279596, + "grad_norm": 1.7876770496368408, + "learning_rate": 5e-06, + "loss": 1.0829, + "mean_token_accuracy": 0.6711266040802002, + "num_tokens": 122938130.0, + "step": 4755 + }, + { + "epoch": 0.5222929936305732, + "grad_norm": 1.9665437936782837, + "learning_rate": 5e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6956430673599243, + "num_tokens": 122961506.0, + "step": 4756 + }, + { + "epoch": 0.5224028113331869, + "grad_norm": 2.007883310317993, + "learning_rate": 5e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6798052191734314, + "num_tokens": 122989305.0, + "step": 4757 + }, + { + "epoch": 0.5225126290358005, + "grad_norm": 1.8202193975448608, + "learning_rate": 5e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6848381757736206, + "num_tokens": 123018347.0, + "step": 4758 + }, + { + "epoch": 0.5226224467384142, + "grad_norm": 1.8092947006225586, + "learning_rate": 5e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6863296031951904, + "num_tokens": 123049642.0, + "step": 4759 + }, + { + "epoch": 0.5227322644410279, + "grad_norm": 1.9210602045059204, + "learning_rate": 5e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7115159630775452, + "num_tokens": 123076258.0, + "step": 4760 + }, + { + "epoch": 0.5228420821436416, + "grad_norm": 2.0427050590515137, + "learning_rate": 5e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7116794586181641, + "num_tokens": 123097586.0, + "step": 4761 + }, + { + "epoch": 0.5229518998462552, + "grad_norm": 2.1868386268615723, + "learning_rate": 5e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.6975126266479492, + "num_tokens": 123119659.0, + "step": 4762 + }, + { + "epoch": 0.5230617175488689, + "grad_norm": 1.993396520614624, + "learning_rate": 5e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6888720393180847, + "num_tokens": 123144431.0, + "step": 4763 + }, + { + "epoch": 0.5231715352514825, + "grad_norm": 1.9824323654174805, + "learning_rate": 5e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7075926065444946, + "num_tokens": 123167518.0, + "step": 4764 + }, + { + "epoch": 0.5232813529540962, + "grad_norm": 1.9904128313064575, + "learning_rate": 5e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6981820464134216, + "num_tokens": 123191814.0, + "step": 4765 + }, + { + "epoch": 0.5233911706567098, + "grad_norm": 1.9082725048065186, + "learning_rate": 5e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7331733703613281, + "num_tokens": 123215134.0, + "step": 4766 + }, + { + "epoch": 0.5235009883593236, + "grad_norm": 1.8174947500228882, + "learning_rate": 5e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7152522206306458, + "num_tokens": 123240332.0, + "step": 4767 + }, + { + "epoch": 0.5236108060619372, + "grad_norm": 2.0778310298919678, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7313891649246216, + "num_tokens": 123258290.0, + "step": 4768 + }, + { + "epoch": 0.5237206237645509, + "grad_norm": 2.17325496673584, + "learning_rate": 5e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7055902481079102, + "num_tokens": 123277589.0, + "step": 4769 + }, + { + "epoch": 0.5238304414671645, + "grad_norm": 2.0422050952911377, + "learning_rate": 5e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6967020034790039, + "num_tokens": 123300662.0, + "step": 4770 + }, + { + "epoch": 0.5239402591697782, + "grad_norm": 1.89481782913208, + "learning_rate": 5e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6939271688461304, + "num_tokens": 123324851.0, + "step": 4771 + }, + { + "epoch": 0.5240500768723918, + "grad_norm": 1.860298752784729, + "learning_rate": 5e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7151622772216797, + "num_tokens": 123350146.0, + "step": 4772 + }, + { + "epoch": 0.5241598945750054, + "grad_norm": 2.0887937545776367, + "learning_rate": 5e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6948496103286743, + "num_tokens": 123372877.0, + "step": 4773 + }, + { + "epoch": 0.5242697122776192, + "grad_norm": 1.8338552713394165, + "learning_rate": 5e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.688994824886322, + "num_tokens": 123401885.0, + "step": 4774 + }, + { + "epoch": 0.5243795299802329, + "grad_norm": 2.2389750480651855, + "learning_rate": 5e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.6969016790390015, + "num_tokens": 123421191.0, + "step": 4775 + }, + { + "epoch": 0.5244893476828465, + "grad_norm": 1.9276230335235596, + "learning_rate": 5e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7006188631057739, + "num_tokens": 123444301.0, + "step": 4776 + }, + { + "epoch": 0.5245991653854601, + "grad_norm": 1.7380025386810303, + "learning_rate": 5e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7259346842765808, + "num_tokens": 123471244.0, + "step": 4777 + }, + { + "epoch": 0.5247089830880738, + "grad_norm": 1.9385805130004883, + "learning_rate": 5e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6984997987747192, + "num_tokens": 123498206.0, + "step": 4778 + }, + { + "epoch": 0.5248188007906874, + "grad_norm": 1.7257061004638672, + "learning_rate": 5e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.707841694355011, + "num_tokens": 123526962.0, + "step": 4779 + }, + { + "epoch": 0.5249286184933011, + "grad_norm": 2.012676477432251, + "learning_rate": 5e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7186610102653503, + "num_tokens": 123548625.0, + "step": 4780 + }, + { + "epoch": 0.5250384361959147, + "grad_norm": 1.6698927879333496, + "learning_rate": 5e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6932757496833801, + "num_tokens": 123580396.0, + "step": 4781 + }, + { + "epoch": 0.5251482538985285, + "grad_norm": 1.784975528717041, + "learning_rate": 5e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6961215734481812, + "num_tokens": 123608633.0, + "step": 4782 + }, + { + "epoch": 0.5252580716011421, + "grad_norm": 1.9029676914215088, + "learning_rate": 5e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6819334030151367, + "num_tokens": 123634740.0, + "step": 4783 + }, + { + "epoch": 0.5253678893037558, + "grad_norm": 1.7249469757080078, + "learning_rate": 5e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7002153396606445, + "num_tokens": 123663735.0, + "step": 4784 + }, + { + "epoch": 0.5254777070063694, + "grad_norm": 1.8050042390823364, + "learning_rate": 5e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6868013739585876, + "num_tokens": 123693328.0, + "step": 4785 + }, + { + "epoch": 0.5255875247089831, + "grad_norm": 2.187739133834839, + "learning_rate": 5e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7154135704040527, + "num_tokens": 123712796.0, + "step": 4786 + }, + { + "epoch": 0.5256973424115967, + "grad_norm": 1.697543740272522, + "learning_rate": 5e-06, + "loss": 1.1012, + "mean_token_accuracy": 0.6715112328529358, + "num_tokens": 123744515.0, + "step": 4787 + }, + { + "epoch": 0.5258071601142104, + "grad_norm": 2.257075071334839, + "learning_rate": 5e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7157592177391052, + "num_tokens": 123762532.0, + "step": 4788 + }, + { + "epoch": 0.5259169778168241, + "grad_norm": 2.014007568359375, + "learning_rate": 5e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7031695246696472, + "num_tokens": 123785201.0, + "step": 4789 + }, + { + "epoch": 0.5260267955194378, + "grad_norm": 1.797688364982605, + "learning_rate": 5e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7099198698997498, + "num_tokens": 123812711.0, + "step": 4790 + }, + { + "epoch": 0.5261366132220514, + "grad_norm": 1.837164282798767, + "learning_rate": 5e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6936416625976562, + "num_tokens": 123840612.0, + "step": 4791 + }, + { + "epoch": 0.526246430924665, + "grad_norm": 1.9554520845413208, + "learning_rate": 5e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7019897699356079, + "num_tokens": 123863006.0, + "step": 4792 + }, + { + "epoch": 0.5263562486272787, + "grad_norm": 1.9186153411865234, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7024214267730713, + "num_tokens": 123886876.0, + "step": 4793 + }, + { + "epoch": 0.5264660663298923, + "grad_norm": 1.9907516241073608, + "learning_rate": 5e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.715625524520874, + "num_tokens": 123909138.0, + "step": 4794 + }, + { + "epoch": 0.526575884032506, + "grad_norm": 2.123211622238159, + "learning_rate": 5e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6934577822685242, + "num_tokens": 123931530.0, + "step": 4795 + }, + { + "epoch": 0.5266857017351197, + "grad_norm": 1.9630396366119385, + "learning_rate": 5e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6973863840103149, + "num_tokens": 123956548.0, + "step": 4796 + }, + { + "epoch": 0.5267955194377334, + "grad_norm": 1.7980496883392334, + "learning_rate": 5e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7119171619415283, + "num_tokens": 123983377.0, + "step": 4797 + }, + { + "epoch": 0.526905337140347, + "grad_norm": 1.8182361125946045, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7130221128463745, + "num_tokens": 124010324.0, + "step": 4798 + }, + { + "epoch": 0.5270151548429607, + "grad_norm": 1.8731166124343872, + "learning_rate": 5e-06, + "loss": 1.0905, + "mean_token_accuracy": 0.6753664016723633, + "num_tokens": 124037184.0, + "step": 4799 + }, + { + "epoch": 0.5271249725455743, + "grad_norm": 2.016979455947876, + "learning_rate": 5e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7102247476577759, + "num_tokens": 124058727.0, + "step": 4800 + }, + { + "epoch": 0.527234790248188, + "grad_norm": 1.8903435468673706, + "learning_rate": 5e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6900667548179626, + "num_tokens": 124084254.0, + "step": 4801 + }, + { + "epoch": 0.5273446079508016, + "grad_norm": 1.9703372716903687, + "learning_rate": 5e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7168761491775513, + "num_tokens": 124107188.0, + "step": 4802 + }, + { + "epoch": 0.5274544256534154, + "grad_norm": 1.9062269926071167, + "learning_rate": 5e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7023490071296692, + "num_tokens": 124133566.0, + "step": 4803 + }, + { + "epoch": 0.527564243356029, + "grad_norm": 1.85844886302948, + "learning_rate": 5e-06, + "loss": 1.1002, + "mean_token_accuracy": 0.6725178956985474, + "num_tokens": 124162274.0, + "step": 4804 + }, + { + "epoch": 0.5276740610586427, + "grad_norm": 1.952866554260254, + "learning_rate": 5e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.6922997832298279, + "num_tokens": 124187000.0, + "step": 4805 + }, + { + "epoch": 0.5277838787612563, + "grad_norm": 1.9485182762145996, + "learning_rate": 5e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.6951120495796204, + "num_tokens": 124210968.0, + "step": 4806 + }, + { + "epoch": 0.52789369646387, + "grad_norm": 1.7327208518981934, + "learning_rate": 5e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6851261854171753, + "num_tokens": 124244598.0, + "step": 4807 + }, + { + "epoch": 0.5280035141664836, + "grad_norm": 1.8320621252059937, + "learning_rate": 5e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7143848538398743, + "num_tokens": 124269075.0, + "step": 4808 + }, + { + "epoch": 0.5281133318690973, + "grad_norm": 1.806898593902588, + "learning_rate": 5e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.719985842704773, + "num_tokens": 124298439.0, + "step": 4809 + }, + { + "epoch": 0.528223149571711, + "grad_norm": 1.8449852466583252, + "learning_rate": 5e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7241630554199219, + "num_tokens": 124323069.0, + "step": 4810 + }, + { + "epoch": 0.5283329672743247, + "grad_norm": 1.7804828882217407, + "learning_rate": 5e-06, + "loss": 1.0889, + "mean_token_accuracy": 0.6777262687683105, + "num_tokens": 124355911.0, + "step": 4811 + }, + { + "epoch": 0.5284427849769383, + "grad_norm": 1.9502493143081665, + "learning_rate": 5e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7181291580200195, + "num_tokens": 124379768.0, + "step": 4812 + }, + { + "epoch": 0.528552602679552, + "grad_norm": 2.0531296730041504, + "learning_rate": 5e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.706824541091919, + "num_tokens": 124402593.0, + "step": 4813 + }, + { + "epoch": 0.5286624203821656, + "grad_norm": 1.7970600128173828, + "learning_rate": 5e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6836963891983032, + "num_tokens": 124430564.0, + "step": 4814 + }, + { + "epoch": 0.5287722380847792, + "grad_norm": 1.9179606437683105, + "learning_rate": 5e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6920555233955383, + "num_tokens": 124456330.0, + "step": 4815 + }, + { + "epoch": 0.5288820557873929, + "grad_norm": 1.7212023735046387, + "learning_rate": 5e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7264934182167053, + "num_tokens": 124484021.0, + "step": 4816 + }, + { + "epoch": 0.5289918734900065, + "grad_norm": 1.8648290634155273, + "learning_rate": 5e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6989542841911316, + "num_tokens": 124511375.0, + "step": 4817 + }, + { + "epoch": 0.5291016911926203, + "grad_norm": 1.6462407112121582, + "learning_rate": 5e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7037930488586426, + "num_tokens": 124544878.0, + "step": 4818 + }, + { + "epoch": 0.5292115088952339, + "grad_norm": 1.8188966512680054, + "learning_rate": 5e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7018200755119324, + "num_tokens": 124571794.0, + "step": 4819 + }, + { + "epoch": 0.5293213265978476, + "grad_norm": 1.9113128185272217, + "learning_rate": 5e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7098968029022217, + "num_tokens": 124597332.0, + "step": 4820 + }, + { + "epoch": 0.5294311443004612, + "grad_norm": 1.9726890325546265, + "learning_rate": 5e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6940091848373413, + "num_tokens": 124622018.0, + "step": 4821 + }, + { + "epoch": 0.5295409620030749, + "grad_norm": 1.981111764907837, + "learning_rate": 5e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.700072169303894, + "num_tokens": 124644500.0, + "step": 4822 + }, + { + "epoch": 0.5296507797056885, + "grad_norm": 2.0602381229400635, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7309410572052002, + "num_tokens": 124665445.0, + "step": 4823 + }, + { + "epoch": 0.5297605974083022, + "grad_norm": 1.652169108390808, + "learning_rate": 5e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6945806741714478, + "num_tokens": 124698241.0, + "step": 4824 + }, + { + "epoch": 0.5298704151109159, + "grad_norm": 1.9649102687835693, + "learning_rate": 5e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6799509525299072, + "num_tokens": 124726889.0, + "step": 4825 + }, + { + "epoch": 0.5299802328135296, + "grad_norm": 2.082066059112549, + "learning_rate": 5e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7089807987213135, + "num_tokens": 124751761.0, + "step": 4826 + }, + { + "epoch": 0.5300900505161432, + "grad_norm": 2.1826159954071045, + "learning_rate": 5e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7040807008743286, + "num_tokens": 124771279.0, + "step": 4827 + }, + { + "epoch": 0.5301998682187569, + "grad_norm": 1.639549970626831, + "learning_rate": 5e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7023663520812988, + "num_tokens": 124804261.0, + "step": 4828 + }, + { + "epoch": 0.5303096859213705, + "grad_norm": 1.8251049518585205, + "learning_rate": 5e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7192175388336182, + "num_tokens": 124829262.0, + "step": 4829 + }, + { + "epoch": 0.5304195036239842, + "grad_norm": 1.8739160299301147, + "learning_rate": 5e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7169676423072815, + "num_tokens": 124856919.0, + "step": 4830 + }, + { + "epoch": 0.5305293213265978, + "grad_norm": 2.305454730987549, + "learning_rate": 5e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7122907042503357, + "num_tokens": 124876072.0, + "step": 4831 + }, + { + "epoch": 0.5306391390292116, + "grad_norm": 1.9117413759231567, + "learning_rate": 5e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7120133638381958, + "num_tokens": 124902607.0, + "step": 4832 + }, + { + "epoch": 0.5307489567318252, + "grad_norm": 1.935237169265747, + "learning_rate": 5e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.718283474445343, + "num_tokens": 124925287.0, + "step": 4833 + }, + { + "epoch": 0.5308587744344389, + "grad_norm": 1.7555248737335205, + "learning_rate": 5e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6931208968162537, + "num_tokens": 124954756.0, + "step": 4834 + }, + { + "epoch": 0.5309685921370525, + "grad_norm": 1.763981580734253, + "learning_rate": 5e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6980602741241455, + "num_tokens": 124983069.0, + "step": 4835 + }, + { + "epoch": 0.5310784098396661, + "grad_norm": 2.0460188388824463, + "learning_rate": 5e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7284202575683594, + "num_tokens": 125006232.0, + "step": 4836 + }, + { + "epoch": 0.5311882275422798, + "grad_norm": 1.9018298387527466, + "learning_rate": 5e-06, + "loss": 1.1039, + "mean_token_accuracy": 0.6739374399185181, + "num_tokens": 125034944.0, + "step": 4837 + }, + { + "epoch": 0.5312980452448934, + "grad_norm": 1.7428234815597534, + "learning_rate": 5e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7068663835525513, + "num_tokens": 125061067.0, + "step": 4838 + }, + { + "epoch": 0.5314078629475072, + "grad_norm": 2.1127333641052246, + "learning_rate": 5e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7076264023780823, + "num_tokens": 125081552.0, + "step": 4839 + }, + { + "epoch": 0.5315176806501208, + "grad_norm": 1.8378853797912598, + "learning_rate": 5e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6890853643417358, + "num_tokens": 125110417.0, + "step": 4840 + }, + { + "epoch": 0.5316274983527345, + "grad_norm": 2.000077486038208, + "learning_rate": 5e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6849425435066223, + "num_tokens": 125135033.0, + "step": 4841 + }, + { + "epoch": 0.5317373160553481, + "grad_norm": 1.6348366737365723, + "learning_rate": 5e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.688525915145874, + "num_tokens": 125169456.0, + "step": 4842 + }, + { + "epoch": 0.5318471337579618, + "grad_norm": 1.9133646488189697, + "learning_rate": 5e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7006222009658813, + "num_tokens": 125195086.0, + "step": 4843 + }, + { + "epoch": 0.5319569514605754, + "grad_norm": 1.6892839670181274, + "learning_rate": 5e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7084513902664185, + "num_tokens": 125226735.0, + "step": 4844 + }, + { + "epoch": 0.5320667691631891, + "grad_norm": 1.8387352228164673, + "learning_rate": 5e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6981528997421265, + "num_tokens": 125252293.0, + "step": 4845 + }, + { + "epoch": 0.5321765868658027, + "grad_norm": 1.7238426208496094, + "learning_rate": 5e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7328161001205444, + "num_tokens": 125281623.0, + "step": 4846 + }, + { + "epoch": 0.5322864045684165, + "grad_norm": 1.8707095384597778, + "learning_rate": 5e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6792139410972595, + "num_tokens": 125310730.0, + "step": 4847 + }, + { + "epoch": 0.5323962222710301, + "grad_norm": 1.7550194263458252, + "learning_rate": 5e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6920523643493652, + "num_tokens": 125338175.0, + "step": 4848 + }, + { + "epoch": 0.5325060399736438, + "grad_norm": 2.0878045558929443, + "learning_rate": 5e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7043196558952332, + "num_tokens": 125360709.0, + "step": 4849 + }, + { + "epoch": 0.5326158576762574, + "grad_norm": 1.936735987663269, + "learning_rate": 5e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6759578585624695, + "num_tokens": 125387932.0, + "step": 4850 + }, + { + "epoch": 0.532725675378871, + "grad_norm": 1.880365014076233, + "learning_rate": 5e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7209782600402832, + "num_tokens": 125413820.0, + "step": 4851 + }, + { + "epoch": 0.5328354930814847, + "grad_norm": 1.9214019775390625, + "learning_rate": 5e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6952386498451233, + "num_tokens": 125440436.0, + "step": 4852 + }, + { + "epoch": 0.5329453107840983, + "grad_norm": 1.8873838186264038, + "learning_rate": 5e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6913329362869263, + "num_tokens": 125467776.0, + "step": 4853 + }, + { + "epoch": 0.5330551284867121, + "grad_norm": 1.8060107231140137, + "learning_rate": 5e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7087361812591553, + "num_tokens": 125494824.0, + "step": 4854 + }, + { + "epoch": 0.5331649461893258, + "grad_norm": 1.7555317878723145, + "learning_rate": 5e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6959891319274902, + "num_tokens": 125522415.0, + "step": 4855 + }, + { + "epoch": 0.5332747638919394, + "grad_norm": 1.9516398906707764, + "learning_rate": 5e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7005929946899414, + "num_tokens": 125547777.0, + "step": 4856 + }, + { + "epoch": 0.533384581594553, + "grad_norm": 1.9142680168151855, + "learning_rate": 5e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7239241600036621, + "num_tokens": 125571066.0, + "step": 4857 + }, + { + "epoch": 0.5334943992971667, + "grad_norm": 2.041802167892456, + "learning_rate": 5e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7246288061141968, + "num_tokens": 125591277.0, + "step": 4858 + }, + { + "epoch": 0.5336042169997803, + "grad_norm": 1.715043306350708, + "learning_rate": 5e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7045643925666809, + "num_tokens": 125618777.0, + "step": 4859 + }, + { + "epoch": 0.533714034702394, + "grad_norm": 1.7220594882965088, + "learning_rate": 5e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7306250929832458, + "num_tokens": 125645561.0, + "step": 4860 + }, + { + "epoch": 0.5338238524050077, + "grad_norm": 1.7920571565628052, + "learning_rate": 5e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.700571596622467, + "num_tokens": 125672958.0, + "step": 4861 + }, + { + "epoch": 0.5339336701076214, + "grad_norm": 1.9079244136810303, + "learning_rate": 5e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7080460786819458, + "num_tokens": 125699790.0, + "step": 4862 + }, + { + "epoch": 0.534043487810235, + "grad_norm": 1.8181450366973877, + "learning_rate": 5e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.696999728679657, + "num_tokens": 125726448.0, + "step": 4863 + }, + { + "epoch": 0.5341533055128487, + "grad_norm": 1.7615365982055664, + "learning_rate": 5e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6829104423522949, + "num_tokens": 125757431.0, + "step": 4864 + }, + { + "epoch": 0.5342631232154623, + "grad_norm": 2.089902639389038, + "learning_rate": 5e-06, + "loss": 1.0453, + "mean_token_accuracy": 0.6824062466621399, + "num_tokens": 125782143.0, + "step": 4865 + }, + { + "epoch": 0.534372940918076, + "grad_norm": 1.967036485671997, + "learning_rate": 5e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7199648022651672, + "num_tokens": 125806483.0, + "step": 4866 + }, + { + "epoch": 0.5344827586206896, + "grad_norm": 1.9161694049835205, + "learning_rate": 5e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7211501598358154, + "num_tokens": 125830507.0, + "step": 4867 + }, + { + "epoch": 0.5345925763233034, + "grad_norm": 1.910771131515503, + "learning_rate": 5e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6828026175498962, + "num_tokens": 125857469.0, + "step": 4868 + }, + { + "epoch": 0.534702394025917, + "grad_norm": 1.9136135578155518, + "learning_rate": 5e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.711039662361145, + "num_tokens": 125881904.0, + "step": 4869 + }, + { + "epoch": 0.5348122117285307, + "grad_norm": 1.9032560586929321, + "learning_rate": 5e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.709835946559906, + "num_tokens": 125907168.0, + "step": 4870 + }, + { + "epoch": 0.5349220294311443, + "grad_norm": 1.9319100379943848, + "learning_rate": 5e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6840022802352905, + "num_tokens": 125931704.0, + "step": 4871 + }, + { + "epoch": 0.535031847133758, + "grad_norm": 1.750131368637085, + "learning_rate": 5e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7173984050750732, + "num_tokens": 125957556.0, + "step": 4872 + }, + { + "epoch": 0.5351416648363716, + "grad_norm": 2.0666675567626953, + "learning_rate": 5e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7064036130905151, + "num_tokens": 125979127.0, + "step": 4873 + }, + { + "epoch": 0.5352514825389852, + "grad_norm": 1.7242940664291382, + "learning_rate": 5e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.688522219657898, + "num_tokens": 126009521.0, + "step": 4874 + }, + { + "epoch": 0.5353613002415989, + "grad_norm": 2.140336751937866, + "learning_rate": 5e-06, + "loss": 0.91, + "mean_token_accuracy": 0.713417649269104, + "num_tokens": 126029072.0, + "step": 4875 + }, + { + "epoch": 0.5354711179442126, + "grad_norm": 1.8763647079467773, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7167053818702698, + "num_tokens": 126055638.0, + "step": 4876 + }, + { + "epoch": 0.5355809356468263, + "grad_norm": 2.16697096824646, + "learning_rate": 5e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7124775052070618, + "num_tokens": 126075783.0, + "step": 4877 + }, + { + "epoch": 0.5356907533494399, + "grad_norm": 1.7978023290634155, + "learning_rate": 5e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7089534997940063, + "num_tokens": 126105983.0, + "step": 4878 + }, + { + "epoch": 0.5358005710520536, + "grad_norm": 1.7603814601898193, + "learning_rate": 5e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7070146799087524, + "num_tokens": 126136412.0, + "step": 4879 + }, + { + "epoch": 0.5359103887546672, + "grad_norm": 2.113124132156372, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7322747707366943, + "num_tokens": 126155529.0, + "step": 4880 + }, + { + "epoch": 0.5360202064572809, + "grad_norm": 1.77439546585083, + "learning_rate": 5e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6826910972595215, + "num_tokens": 126183002.0, + "step": 4881 + }, + { + "epoch": 0.5361300241598945, + "grad_norm": 2.1653099060058594, + "learning_rate": 5e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.7353801727294922, + "num_tokens": 126204051.0, + "step": 4882 + }, + { + "epoch": 0.5362398418625083, + "grad_norm": 1.877640724182129, + "learning_rate": 5e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6905331611633301, + "num_tokens": 126230952.0, + "step": 4883 + }, + { + "epoch": 0.5363496595651219, + "grad_norm": 1.5633137226104736, + "learning_rate": 5e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6806970238685608, + "num_tokens": 126266190.0, + "step": 4884 + }, + { + "epoch": 0.5364594772677356, + "grad_norm": 1.818198800086975, + "learning_rate": 5e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7069023251533508, + "num_tokens": 126293150.0, + "step": 4885 + }, + { + "epoch": 0.5365692949703492, + "grad_norm": 1.634887456893921, + "learning_rate": 5e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6927480101585388, + "num_tokens": 126325216.0, + "step": 4886 + }, + { + "epoch": 0.5366791126729629, + "grad_norm": 1.7375844717025757, + "learning_rate": 5e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7028554677963257, + "num_tokens": 126352844.0, + "step": 4887 + }, + { + "epoch": 0.5367889303755765, + "grad_norm": 1.9604625701904297, + "learning_rate": 5e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7239195108413696, + "num_tokens": 126376780.0, + "step": 4888 + }, + { + "epoch": 0.5368987480781902, + "grad_norm": 1.8177369832992554, + "learning_rate": 5e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.69452303647995, + "num_tokens": 126405789.0, + "step": 4889 + }, + { + "epoch": 0.5370085657808039, + "grad_norm": 1.9981498718261719, + "learning_rate": 5e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7118890285491943, + "num_tokens": 126430879.0, + "step": 4890 + }, + { + "epoch": 0.5371183834834176, + "grad_norm": 1.98140287399292, + "learning_rate": 5e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7212943434715271, + "num_tokens": 126453244.0, + "step": 4891 + }, + { + "epoch": 0.5372282011860312, + "grad_norm": 2.1898088455200195, + "learning_rate": 5e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7184126973152161, + "num_tokens": 126472699.0, + "step": 4892 + }, + { + "epoch": 0.5373380188886449, + "grad_norm": 2.193143367767334, + "learning_rate": 5e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7301477193832397, + "num_tokens": 126491228.0, + "step": 4893 + }, + { + "epoch": 0.5374478365912585, + "grad_norm": 1.8229026794433594, + "learning_rate": 5e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7082041501998901, + "num_tokens": 126519452.0, + "step": 4894 + }, + { + "epoch": 0.5375576542938721, + "grad_norm": 1.8351210355758667, + "learning_rate": 5e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7151868343353271, + "num_tokens": 126544644.0, + "step": 4895 + }, + { + "epoch": 0.5376674719964858, + "grad_norm": 1.8292721509933472, + "learning_rate": 5e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6907411813735962, + "num_tokens": 126572012.0, + "step": 4896 + }, + { + "epoch": 0.5377772896990995, + "grad_norm": 1.9406192302703857, + "learning_rate": 5e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7070697546005249, + "num_tokens": 126595409.0, + "step": 4897 + }, + { + "epoch": 0.5378871074017132, + "grad_norm": 1.910698413848877, + "learning_rate": 5e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7048354148864746, + "num_tokens": 126622231.0, + "step": 4898 + }, + { + "epoch": 0.5379969251043268, + "grad_norm": 1.9418842792510986, + "learning_rate": 5e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7265529036521912, + "num_tokens": 126643543.0, + "step": 4899 + }, + { + "epoch": 0.5381067428069405, + "grad_norm": 1.74809730052948, + "learning_rate": 5e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.6768256425857544, + "num_tokens": 126676747.0, + "step": 4900 + }, + { + "epoch": 0.5382165605095541, + "grad_norm": 1.8776330947875977, + "learning_rate": 5e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6959207057952881, + "num_tokens": 126705412.0, + "step": 4901 + }, + { + "epoch": 0.5383263782121678, + "grad_norm": 1.972905158996582, + "learning_rate": 5e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7105377912521362, + "num_tokens": 126727657.0, + "step": 4902 + }, + { + "epoch": 0.5384361959147814, + "grad_norm": 1.906929612159729, + "learning_rate": 5e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.695979118347168, + "num_tokens": 126752847.0, + "step": 4903 + }, + { + "epoch": 0.5385460136173951, + "grad_norm": 1.9140640497207642, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7136483788490295, + "num_tokens": 126776883.0, + "step": 4904 + }, + { + "epoch": 0.5386558313200088, + "grad_norm": 2.020282745361328, + "learning_rate": 5e-06, + "loss": 0.974, + "mean_token_accuracy": 0.6983067989349365, + "num_tokens": 126797853.0, + "step": 4905 + }, + { + "epoch": 0.5387656490226225, + "grad_norm": 1.759810447692871, + "learning_rate": 5e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6890684366226196, + "num_tokens": 126825001.0, + "step": 4906 + }, + { + "epoch": 0.5388754667252361, + "grad_norm": 1.6924407482147217, + "learning_rate": 5e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7024847865104675, + "num_tokens": 126857454.0, + "step": 4907 + }, + { + "epoch": 0.5389852844278498, + "grad_norm": 2.112703323364258, + "learning_rate": 5e-06, + "loss": 1.02, + "mean_token_accuracy": 0.699467658996582, + "num_tokens": 126881098.0, + "step": 4908 + }, + { + "epoch": 0.5390951021304634, + "grad_norm": 1.930104374885559, + "learning_rate": 5e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6868181228637695, + "num_tokens": 126908385.0, + "step": 4909 + }, + { + "epoch": 0.5392049198330771, + "grad_norm": 1.7873505353927612, + "learning_rate": 5e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6932336091995239, + "num_tokens": 126936008.0, + "step": 4910 + }, + { + "epoch": 0.5393147375356907, + "grad_norm": 1.799492597579956, + "learning_rate": 5e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.712631106376648, + "num_tokens": 126964158.0, + "step": 4911 + }, + { + "epoch": 0.5394245552383045, + "grad_norm": 1.914141058921814, + "learning_rate": 5e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6875014901161194, + "num_tokens": 126990573.0, + "step": 4912 + }, + { + "epoch": 0.5395343729409181, + "grad_norm": 2.0027225017547607, + "learning_rate": 5e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6970910429954529, + "num_tokens": 127013470.0, + "step": 4913 + }, + { + "epoch": 0.5396441906435318, + "grad_norm": 1.607010841369629, + "learning_rate": 5e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6900508999824524, + "num_tokens": 127046773.0, + "step": 4914 + }, + { + "epoch": 0.5397540083461454, + "grad_norm": 2.04472017288208, + "learning_rate": 5e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7001564502716064, + "num_tokens": 127068247.0, + "step": 4915 + }, + { + "epoch": 0.539863826048759, + "grad_norm": 2.048978567123413, + "learning_rate": 5e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.731918454170227, + "num_tokens": 127089866.0, + "step": 4916 + }, + { + "epoch": 0.5399736437513727, + "grad_norm": 1.8490357398986816, + "learning_rate": 5e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7002726793289185, + "num_tokens": 127115579.0, + "step": 4917 + }, + { + "epoch": 0.5400834614539863, + "grad_norm": 1.5333818197250366, + "learning_rate": 5e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7107443809509277, + "num_tokens": 127150005.0, + "step": 4918 + }, + { + "epoch": 0.5401932791566001, + "grad_norm": 1.945481777191162, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7380468845367432, + "num_tokens": 127172768.0, + "step": 4919 + }, + { + "epoch": 0.5403030968592137, + "grad_norm": 1.7071750164031982, + "learning_rate": 5e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7044780850410461, + "num_tokens": 127203662.0, + "step": 4920 + }, + { + "epoch": 0.5404129145618274, + "grad_norm": 1.6700018644332886, + "learning_rate": 5e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7189985513687134, + "num_tokens": 127232166.0, + "step": 4921 + }, + { + "epoch": 0.540522732264441, + "grad_norm": 2.0457675457000732, + "learning_rate": 5e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7122389674186707, + "num_tokens": 127256619.0, + "step": 4922 + }, + { + "epoch": 0.5406325499670547, + "grad_norm": 1.980246663093567, + "learning_rate": 5e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7239457368850708, + "num_tokens": 127279653.0, + "step": 4923 + }, + { + "epoch": 0.5407423676696683, + "grad_norm": 2.0190117359161377, + "learning_rate": 5e-06, + "loss": 1.004, + "mean_token_accuracy": 0.700468897819519, + "num_tokens": 127304354.0, + "step": 4924 + }, + { + "epoch": 0.540852185372282, + "grad_norm": 2.114102840423584, + "learning_rate": 5e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7088091373443604, + "num_tokens": 127325293.0, + "step": 4925 + }, + { + "epoch": 0.5409620030748957, + "grad_norm": 1.9425073862075806, + "learning_rate": 5e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7052318453788757, + "num_tokens": 127350873.0, + "step": 4926 + }, + { + "epoch": 0.5410718207775094, + "grad_norm": 1.6081857681274414, + "learning_rate": 5e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6900073289871216, + "num_tokens": 127386937.0, + "step": 4927 + }, + { + "epoch": 0.541181638480123, + "grad_norm": 1.7211726903915405, + "learning_rate": 5e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6894661784172058, + "num_tokens": 127418976.0, + "step": 4928 + }, + { + "epoch": 0.5412914561827367, + "grad_norm": 1.8103916645050049, + "learning_rate": 5e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7175983190536499, + "num_tokens": 127445293.0, + "step": 4929 + }, + { + "epoch": 0.5414012738853503, + "grad_norm": 1.8167489767074585, + "learning_rate": 5e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7282017469406128, + "num_tokens": 127471051.0, + "step": 4930 + }, + { + "epoch": 0.541511091587964, + "grad_norm": 1.8338408470153809, + "learning_rate": 5e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7199303507804871, + "num_tokens": 127498190.0, + "step": 4931 + }, + { + "epoch": 0.5416209092905776, + "grad_norm": 2.078319549560547, + "learning_rate": 5e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6925151348114014, + "num_tokens": 127524691.0, + "step": 4932 + }, + { + "epoch": 0.5417307269931912, + "grad_norm": 1.7446078062057495, + "learning_rate": 5e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7317590713500977, + "num_tokens": 127552276.0, + "step": 4933 + }, + { + "epoch": 0.541840544695805, + "grad_norm": 1.954201579093933, + "learning_rate": 5e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6959007382392883, + "num_tokens": 127580135.0, + "step": 4934 + }, + { + "epoch": 0.5419503623984187, + "grad_norm": 1.6148111820220947, + "learning_rate": 5e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7099552154541016, + "num_tokens": 127612042.0, + "step": 4935 + }, + { + "epoch": 0.5420601801010323, + "grad_norm": 2.038388967514038, + "learning_rate": 5e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7220152616500854, + "num_tokens": 127633860.0, + "step": 4936 + }, + { + "epoch": 0.5421699978036459, + "grad_norm": 2.059800148010254, + "learning_rate": 5e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7234674096107483, + "num_tokens": 127657345.0, + "step": 4937 + }, + { + "epoch": 0.5422798155062596, + "grad_norm": 2.070279598236084, + "learning_rate": 5e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7153998613357544, + "num_tokens": 127679860.0, + "step": 4938 + }, + { + "epoch": 0.5423896332088732, + "grad_norm": 1.8705888986587524, + "learning_rate": 5e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7096996903419495, + "num_tokens": 127704724.0, + "step": 4939 + }, + { + "epoch": 0.5424994509114869, + "grad_norm": 1.8272455930709839, + "learning_rate": 5e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7221451997756958, + "num_tokens": 127731068.0, + "step": 4940 + }, + { + "epoch": 0.5426092686141006, + "grad_norm": 2.1001710891723633, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7215859293937683, + "num_tokens": 127751743.0, + "step": 4941 + }, + { + "epoch": 0.5427190863167143, + "grad_norm": 1.830138921737671, + "learning_rate": 5e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.700705885887146, + "num_tokens": 127776282.0, + "step": 4942 + }, + { + "epoch": 0.5428289040193279, + "grad_norm": 1.9010151624679565, + "learning_rate": 5e-06, + "loss": 1.0839, + "mean_token_accuracy": 0.6788409352302551, + "num_tokens": 127805644.0, + "step": 4943 + }, + { + "epoch": 0.5429387217219416, + "grad_norm": 2.0900490283966064, + "learning_rate": 5e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7033537030220032, + "num_tokens": 127827614.0, + "step": 4944 + }, + { + "epoch": 0.5430485394245552, + "grad_norm": 1.6832529306411743, + "learning_rate": 5e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6947559118270874, + "num_tokens": 127861735.0, + "step": 4945 + }, + { + "epoch": 0.5431583571271689, + "grad_norm": 1.8635916709899902, + "learning_rate": 5e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7394397854804993, + "num_tokens": 127885223.0, + "step": 4946 + }, + { + "epoch": 0.5432681748297825, + "grad_norm": 1.8316376209259033, + "learning_rate": 5e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6984054446220398, + "num_tokens": 127912471.0, + "step": 4947 + }, + { + "epoch": 0.5433779925323963, + "grad_norm": 1.6183369159698486, + "learning_rate": 5e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.7002192735671997, + "num_tokens": 127945296.0, + "step": 4948 + }, + { + "epoch": 0.5434878102350099, + "grad_norm": 1.7879338264465332, + "learning_rate": 5e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6903773546218872, + "num_tokens": 127971662.0, + "step": 4949 + }, + { + "epoch": 0.5435976279376236, + "grad_norm": 1.7186578512191772, + "learning_rate": 5e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6999108195304871, + "num_tokens": 127999248.0, + "step": 4950 + }, + { + "epoch": 0.5437074456402372, + "grad_norm": 1.8652538061141968, + "learning_rate": 5e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6925400495529175, + "num_tokens": 128025068.0, + "step": 4951 + }, + { + "epoch": 0.5438172633428509, + "grad_norm": 1.7968723773956299, + "learning_rate": 5e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7072582244873047, + "num_tokens": 128052986.0, + "step": 4952 + }, + { + "epoch": 0.5439270810454645, + "grad_norm": 2.008790969848633, + "learning_rate": 5e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.700888991355896, + "num_tokens": 128078149.0, + "step": 4953 + }, + { + "epoch": 0.5440368987480781, + "grad_norm": 1.8581093549728394, + "learning_rate": 5e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6949537992477417, + "num_tokens": 128107586.0, + "step": 4954 + }, + { + "epoch": 0.5441467164506919, + "grad_norm": 1.7416183948516846, + "learning_rate": 5e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6918231248855591, + "num_tokens": 128136541.0, + "step": 4955 + }, + { + "epoch": 0.5442565341533055, + "grad_norm": 1.7656785249710083, + "learning_rate": 5e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7069747447967529, + "num_tokens": 128162947.0, + "step": 4956 + }, + { + "epoch": 0.5443663518559192, + "grad_norm": 1.9659861326217651, + "learning_rate": 5e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7083629369735718, + "num_tokens": 128186608.0, + "step": 4957 + }, + { + "epoch": 0.5444761695585328, + "grad_norm": 2.293887138366699, + "learning_rate": 5e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7220273017883301, + "num_tokens": 128206265.0, + "step": 4958 + }, + { + "epoch": 0.5445859872611465, + "grad_norm": 1.6885548830032349, + "learning_rate": 5e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7214721441268921, + "num_tokens": 128235902.0, + "step": 4959 + }, + { + "epoch": 0.5446958049637601, + "grad_norm": 2.037415027618408, + "learning_rate": 5e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.6994912028312683, + "num_tokens": 128258313.0, + "step": 4960 + }, + { + "epoch": 0.5448056226663738, + "grad_norm": 1.9264206886291504, + "learning_rate": 5e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.698269784450531, + "num_tokens": 128283800.0, + "step": 4961 + }, + { + "epoch": 0.5449154403689875, + "grad_norm": 2.0009002685546875, + "learning_rate": 5e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6973164081573486, + "num_tokens": 128310364.0, + "step": 4962 + }, + { + "epoch": 0.5450252580716012, + "grad_norm": 1.876097321510315, + "learning_rate": 5e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.700545608997345, + "num_tokens": 128336491.0, + "step": 4963 + }, + { + "epoch": 0.5451350757742148, + "grad_norm": 1.8753803968429565, + "learning_rate": 5e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6984810829162598, + "num_tokens": 128364142.0, + "step": 4964 + }, + { + "epoch": 0.5452448934768285, + "grad_norm": 1.7741116285324097, + "learning_rate": 5e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7036576271057129, + "num_tokens": 128389424.0, + "step": 4965 + }, + { + "epoch": 0.5453547111794421, + "grad_norm": 1.6337082386016846, + "learning_rate": 5e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7115710377693176, + "num_tokens": 128419296.0, + "step": 4966 + }, + { + "epoch": 0.5454645288820558, + "grad_norm": 1.701675295829773, + "learning_rate": 5e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6919928789138794, + "num_tokens": 128450710.0, + "step": 4967 + }, + { + "epoch": 0.5455743465846694, + "grad_norm": 1.6650453805923462, + "learning_rate": 5e-06, + "loss": 1.0762, + "mean_token_accuracy": 0.6739500761032104, + "num_tokens": 128482416.0, + "step": 4968 + }, + { + "epoch": 0.5456841642872831, + "grad_norm": 1.9487987756729126, + "learning_rate": 5e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.708430290222168, + "num_tokens": 128507682.0, + "step": 4969 + }, + { + "epoch": 0.5457939819898968, + "grad_norm": 1.7537429332733154, + "learning_rate": 5e-06, + "loss": 1.0469, + "mean_token_accuracy": 0.6850193738937378, + "num_tokens": 128537088.0, + "step": 4970 + }, + { + "epoch": 0.5459037996925105, + "grad_norm": 1.7334322929382324, + "learning_rate": 5e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6990406513214111, + "num_tokens": 128567532.0, + "step": 4971 + }, + { + "epoch": 0.5460136173951241, + "grad_norm": 1.7830735445022583, + "learning_rate": 5e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.700440526008606, + "num_tokens": 128594558.0, + "step": 4972 + }, + { + "epoch": 0.5461234350977378, + "grad_norm": 1.8858590126037598, + "learning_rate": 5e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6993575096130371, + "num_tokens": 128621599.0, + "step": 4973 + }, + { + "epoch": 0.5462332528003514, + "grad_norm": 1.8969049453735352, + "learning_rate": 5e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7124942541122437, + "num_tokens": 128646644.0, + "step": 4974 + }, + { + "epoch": 0.546343070502965, + "grad_norm": 1.828321933746338, + "learning_rate": 5e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7011233568191528, + "num_tokens": 128673725.0, + "step": 4975 + }, + { + "epoch": 0.5464528882055787, + "grad_norm": 1.861812710762024, + "learning_rate": 5e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6989411115646362, + "num_tokens": 128702682.0, + "step": 4976 + }, + { + "epoch": 0.5465627059081924, + "grad_norm": 2.1214706897735596, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7192217111587524, + "num_tokens": 128720901.0, + "step": 4977 + }, + { + "epoch": 0.5466725236108061, + "grad_norm": 1.9872159957885742, + "learning_rate": 5e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7173596620559692, + "num_tokens": 128742475.0, + "step": 4978 + }, + { + "epoch": 0.5467823413134197, + "grad_norm": 2.0248894691467285, + "learning_rate": 5e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7121016979217529, + "num_tokens": 128764477.0, + "step": 4979 + }, + { + "epoch": 0.5468921590160334, + "grad_norm": 1.9603490829467773, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7351394295692444, + "num_tokens": 128787557.0, + "step": 4980 + }, + { + "epoch": 0.547001976718647, + "grad_norm": 1.7239576578140259, + "learning_rate": 5e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.690704345703125, + "num_tokens": 128817953.0, + "step": 4981 + }, + { + "epoch": 0.5471117944212607, + "grad_norm": 1.8051799535751343, + "learning_rate": 5e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.699591338634491, + "num_tokens": 128844850.0, + "step": 4982 + }, + { + "epoch": 0.5472216121238743, + "grad_norm": 2.1428921222686768, + "learning_rate": 5e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7031655311584473, + "num_tokens": 128866176.0, + "step": 4983 + }, + { + "epoch": 0.5473314298264881, + "grad_norm": 1.9076346158981323, + "learning_rate": 5e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7005919814109802, + "num_tokens": 128892571.0, + "step": 4984 + }, + { + "epoch": 0.5474412475291017, + "grad_norm": 1.838051199913025, + "learning_rate": 5e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.697786808013916, + "num_tokens": 128919751.0, + "step": 4985 + }, + { + "epoch": 0.5475510652317154, + "grad_norm": 1.9331891536712646, + "learning_rate": 5e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7012046575546265, + "num_tokens": 128945667.0, + "step": 4986 + }, + { + "epoch": 0.547660882934329, + "grad_norm": 2.2484655380249023, + "learning_rate": 5e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7073602676391602, + "num_tokens": 128963966.0, + "step": 4987 + }, + { + "epoch": 0.5477707006369427, + "grad_norm": 1.7477178573608398, + "learning_rate": 5e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6931564211845398, + "num_tokens": 128992518.0, + "step": 4988 + }, + { + "epoch": 0.5478805183395563, + "grad_norm": 1.9007110595703125, + "learning_rate": 5e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7030461430549622, + "num_tokens": 129016256.0, + "step": 4989 + }, + { + "epoch": 0.54799033604217, + "grad_norm": 2.140096664428711, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7304937839508057, + "num_tokens": 129037965.0, + "step": 4990 + }, + { + "epoch": 0.5481001537447837, + "grad_norm": 1.8928956985473633, + "learning_rate": 5e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6925094127655029, + "num_tokens": 129066665.0, + "step": 4991 + }, + { + "epoch": 0.5482099714473974, + "grad_norm": 1.8369137048721313, + "learning_rate": 5e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6873210668563843, + "num_tokens": 129094449.0, + "step": 4992 + }, + { + "epoch": 0.548319789150011, + "grad_norm": 1.7902196645736694, + "learning_rate": 5e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7186518311500549, + "num_tokens": 129122765.0, + "step": 4993 + }, + { + "epoch": 0.5484296068526247, + "grad_norm": 2.2193527221679688, + "learning_rate": 5e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7160322666168213, + "num_tokens": 129141017.0, + "step": 4994 + }, + { + "epoch": 0.5485394245552383, + "grad_norm": 1.9836112260818481, + "learning_rate": 5e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7189418077468872, + "num_tokens": 129164666.0, + "step": 4995 + }, + { + "epoch": 0.5486492422578519, + "grad_norm": 2.100510358810425, + "learning_rate": 5e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7164153456687927, + "num_tokens": 129188016.0, + "step": 4996 + }, + { + "epoch": 0.5487590599604656, + "grad_norm": 1.806538462638855, + "learning_rate": 5e-06, + "loss": 1.0784, + "mean_token_accuracy": 0.6835336685180664, + "num_tokens": 129216112.0, + "step": 4997 + }, + { + "epoch": 0.5488688776630792, + "grad_norm": 2.326531171798706, + "learning_rate": 5e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7263885140419006, + "num_tokens": 129232509.0, + "step": 4998 + }, + { + "epoch": 0.548978695365693, + "grad_norm": 1.8401846885681152, + "learning_rate": 5e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6864953637123108, + "num_tokens": 129260866.0, + "step": 4999 + }, + { + "epoch": 0.5490885130683066, + "grad_norm": 1.9601655006408691, + "learning_rate": 5e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6955611109733582, + "num_tokens": 129289305.0, + "step": 5000 + }, + { + "epoch": 0.5491983307709203, + "grad_norm": 1.8120681047439575, + "learning_rate": 5e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.683657705783844, + "num_tokens": 129316501.0, + "step": 5001 + }, + { + "epoch": 0.5493081484735339, + "grad_norm": 2.1388394832611084, + "learning_rate": 5e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7118407487869263, + "num_tokens": 129336464.0, + "step": 5002 + }, + { + "epoch": 0.5494179661761476, + "grad_norm": 1.8666834831237793, + "learning_rate": 5e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6866101026535034, + "num_tokens": 129362020.0, + "step": 5003 + }, + { + "epoch": 0.5495277838787612, + "grad_norm": 1.6513639688491821, + "learning_rate": 5e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7014435529708862, + "num_tokens": 129391729.0, + "step": 5004 + }, + { + "epoch": 0.5496376015813749, + "grad_norm": 1.8421602249145508, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7306334972381592, + "num_tokens": 129416689.0, + "step": 5005 + }, + { + "epoch": 0.5497474192839886, + "grad_norm": 2.100048542022705, + "learning_rate": 5e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7240270376205444, + "num_tokens": 129438789.0, + "step": 5006 + }, + { + "epoch": 0.5498572369866023, + "grad_norm": 1.7452507019042969, + "learning_rate": 5e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6914485692977905, + "num_tokens": 129469580.0, + "step": 5007 + }, + { + "epoch": 0.5499670546892159, + "grad_norm": 1.9315346479415894, + "learning_rate": 5e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7289657592773438, + "num_tokens": 129492795.0, + "step": 5008 + }, + { + "epoch": 0.5500768723918296, + "grad_norm": 1.8371845483779907, + "learning_rate": 5e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6960091590881348, + "num_tokens": 129519448.0, + "step": 5009 + }, + { + "epoch": 0.5501866900944432, + "grad_norm": 2.0078024864196777, + "learning_rate": 5e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7139288187026978, + "num_tokens": 129542798.0, + "step": 5010 + }, + { + "epoch": 0.5502965077970569, + "grad_norm": 1.988776683807373, + "learning_rate": 5e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7091059684753418, + "num_tokens": 129570180.0, + "step": 5011 + }, + { + "epoch": 0.5504063254996705, + "grad_norm": 2.1286559104919434, + "learning_rate": 5e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.6981257200241089, + "num_tokens": 129592482.0, + "step": 5012 + }, + { + "epoch": 0.5505161432022843, + "grad_norm": 1.648748755455017, + "learning_rate": 5e-06, + "loss": 1.1219, + "mean_token_accuracy": 0.6688676476478577, + "num_tokens": 129625048.0, + "step": 5013 + }, + { + "epoch": 0.5506259609048979, + "grad_norm": 1.792859673500061, + "learning_rate": 5e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7256386280059814, + "num_tokens": 129652403.0, + "step": 5014 + }, + { + "epoch": 0.5507357786075116, + "grad_norm": 1.8740403652191162, + "learning_rate": 5e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.708220899105072, + "num_tokens": 129680859.0, + "step": 5015 + }, + { + "epoch": 0.5508455963101252, + "grad_norm": 2.1001503467559814, + "learning_rate": 5e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7117984294891357, + "num_tokens": 129702632.0, + "step": 5016 + }, + { + "epoch": 0.5509554140127388, + "grad_norm": 1.8788470029830933, + "learning_rate": 5e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7157935500144958, + "num_tokens": 129728672.0, + "step": 5017 + }, + { + "epoch": 0.5510652317153525, + "grad_norm": 1.8050624132156372, + "learning_rate": 5e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.6980260014533997, + "num_tokens": 129753851.0, + "step": 5018 + }, + { + "epoch": 0.5511750494179661, + "grad_norm": 1.6707885265350342, + "learning_rate": 5e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7255915999412537, + "num_tokens": 129784027.0, + "step": 5019 + }, + { + "epoch": 0.5512848671205799, + "grad_norm": 1.6377613544464111, + "learning_rate": 5e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7195196151733398, + "num_tokens": 129815948.0, + "step": 5020 + }, + { + "epoch": 0.5513946848231935, + "grad_norm": 2.018082857131958, + "learning_rate": 5e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7100124359130859, + "num_tokens": 129838311.0, + "step": 5021 + }, + { + "epoch": 0.5515045025258072, + "grad_norm": 2.2543232440948486, + "learning_rate": 5e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7156429886817932, + "num_tokens": 129857554.0, + "step": 5022 + }, + { + "epoch": 0.5516143202284208, + "grad_norm": 1.7624478340148926, + "learning_rate": 5e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6957689523696899, + "num_tokens": 129888037.0, + "step": 5023 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 1.609731912612915, + "learning_rate": 5e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6830452680587769, + "num_tokens": 129923670.0, + "step": 5024 + }, + { + "epoch": 0.5518339556336481, + "grad_norm": 1.8762009143829346, + "learning_rate": 5e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7137378454208374, + "num_tokens": 129949232.0, + "step": 5025 + }, + { + "epoch": 0.5519437733362618, + "grad_norm": 1.8775032758712769, + "learning_rate": 5e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7276965975761414, + "num_tokens": 129973301.0, + "step": 5026 + }, + { + "epoch": 0.5520535910388754, + "grad_norm": 1.8048901557922363, + "learning_rate": 5e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6900308728218079, + "num_tokens": 130000616.0, + "step": 5027 + }, + { + "epoch": 0.5521634087414892, + "grad_norm": 1.8957509994506836, + "learning_rate": 5e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.6969481110572815, + "num_tokens": 130024343.0, + "step": 5028 + }, + { + "epoch": 0.5522732264441028, + "grad_norm": 2.0836617946624756, + "learning_rate": 5e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7074776887893677, + "num_tokens": 130044911.0, + "step": 5029 + }, + { + "epoch": 0.5523830441467165, + "grad_norm": 1.5711240768432617, + "learning_rate": 5e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7135772109031677, + "num_tokens": 130077884.0, + "step": 5030 + }, + { + "epoch": 0.5524928618493301, + "grad_norm": 1.8619879484176636, + "learning_rate": 5e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7041813731193542, + "num_tokens": 130102232.0, + "step": 5031 + }, + { + "epoch": 0.5526026795519438, + "grad_norm": 1.9203476905822754, + "learning_rate": 5e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7054550647735596, + "num_tokens": 130127346.0, + "step": 5032 + }, + { + "epoch": 0.5527124972545574, + "grad_norm": 2.205045700073242, + "learning_rate": 5e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7291911244392395, + "num_tokens": 130147200.0, + "step": 5033 + }, + { + "epoch": 0.552822314957171, + "grad_norm": 1.7605061531066895, + "learning_rate": 5e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7133135795593262, + "num_tokens": 130176449.0, + "step": 5034 + }, + { + "epoch": 0.5529321326597848, + "grad_norm": 1.785261869430542, + "learning_rate": 5e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7047131061553955, + "num_tokens": 130204553.0, + "step": 5035 + }, + { + "epoch": 0.5530419503623984, + "grad_norm": 1.8816871643066406, + "learning_rate": 5e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6887056231498718, + "num_tokens": 130230995.0, + "step": 5036 + }, + { + "epoch": 0.5531517680650121, + "grad_norm": 1.958524227142334, + "learning_rate": 5e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7172309160232544, + "num_tokens": 130254297.0, + "step": 5037 + }, + { + "epoch": 0.5532615857676257, + "grad_norm": 1.7930277585983276, + "learning_rate": 5e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6930786371231079, + "num_tokens": 130284107.0, + "step": 5038 + }, + { + "epoch": 0.5533714034702394, + "grad_norm": 1.8222832679748535, + "learning_rate": 5e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7271836400032043, + "num_tokens": 130308919.0, + "step": 5039 + }, + { + "epoch": 0.553481221172853, + "grad_norm": 2.0296406745910645, + "learning_rate": 5e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7120453119277954, + "num_tokens": 130333012.0, + "step": 5040 + }, + { + "epoch": 0.5535910388754667, + "grad_norm": 1.9066112041473389, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7218755483627319, + "num_tokens": 130356513.0, + "step": 5041 + }, + { + "epoch": 0.5537008565780804, + "grad_norm": 1.8018230199813843, + "learning_rate": 5e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7216672897338867, + "num_tokens": 130384095.0, + "step": 5042 + }, + { + "epoch": 0.5538106742806941, + "grad_norm": 1.7483723163604736, + "learning_rate": 5e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.6790520548820496, + "num_tokens": 130411663.0, + "step": 5043 + }, + { + "epoch": 0.5539204919833077, + "grad_norm": 1.7440484762191772, + "learning_rate": 5e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7048047780990601, + "num_tokens": 130438819.0, + "step": 5044 + }, + { + "epoch": 0.5540303096859214, + "grad_norm": 1.8018262386322021, + "learning_rate": 5e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.694082498550415, + "num_tokens": 130465541.0, + "step": 5045 + }, + { + "epoch": 0.554140127388535, + "grad_norm": 2.0123465061187744, + "learning_rate": 5e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7131589651107788, + "num_tokens": 130486151.0, + "step": 5046 + }, + { + "epoch": 0.5542499450911487, + "grad_norm": 1.8751742839813232, + "learning_rate": 5e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7131954431533813, + "num_tokens": 130509835.0, + "step": 5047 + }, + { + "epoch": 0.5543597627937623, + "grad_norm": 1.6943278312683105, + "learning_rate": 5e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7030088901519775, + "num_tokens": 130538089.0, + "step": 5048 + }, + { + "epoch": 0.5544695804963761, + "grad_norm": 1.8285958766937256, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7259455919265747, + "num_tokens": 130565647.0, + "step": 5049 + }, + { + "epoch": 0.5545793981989897, + "grad_norm": 1.9597671031951904, + "learning_rate": 5e-06, + "loss": 0.919, + "mean_token_accuracy": 0.717900276184082, + "num_tokens": 130587946.0, + "step": 5050 + }, + { + "epoch": 0.5546892159016034, + "grad_norm": 2.0869052410125732, + "learning_rate": 5e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6925429105758667, + "num_tokens": 130610724.0, + "step": 5051 + }, + { + "epoch": 0.554799033604217, + "grad_norm": 1.849186897277832, + "learning_rate": 5e-06, + "loss": 0.923, + "mean_token_accuracy": 0.710294246673584, + "num_tokens": 130637873.0, + "step": 5052 + }, + { + "epoch": 0.5549088513068307, + "grad_norm": 1.8034355640411377, + "learning_rate": 5e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6941879987716675, + "num_tokens": 130663990.0, + "step": 5053 + }, + { + "epoch": 0.5550186690094443, + "grad_norm": 1.7144737243652344, + "learning_rate": 5e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.704076886177063, + "num_tokens": 130695936.0, + "step": 5054 + }, + { + "epoch": 0.555128486712058, + "grad_norm": 2.042935371398926, + "learning_rate": 5e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7035912871360779, + "num_tokens": 130719592.0, + "step": 5055 + }, + { + "epoch": 0.5552383044146716, + "grad_norm": 1.6820584535598755, + "learning_rate": 5e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7146264314651489, + "num_tokens": 130749305.0, + "step": 5056 + }, + { + "epoch": 0.5553481221172853, + "grad_norm": 2.1577999591827393, + "learning_rate": 5e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7139087915420532, + "num_tokens": 130768544.0, + "step": 5057 + }, + { + "epoch": 0.555457939819899, + "grad_norm": 1.8253135681152344, + "learning_rate": 5e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7073579430580139, + "num_tokens": 130792309.0, + "step": 5058 + }, + { + "epoch": 0.5555677575225126, + "grad_norm": 1.882660150527954, + "learning_rate": 5e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6945269107818604, + "num_tokens": 130819402.0, + "step": 5059 + }, + { + "epoch": 0.5556775752251263, + "grad_norm": 2.030809164047241, + "learning_rate": 5e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6965434551239014, + "num_tokens": 130845341.0, + "step": 5060 + }, + { + "epoch": 0.5557873929277399, + "grad_norm": 2.036221504211426, + "learning_rate": 5e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6957173347473145, + "num_tokens": 130868465.0, + "step": 5061 + }, + { + "epoch": 0.5558972106303536, + "grad_norm": 2.0517208576202393, + "learning_rate": 5e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.698948860168457, + "num_tokens": 130892083.0, + "step": 5062 + }, + { + "epoch": 0.5560070283329672, + "grad_norm": 1.9199445247650146, + "learning_rate": 5e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6988500356674194, + "num_tokens": 130920381.0, + "step": 5063 + }, + { + "epoch": 0.556116846035581, + "grad_norm": 1.8775501251220703, + "learning_rate": 5e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6884446144104004, + "num_tokens": 130945744.0, + "step": 5064 + }, + { + "epoch": 0.5562266637381946, + "grad_norm": 1.9824180603027344, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7296814918518066, + "num_tokens": 130968480.0, + "step": 5065 + }, + { + "epoch": 0.5563364814408083, + "grad_norm": 1.8018081188201904, + "learning_rate": 5e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6948288679122925, + "num_tokens": 130996802.0, + "step": 5066 + }, + { + "epoch": 0.5564462991434219, + "grad_norm": 2.0117082595825195, + "learning_rate": 5e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7043142318725586, + "num_tokens": 131021382.0, + "step": 5067 + }, + { + "epoch": 0.5565561168460356, + "grad_norm": 1.857256293296814, + "learning_rate": 5e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7063252329826355, + "num_tokens": 131049908.0, + "step": 5068 + }, + { + "epoch": 0.5566659345486492, + "grad_norm": 1.5776491165161133, + "learning_rate": 5e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6827510595321655, + "num_tokens": 131087575.0, + "step": 5069 + }, + { + "epoch": 0.5567757522512629, + "grad_norm": 1.8797527551651, + "learning_rate": 5e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7088992595672607, + "num_tokens": 131111860.0, + "step": 5070 + }, + { + "epoch": 0.5568855699538766, + "grad_norm": 1.9243583679199219, + "learning_rate": 5e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6743467450141907, + "num_tokens": 131137325.0, + "step": 5071 + }, + { + "epoch": 0.5569953876564903, + "grad_norm": 1.7007901668548584, + "learning_rate": 5e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6949036121368408, + "num_tokens": 131169421.0, + "step": 5072 + }, + { + "epoch": 0.5571052053591039, + "grad_norm": 2.1331143379211426, + "learning_rate": 5e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7344473600387573, + "num_tokens": 131188202.0, + "step": 5073 + }, + { + "epoch": 0.5572150230617176, + "grad_norm": 1.8034223318099976, + "learning_rate": 5e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7147896885871887, + "num_tokens": 131216741.0, + "step": 5074 + }, + { + "epoch": 0.5573248407643312, + "grad_norm": 2.0326128005981445, + "learning_rate": 5e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7195841073989868, + "num_tokens": 131237829.0, + "step": 5075 + }, + { + "epoch": 0.5574346584669448, + "grad_norm": 1.94648277759552, + "learning_rate": 5e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.707320511341095, + "num_tokens": 131264602.0, + "step": 5076 + }, + { + "epoch": 0.5575444761695585, + "grad_norm": 1.8407150506973267, + "learning_rate": 5e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7078113555908203, + "num_tokens": 131293703.0, + "step": 5077 + }, + { + "epoch": 0.5576542938721722, + "grad_norm": 1.933066487312317, + "learning_rate": 5e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7254768013954163, + "num_tokens": 131318633.0, + "step": 5078 + }, + { + "epoch": 0.5577641115747859, + "grad_norm": 1.8743736743927002, + "learning_rate": 5e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7013477087020874, + "num_tokens": 131343918.0, + "step": 5079 + }, + { + "epoch": 0.5578739292773995, + "grad_norm": 1.7158021926879883, + "learning_rate": 5e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6959941387176514, + "num_tokens": 131373572.0, + "step": 5080 + }, + { + "epoch": 0.5579837469800132, + "grad_norm": 1.8898956775665283, + "learning_rate": 5e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7320593595504761, + "num_tokens": 131397057.0, + "step": 5081 + }, + { + "epoch": 0.5580935646826268, + "grad_norm": 1.9101344347000122, + "learning_rate": 5e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7249569892883301, + "num_tokens": 131421288.0, + "step": 5082 + }, + { + "epoch": 0.5582033823852405, + "grad_norm": 1.9248074293136597, + "learning_rate": 5e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6964359879493713, + "num_tokens": 131447942.0, + "step": 5083 + }, + { + "epoch": 0.5583132000878541, + "grad_norm": 1.9301576614379883, + "learning_rate": 5e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7145610451698303, + "num_tokens": 131470458.0, + "step": 5084 + }, + { + "epoch": 0.5584230177904678, + "grad_norm": 1.6805386543273926, + "learning_rate": 5e-06, + "loss": 1.004, + "mean_token_accuracy": 0.696485698223114, + "num_tokens": 131500629.0, + "step": 5085 + }, + { + "epoch": 0.5585328354930815, + "grad_norm": 2.069545269012451, + "learning_rate": 5e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7136433124542236, + "num_tokens": 131520989.0, + "step": 5086 + }, + { + "epoch": 0.5586426531956952, + "grad_norm": 1.9641724824905396, + "learning_rate": 5e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7029085159301758, + "num_tokens": 131545578.0, + "step": 5087 + }, + { + "epoch": 0.5587524708983088, + "grad_norm": 1.750647783279419, + "learning_rate": 5e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7017982006072998, + "num_tokens": 131574788.0, + "step": 5088 + }, + { + "epoch": 0.5588622886009225, + "grad_norm": 1.8960243463516235, + "learning_rate": 5e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7225192785263062, + "num_tokens": 131597690.0, + "step": 5089 + }, + { + "epoch": 0.5589721063035361, + "grad_norm": 1.8198981285095215, + "learning_rate": 5e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6900726556777954, + "num_tokens": 131625252.0, + "step": 5090 + }, + { + "epoch": 0.5590819240061498, + "grad_norm": 1.7685637474060059, + "learning_rate": 5e-06, + "loss": 1.0806, + "mean_token_accuracy": 0.6845592260360718, + "num_tokens": 131653425.0, + "step": 5091 + }, + { + "epoch": 0.5591917417087634, + "grad_norm": 2.1390581130981445, + "learning_rate": 5e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7266025543212891, + "num_tokens": 131673810.0, + "step": 5092 + }, + { + "epoch": 0.5593015594113772, + "grad_norm": 1.9316946268081665, + "learning_rate": 5e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6944717168807983, + "num_tokens": 131698820.0, + "step": 5093 + }, + { + "epoch": 0.5594113771139908, + "grad_norm": 1.6850134134292603, + "learning_rate": 5e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7191799879074097, + "num_tokens": 131730203.0, + "step": 5094 + }, + { + "epoch": 0.5595211948166045, + "grad_norm": 1.7499926090240479, + "learning_rate": 5e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6910703778266907, + "num_tokens": 131760240.0, + "step": 5095 + }, + { + "epoch": 0.5596310125192181, + "grad_norm": 1.9063360691070557, + "learning_rate": 5e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7129732370376587, + "num_tokens": 131783924.0, + "step": 5096 + }, + { + "epoch": 0.5597408302218317, + "grad_norm": 2.0579795837402344, + "learning_rate": 5e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7032197713851929, + "num_tokens": 131807400.0, + "step": 5097 + }, + { + "epoch": 0.5598506479244454, + "grad_norm": 2.0830299854278564, + "learning_rate": 5e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.723581850528717, + "num_tokens": 131826918.0, + "step": 5098 + }, + { + "epoch": 0.559960465627059, + "grad_norm": 1.7194585800170898, + "learning_rate": 5e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7100229859352112, + "num_tokens": 131857030.0, + "step": 5099 + }, + { + "epoch": 0.5600702833296728, + "grad_norm": 1.793657898902893, + "learning_rate": 5e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6965968608856201, + "num_tokens": 131886307.0, + "step": 5100 + }, + { + "epoch": 0.5601801010322864, + "grad_norm": 1.8133312463760376, + "learning_rate": 5e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7042557001113892, + "num_tokens": 131911572.0, + "step": 5101 + }, + { + "epoch": 0.5602899187349001, + "grad_norm": 1.8716181516647339, + "learning_rate": 5e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.712141752243042, + "num_tokens": 131937485.0, + "step": 5102 + }, + { + "epoch": 0.5603997364375137, + "grad_norm": 1.995059609413147, + "learning_rate": 5e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.704043984413147, + "num_tokens": 131959469.0, + "step": 5103 + }, + { + "epoch": 0.5605095541401274, + "grad_norm": 1.8998587131500244, + "learning_rate": 5e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6994551420211792, + "num_tokens": 131984459.0, + "step": 5104 + }, + { + "epoch": 0.560619371842741, + "grad_norm": 2.0191941261291504, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7136538028717041, + "num_tokens": 132006619.0, + "step": 5105 + }, + { + "epoch": 0.5607291895453547, + "grad_norm": 1.9295616149902344, + "learning_rate": 5e-06, + "loss": 1.0762, + "mean_token_accuracy": 0.6795216798782349, + "num_tokens": 132033496.0, + "step": 5106 + }, + { + "epoch": 0.5608390072479684, + "grad_norm": 1.7542661428451538, + "learning_rate": 5e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7100303173065186, + "num_tokens": 132060833.0, + "step": 5107 + }, + { + "epoch": 0.5609488249505821, + "grad_norm": 1.9217849969863892, + "learning_rate": 5e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6997907757759094, + "num_tokens": 132084741.0, + "step": 5108 + }, + { + "epoch": 0.5610586426531957, + "grad_norm": 1.7488234043121338, + "learning_rate": 5e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.721139669418335, + "num_tokens": 132111452.0, + "step": 5109 + }, + { + "epoch": 0.5611684603558094, + "grad_norm": 1.9443668127059937, + "learning_rate": 5e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7025439739227295, + "num_tokens": 132136160.0, + "step": 5110 + }, + { + "epoch": 0.561278278058423, + "grad_norm": 1.6878175735473633, + "learning_rate": 5e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7012240290641785, + "num_tokens": 132165455.0, + "step": 5111 + }, + { + "epoch": 0.5613880957610367, + "grad_norm": 1.7735995054244995, + "learning_rate": 5e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7070633172988892, + "num_tokens": 132192533.0, + "step": 5112 + }, + { + "epoch": 0.5614979134636503, + "grad_norm": 1.8911223411560059, + "learning_rate": 5e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6894371509552002, + "num_tokens": 132217283.0, + "step": 5113 + }, + { + "epoch": 0.5616077311662641, + "grad_norm": 1.7210056781768799, + "learning_rate": 5e-06, + "loss": 1.1152, + "mean_token_accuracy": 0.6667342185974121, + "num_tokens": 132248806.0, + "step": 5114 + }, + { + "epoch": 0.5617175488688777, + "grad_norm": 1.8112293481826782, + "learning_rate": 5e-06, + "loss": 1.0848, + "mean_token_accuracy": 0.6733793020248413, + "num_tokens": 132278343.0, + "step": 5115 + }, + { + "epoch": 0.5618273665714913, + "grad_norm": 1.6663943529129028, + "learning_rate": 5e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6801395416259766, + "num_tokens": 132311537.0, + "step": 5116 + }, + { + "epoch": 0.561937184274105, + "grad_norm": 1.8423962593078613, + "learning_rate": 5e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7094823718070984, + "num_tokens": 132337153.0, + "step": 5117 + }, + { + "epoch": 0.5620470019767186, + "grad_norm": 2.013777494430542, + "learning_rate": 5e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7015329599380493, + "num_tokens": 132360779.0, + "step": 5118 + }, + { + "epoch": 0.5621568196793323, + "grad_norm": 1.8039504289627075, + "learning_rate": 5e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.708544135093689, + "num_tokens": 132388962.0, + "step": 5119 + }, + { + "epoch": 0.5622666373819459, + "grad_norm": 1.8408730030059814, + "learning_rate": 5e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7139707803726196, + "num_tokens": 132415165.0, + "step": 5120 + }, + { + "epoch": 0.5623764550845596, + "grad_norm": 1.773687720298767, + "learning_rate": 5e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.720782995223999, + "num_tokens": 132441331.0, + "step": 5121 + }, + { + "epoch": 0.5624862727871733, + "grad_norm": 1.8029999732971191, + "learning_rate": 5e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6951854825019836, + "num_tokens": 132469945.0, + "step": 5122 + }, + { + "epoch": 0.562596090489787, + "grad_norm": 2.102041482925415, + "learning_rate": 5e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7022782564163208, + "num_tokens": 132491898.0, + "step": 5123 + }, + { + "epoch": 0.5627059081924006, + "grad_norm": 1.7381994724273682, + "learning_rate": 5e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7343463897705078, + "num_tokens": 132520888.0, + "step": 5124 + }, + { + "epoch": 0.5628157258950143, + "grad_norm": 1.6999176740646362, + "learning_rate": 5e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7008382081985474, + "num_tokens": 132553672.0, + "step": 5125 + }, + { + "epoch": 0.5629255435976279, + "grad_norm": 1.7704732418060303, + "learning_rate": 5e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7177991271018982, + "num_tokens": 132578761.0, + "step": 5126 + }, + { + "epoch": 0.5630353613002416, + "grad_norm": 1.822332501411438, + "learning_rate": 5e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6990389823913574, + "num_tokens": 132608561.0, + "step": 5127 + }, + { + "epoch": 0.5631451790028552, + "grad_norm": 1.797327995300293, + "learning_rate": 5e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6968523263931274, + "num_tokens": 132640280.0, + "step": 5128 + }, + { + "epoch": 0.563254996705469, + "grad_norm": 1.9164787530899048, + "learning_rate": 5e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6864005327224731, + "num_tokens": 132666180.0, + "step": 5129 + }, + { + "epoch": 0.5633648144080826, + "grad_norm": 1.8195210695266724, + "learning_rate": 5e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6986716389656067, + "num_tokens": 132691902.0, + "step": 5130 + }, + { + "epoch": 0.5634746321106963, + "grad_norm": 1.8025281429290771, + "learning_rate": 5e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7081588506698608, + "num_tokens": 132718128.0, + "step": 5131 + }, + { + "epoch": 0.5635844498133099, + "grad_norm": 1.9039596319198608, + "learning_rate": 5e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7081000804901123, + "num_tokens": 132742265.0, + "step": 5132 + }, + { + "epoch": 0.5636942675159236, + "grad_norm": 1.996253490447998, + "learning_rate": 5e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.722807765007019, + "num_tokens": 132764718.0, + "step": 5133 + }, + { + "epoch": 0.5638040852185372, + "grad_norm": 1.8734313249588013, + "learning_rate": 5e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7018417119979858, + "num_tokens": 132789177.0, + "step": 5134 + }, + { + "epoch": 0.5639139029211508, + "grad_norm": 1.9551808834075928, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7410517930984497, + "num_tokens": 132811203.0, + "step": 5135 + }, + { + "epoch": 0.5640237206237646, + "grad_norm": 1.7746390104293823, + "learning_rate": 5e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7222886085510254, + "num_tokens": 132838824.0, + "step": 5136 + }, + { + "epoch": 0.5641335383263782, + "grad_norm": 1.8079445362091064, + "learning_rate": 5e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.709614634513855, + "num_tokens": 132866648.0, + "step": 5137 + }, + { + "epoch": 0.5642433560289919, + "grad_norm": 1.8842995166778564, + "learning_rate": 5e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7099611759185791, + "num_tokens": 132891633.0, + "step": 5138 + }, + { + "epoch": 0.5643531737316055, + "grad_norm": 1.9157689809799194, + "learning_rate": 5e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6957926750183105, + "num_tokens": 132914806.0, + "step": 5139 + }, + { + "epoch": 0.5644629914342192, + "grad_norm": 2.206578493118286, + "learning_rate": 5e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7108972072601318, + "num_tokens": 132933352.0, + "step": 5140 + }, + { + "epoch": 0.5645728091368328, + "grad_norm": 1.82827889919281, + "learning_rate": 5e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6908566355705261, + "num_tokens": 132960695.0, + "step": 5141 + }, + { + "epoch": 0.5646826268394465, + "grad_norm": 1.7781506776809692, + "learning_rate": 5e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7145705819129944, + "num_tokens": 132988630.0, + "step": 5142 + }, + { + "epoch": 0.5647924445420602, + "grad_norm": 2.2042388916015625, + "learning_rate": 5e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7079756259918213, + "num_tokens": 133009158.0, + "step": 5143 + }, + { + "epoch": 0.5649022622446739, + "grad_norm": 1.851001262664795, + "learning_rate": 5e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7149514555931091, + "num_tokens": 133034013.0, + "step": 5144 + }, + { + "epoch": 0.5650120799472875, + "grad_norm": 1.670501708984375, + "learning_rate": 5e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7312309741973877, + "num_tokens": 133064478.0, + "step": 5145 + }, + { + "epoch": 0.5651218976499012, + "grad_norm": 1.726333737373352, + "learning_rate": 5e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7078915238380432, + "num_tokens": 133095007.0, + "step": 5146 + }, + { + "epoch": 0.5652317153525148, + "grad_norm": 2.0488009452819824, + "learning_rate": 5e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7138621807098389, + "num_tokens": 133114752.0, + "step": 5147 + }, + { + "epoch": 0.5653415330551285, + "grad_norm": 1.822447657585144, + "learning_rate": 5e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6999649405479431, + "num_tokens": 133142716.0, + "step": 5148 + }, + { + "epoch": 0.5654513507577421, + "grad_norm": 1.8709070682525635, + "learning_rate": 5e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7230156660079956, + "num_tokens": 133169882.0, + "step": 5149 + }, + { + "epoch": 0.5655611684603558, + "grad_norm": 2.076444149017334, + "learning_rate": 5e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6949902772903442, + "num_tokens": 133193315.0, + "step": 5150 + }, + { + "epoch": 0.5656709861629695, + "grad_norm": 1.83870530128479, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.711046576499939, + "num_tokens": 133218922.0, + "step": 5151 + }, + { + "epoch": 0.5657808038655832, + "grad_norm": 1.8180135488510132, + "learning_rate": 5e-06, + "loss": 1.062, + "mean_token_accuracy": 0.6895207166671753, + "num_tokens": 133247120.0, + "step": 5152 + }, + { + "epoch": 0.5658906215681968, + "grad_norm": 1.7600880861282349, + "learning_rate": 5e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6942416429519653, + "num_tokens": 133275864.0, + "step": 5153 + }, + { + "epoch": 0.5660004392708105, + "grad_norm": 1.9327878952026367, + "learning_rate": 5e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.717362642288208, + "num_tokens": 133297882.0, + "step": 5154 + }, + { + "epoch": 0.5661102569734241, + "grad_norm": 1.6622343063354492, + "learning_rate": 5e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7014355063438416, + "num_tokens": 133330168.0, + "step": 5155 + }, + { + "epoch": 0.5662200746760377, + "grad_norm": 1.8602139949798584, + "learning_rate": 5e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7036666870117188, + "num_tokens": 133358611.0, + "step": 5156 + }, + { + "epoch": 0.5663298923786514, + "grad_norm": 1.864949345588684, + "learning_rate": 5e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7025764584541321, + "num_tokens": 133382965.0, + "step": 5157 + }, + { + "epoch": 0.5664397100812651, + "grad_norm": 1.761198878288269, + "learning_rate": 5e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7360586524009705, + "num_tokens": 133409662.0, + "step": 5158 + }, + { + "epoch": 0.5665495277838788, + "grad_norm": 1.6062313318252563, + "learning_rate": 5e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7094937562942505, + "num_tokens": 133441118.0, + "step": 5159 + }, + { + "epoch": 0.5666593454864924, + "grad_norm": 1.667981743812561, + "learning_rate": 5e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6948112845420837, + "num_tokens": 133473718.0, + "step": 5160 + }, + { + "epoch": 0.5667691631891061, + "grad_norm": 1.8887444734573364, + "learning_rate": 5e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6941428184509277, + "num_tokens": 133499295.0, + "step": 5161 + }, + { + "epoch": 0.5668789808917197, + "grad_norm": 1.7572818994522095, + "learning_rate": 5e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7080585360527039, + "num_tokens": 133527408.0, + "step": 5162 + }, + { + "epoch": 0.5669887985943334, + "grad_norm": 1.794303059577942, + "learning_rate": 5e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6892197132110596, + "num_tokens": 133555244.0, + "step": 5163 + }, + { + "epoch": 0.567098616296947, + "grad_norm": 1.7946665287017822, + "learning_rate": 5e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6958544254302979, + "num_tokens": 133584362.0, + "step": 5164 + }, + { + "epoch": 0.5672084339995608, + "grad_norm": 2.003232479095459, + "learning_rate": 5e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.704874575138092, + "num_tokens": 133607340.0, + "step": 5165 + }, + { + "epoch": 0.5673182517021744, + "grad_norm": 2.2507126331329346, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7358434200286865, + "num_tokens": 133624763.0, + "step": 5166 + }, + { + "epoch": 0.5674280694047881, + "grad_norm": 1.9066731929779053, + "learning_rate": 5e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6979904770851135, + "num_tokens": 133650030.0, + "step": 5167 + }, + { + "epoch": 0.5675378871074017, + "grad_norm": 2.074598550796509, + "learning_rate": 5e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.692211925983429, + "num_tokens": 133672706.0, + "step": 5168 + }, + { + "epoch": 0.5676477048100154, + "grad_norm": 1.743498682975769, + "learning_rate": 5e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.701188862323761, + "num_tokens": 133703951.0, + "step": 5169 + }, + { + "epoch": 0.567757522512629, + "grad_norm": 1.7140508890151978, + "learning_rate": 5e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6947402954101562, + "num_tokens": 133732703.0, + "step": 5170 + }, + { + "epoch": 0.5678673402152427, + "grad_norm": 1.8802748918533325, + "learning_rate": 5e-06, + "loss": 1.0806, + "mean_token_accuracy": 0.6767561435699463, + "num_tokens": 133758311.0, + "step": 5171 + }, + { + "epoch": 0.5679771579178564, + "grad_norm": 2.006026268005371, + "learning_rate": 5e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7009716033935547, + "num_tokens": 133783244.0, + "step": 5172 + }, + { + "epoch": 0.5680869756204701, + "grad_norm": 2.167158842086792, + "learning_rate": 5e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7178503274917603, + "num_tokens": 133802129.0, + "step": 5173 + }, + { + "epoch": 0.5681967933230837, + "grad_norm": 1.9154196977615356, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7358558773994446, + "num_tokens": 133824111.0, + "step": 5174 + }, + { + "epoch": 0.5683066110256974, + "grad_norm": 1.9204723834991455, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7254489064216614, + "num_tokens": 133846748.0, + "step": 5175 + }, + { + "epoch": 0.568416428728311, + "grad_norm": 1.9578388929367065, + "learning_rate": 5e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6972124576568604, + "num_tokens": 133870459.0, + "step": 5176 + }, + { + "epoch": 0.5685262464309246, + "grad_norm": 2.3634190559387207, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7316128611564636, + "num_tokens": 133886663.0, + "step": 5177 + }, + { + "epoch": 0.5686360641335383, + "grad_norm": 1.7093615531921387, + "learning_rate": 5e-06, + "loss": 1.1143, + "mean_token_accuracy": 0.6709097623825073, + "num_tokens": 133917193.0, + "step": 5178 + }, + { + "epoch": 0.5687458818361519, + "grad_norm": 1.779807686805725, + "learning_rate": 5e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7046533226966858, + "num_tokens": 133948582.0, + "step": 5179 + }, + { + "epoch": 0.5688556995387657, + "grad_norm": 2.0535194873809814, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7314585447311401, + "num_tokens": 133970312.0, + "step": 5180 + }, + { + "epoch": 0.5689655172413793, + "grad_norm": 1.8093100786209106, + "learning_rate": 5e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7046535015106201, + "num_tokens": 133997477.0, + "step": 5181 + }, + { + "epoch": 0.569075334943993, + "grad_norm": 1.8793047666549683, + "learning_rate": 5e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7334271669387817, + "num_tokens": 134023540.0, + "step": 5182 + }, + { + "epoch": 0.5691851526466066, + "grad_norm": 1.8363977670669556, + "learning_rate": 5e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6898516416549683, + "num_tokens": 134053603.0, + "step": 5183 + }, + { + "epoch": 0.5692949703492203, + "grad_norm": 1.8908149003982544, + "learning_rate": 5e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6946882605552673, + "num_tokens": 134079103.0, + "step": 5184 + }, + { + "epoch": 0.5694047880518339, + "grad_norm": 1.6200940608978271, + "learning_rate": 5e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6843309998512268, + "num_tokens": 134113975.0, + "step": 5185 + }, + { + "epoch": 0.5695146057544476, + "grad_norm": 1.707732081413269, + "learning_rate": 5e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7154673337936401, + "num_tokens": 134141971.0, + "step": 5186 + }, + { + "epoch": 0.5696244234570613, + "grad_norm": 1.9697967767715454, + "learning_rate": 5e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6926835179328918, + "num_tokens": 134165364.0, + "step": 5187 + }, + { + "epoch": 0.569734241159675, + "grad_norm": 1.9104013442993164, + "learning_rate": 5e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6997307538986206, + "num_tokens": 134188136.0, + "step": 5188 + }, + { + "epoch": 0.5698440588622886, + "grad_norm": 2.3068013191223145, + "learning_rate": 5e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7189127206802368, + "num_tokens": 134204452.0, + "step": 5189 + }, + { + "epoch": 0.5699538765649023, + "grad_norm": 1.8523101806640625, + "learning_rate": 5e-06, + "loss": 0.983, + "mean_token_accuracy": 0.697546124458313, + "num_tokens": 134231409.0, + "step": 5190 + }, + { + "epoch": 0.5700636942675159, + "grad_norm": 2.0105032920837402, + "learning_rate": 5e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.728620707988739, + "num_tokens": 134251764.0, + "step": 5191 + }, + { + "epoch": 0.5701735119701296, + "grad_norm": 1.9080954790115356, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7192413210868835, + "num_tokens": 134277257.0, + "step": 5192 + }, + { + "epoch": 0.5702833296727432, + "grad_norm": 1.7737959623336792, + "learning_rate": 5e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.687284529209137, + "num_tokens": 134305936.0, + "step": 5193 + }, + { + "epoch": 0.570393147375357, + "grad_norm": 2.144177198410034, + "learning_rate": 5e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7033647298812866, + "num_tokens": 134326238.0, + "step": 5194 + }, + { + "epoch": 0.5705029650779706, + "grad_norm": 2.020054817199707, + "learning_rate": 5e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7185124158859253, + "num_tokens": 134347734.0, + "step": 5195 + }, + { + "epoch": 0.5706127827805842, + "grad_norm": 1.7180752754211426, + "learning_rate": 5e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7017755508422852, + "num_tokens": 134377817.0, + "step": 5196 + }, + { + "epoch": 0.5707226004831979, + "grad_norm": 1.7831577062606812, + "learning_rate": 5e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7108668088912964, + "num_tokens": 134406546.0, + "step": 5197 + }, + { + "epoch": 0.5708324181858115, + "grad_norm": 1.7141717672348022, + "learning_rate": 5e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6736218929290771, + "num_tokens": 134437614.0, + "step": 5198 + }, + { + "epoch": 0.5709422358884252, + "grad_norm": 1.927695631980896, + "learning_rate": 5e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7252471446990967, + "num_tokens": 134462514.0, + "step": 5199 + }, + { + "epoch": 0.5710520535910388, + "grad_norm": 1.6127170324325562, + "learning_rate": 5e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6980104446411133, + "num_tokens": 134494826.0, + "step": 5200 + }, + { + "epoch": 0.5711618712936526, + "grad_norm": 1.5637271404266357, + "learning_rate": 5e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7000788450241089, + "num_tokens": 134530222.0, + "step": 5201 + }, + { + "epoch": 0.5712716889962662, + "grad_norm": 1.8686034679412842, + "learning_rate": 5e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7098637819290161, + "num_tokens": 134554703.0, + "step": 5202 + }, + { + "epoch": 0.5713815066988799, + "grad_norm": 1.9994175434112549, + "learning_rate": 5e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7276338338851929, + "num_tokens": 134577296.0, + "step": 5203 + }, + { + "epoch": 0.5714913244014935, + "grad_norm": 2.167707681655884, + "learning_rate": 5e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7264561653137207, + "num_tokens": 134595217.0, + "step": 5204 + }, + { + "epoch": 0.5716011421041072, + "grad_norm": 2.006728410720825, + "learning_rate": 5e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7044554948806763, + "num_tokens": 134618439.0, + "step": 5205 + }, + { + "epoch": 0.5717109598067208, + "grad_norm": 1.9614087343215942, + "learning_rate": 5e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6921698451042175, + "num_tokens": 134645007.0, + "step": 5206 + }, + { + "epoch": 0.5718207775093345, + "grad_norm": 1.8225419521331787, + "learning_rate": 5e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7035019397735596, + "num_tokens": 134671989.0, + "step": 5207 + }, + { + "epoch": 0.5719305952119481, + "grad_norm": 1.9061092138290405, + "learning_rate": 5e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7006441950798035, + "num_tokens": 134696707.0, + "step": 5208 + }, + { + "epoch": 0.5720404129145619, + "grad_norm": 1.7274597883224487, + "learning_rate": 5e-06, + "loss": 1.089, + "mean_token_accuracy": 0.6723424792289734, + "num_tokens": 134729009.0, + "step": 5209 + }, + { + "epoch": 0.5721502306171755, + "grad_norm": 1.6052201986312866, + "learning_rate": 5e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.690963864326477, + "num_tokens": 134762190.0, + "step": 5210 + }, + { + "epoch": 0.5722600483197892, + "grad_norm": 2.008591890335083, + "learning_rate": 5e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7056410908699036, + "num_tokens": 134786633.0, + "step": 5211 + }, + { + "epoch": 0.5723698660224028, + "grad_norm": 1.7437176704406738, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.705896258354187, + "num_tokens": 134814665.0, + "step": 5212 + }, + { + "epoch": 0.5724796837250165, + "grad_norm": 1.6365808248519897, + "learning_rate": 5e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7106891870498657, + "num_tokens": 134846579.0, + "step": 5213 + }, + { + "epoch": 0.5725895014276301, + "grad_norm": 2.19146466255188, + "learning_rate": 5e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7280067801475525, + "num_tokens": 134865251.0, + "step": 5214 + }, + { + "epoch": 0.5726993191302437, + "grad_norm": 1.9541232585906982, + "learning_rate": 5e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.694301187992096, + "num_tokens": 134888563.0, + "step": 5215 + }, + { + "epoch": 0.5728091368328575, + "grad_norm": 1.677762746810913, + "learning_rate": 5e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6872503757476807, + "num_tokens": 134919504.0, + "step": 5216 + }, + { + "epoch": 0.5729189545354711, + "grad_norm": 1.8972573280334473, + "learning_rate": 5e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6931651830673218, + "num_tokens": 134946750.0, + "step": 5217 + }, + { + "epoch": 0.5730287722380848, + "grad_norm": 2.0436182022094727, + "learning_rate": 5e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7052809596061707, + "num_tokens": 134969313.0, + "step": 5218 + }, + { + "epoch": 0.5731385899406984, + "grad_norm": 1.8480007648468018, + "learning_rate": 5e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7076456546783447, + "num_tokens": 134992595.0, + "step": 5219 + }, + { + "epoch": 0.5732484076433121, + "grad_norm": 1.8248977661132812, + "learning_rate": 5e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6919634938240051, + "num_tokens": 135020739.0, + "step": 5220 + }, + { + "epoch": 0.5733582253459257, + "grad_norm": 1.788489580154419, + "learning_rate": 5e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7057010531425476, + "num_tokens": 135048887.0, + "step": 5221 + }, + { + "epoch": 0.5734680430485394, + "grad_norm": 1.810744047164917, + "learning_rate": 5e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.697919487953186, + "num_tokens": 135077981.0, + "step": 5222 + }, + { + "epoch": 0.5735778607511531, + "grad_norm": 1.9032509326934814, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7265148162841797, + "num_tokens": 135100331.0, + "step": 5223 + }, + { + "epoch": 0.5736876784537668, + "grad_norm": 1.9409434795379639, + "learning_rate": 5e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7103257179260254, + "num_tokens": 135124670.0, + "step": 5224 + }, + { + "epoch": 0.5737974961563804, + "grad_norm": 2.0427286624908447, + "learning_rate": 5e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7047064304351807, + "num_tokens": 135146373.0, + "step": 5225 + }, + { + "epoch": 0.5739073138589941, + "grad_norm": 1.8678104877471924, + "learning_rate": 5e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7011672258377075, + "num_tokens": 135169757.0, + "step": 5226 + }, + { + "epoch": 0.5740171315616077, + "grad_norm": 1.7498754262924194, + "learning_rate": 5e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6920864582061768, + "num_tokens": 135199408.0, + "step": 5227 + }, + { + "epoch": 0.5741269492642214, + "grad_norm": 1.953847885131836, + "learning_rate": 5e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7083805799484253, + "num_tokens": 135221297.0, + "step": 5228 + }, + { + "epoch": 0.574236766966835, + "grad_norm": 1.686652660369873, + "learning_rate": 5e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6801996827125549, + "num_tokens": 135254291.0, + "step": 5229 + }, + { + "epoch": 0.5743465846694488, + "grad_norm": 1.8244839906692505, + "learning_rate": 5e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7252560257911682, + "num_tokens": 135278039.0, + "step": 5230 + }, + { + "epoch": 0.5744564023720624, + "grad_norm": 1.8513777256011963, + "learning_rate": 5e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7115194797515869, + "num_tokens": 135305646.0, + "step": 5231 + }, + { + "epoch": 0.5745662200746761, + "grad_norm": 1.8604553937911987, + "learning_rate": 5e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7049486637115479, + "num_tokens": 135331962.0, + "step": 5232 + }, + { + "epoch": 0.5746760377772897, + "grad_norm": 1.8446931838989258, + "learning_rate": 5e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.713276743888855, + "num_tokens": 135360487.0, + "step": 5233 + }, + { + "epoch": 0.5747858554799034, + "grad_norm": 1.9262689352035522, + "learning_rate": 5e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6938921809196472, + "num_tokens": 135386006.0, + "step": 5234 + }, + { + "epoch": 0.574895673182517, + "grad_norm": 1.7333060503005981, + "learning_rate": 5e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7007278203964233, + "num_tokens": 135416196.0, + "step": 5235 + }, + { + "epoch": 0.5750054908851306, + "grad_norm": 1.7183122634887695, + "learning_rate": 5e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6954406499862671, + "num_tokens": 135446509.0, + "step": 5236 + }, + { + "epoch": 0.5751153085877443, + "grad_norm": 1.939048409461975, + "learning_rate": 5e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7108889818191528, + "num_tokens": 135469407.0, + "step": 5237 + }, + { + "epoch": 0.575225126290358, + "grad_norm": 1.7654051780700684, + "learning_rate": 5e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7008084654808044, + "num_tokens": 135498320.0, + "step": 5238 + }, + { + "epoch": 0.5753349439929717, + "grad_norm": 2.4468624591827393, + "learning_rate": 5e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7132625579833984, + "num_tokens": 135515079.0, + "step": 5239 + }, + { + "epoch": 0.5754447616955853, + "grad_norm": 2.0016531944274902, + "learning_rate": 5e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.710885763168335, + "num_tokens": 135538261.0, + "step": 5240 + }, + { + "epoch": 0.575554579398199, + "grad_norm": 1.8458927869796753, + "learning_rate": 5e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7176737785339355, + "num_tokens": 135565371.0, + "step": 5241 + }, + { + "epoch": 0.5756643971008126, + "grad_norm": 1.691076397895813, + "learning_rate": 5e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6918317079544067, + "num_tokens": 135595511.0, + "step": 5242 + }, + { + "epoch": 0.5757742148034263, + "grad_norm": 1.6738991737365723, + "learning_rate": 5e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.700403094291687, + "num_tokens": 135627443.0, + "step": 5243 + }, + { + "epoch": 0.5758840325060399, + "grad_norm": 2.103903293609619, + "learning_rate": 5e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7097656726837158, + "num_tokens": 135650691.0, + "step": 5244 + }, + { + "epoch": 0.5759938502086537, + "grad_norm": 1.8523541688919067, + "learning_rate": 5e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6958820819854736, + "num_tokens": 135677418.0, + "step": 5245 + }, + { + "epoch": 0.5761036679112673, + "grad_norm": 1.969685673713684, + "learning_rate": 5e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7030565738677979, + "num_tokens": 135700989.0, + "step": 5246 + }, + { + "epoch": 0.576213485613881, + "grad_norm": 2.072376012802124, + "learning_rate": 5e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.713926374912262, + "num_tokens": 135722736.0, + "step": 5247 + }, + { + "epoch": 0.5763233033164946, + "grad_norm": 1.8962578773498535, + "learning_rate": 5e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7149771451950073, + "num_tokens": 135745710.0, + "step": 5248 + }, + { + "epoch": 0.5764331210191083, + "grad_norm": 1.9515101909637451, + "learning_rate": 5e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6838513612747192, + "num_tokens": 135769923.0, + "step": 5249 + }, + { + "epoch": 0.5765429387217219, + "grad_norm": 1.839358925819397, + "learning_rate": 5e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7070639133453369, + "num_tokens": 135797449.0, + "step": 5250 + }, + { + "epoch": 0.5766527564243356, + "grad_norm": 2.100198745727539, + "learning_rate": 5e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7008010745048523, + "num_tokens": 135818879.0, + "step": 5251 + }, + { + "epoch": 0.5767625741269493, + "grad_norm": 1.9490679502487183, + "learning_rate": 5e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6972463130950928, + "num_tokens": 135843311.0, + "step": 5252 + }, + { + "epoch": 0.576872391829563, + "grad_norm": 1.7376796007156372, + "learning_rate": 5e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.6771167516708374, + "num_tokens": 135874804.0, + "step": 5253 + }, + { + "epoch": 0.5769822095321766, + "grad_norm": 1.7948158979415894, + "learning_rate": 5e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.69044029712677, + "num_tokens": 135905851.0, + "step": 5254 + }, + { + "epoch": 0.5770920272347903, + "grad_norm": 1.5441465377807617, + "learning_rate": 5e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7061296701431274, + "num_tokens": 135939960.0, + "step": 5255 + }, + { + "epoch": 0.5772018449374039, + "grad_norm": 1.842972755432129, + "learning_rate": 5e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7218247652053833, + "num_tokens": 135963983.0, + "step": 5256 + }, + { + "epoch": 0.5773116626400175, + "grad_norm": 1.7944152355194092, + "learning_rate": 5e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6862947940826416, + "num_tokens": 135991686.0, + "step": 5257 + }, + { + "epoch": 0.5774214803426312, + "grad_norm": 1.674542784690857, + "learning_rate": 5e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6964232921600342, + "num_tokens": 136020241.0, + "step": 5258 + }, + { + "epoch": 0.577531298045245, + "grad_norm": 1.665502667427063, + "learning_rate": 5e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6960952281951904, + "num_tokens": 136049239.0, + "step": 5259 + }, + { + "epoch": 0.5776411157478586, + "grad_norm": 1.9392138719558716, + "learning_rate": 5e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7283754348754883, + "num_tokens": 136072073.0, + "step": 5260 + }, + { + "epoch": 0.5777509334504722, + "grad_norm": 1.8403263092041016, + "learning_rate": 5e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7338112592697144, + "num_tokens": 136097729.0, + "step": 5261 + }, + { + "epoch": 0.5778607511530859, + "grad_norm": 2.087669849395752, + "learning_rate": 5e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.6990261673927307, + "num_tokens": 136119232.0, + "step": 5262 + }, + { + "epoch": 0.5779705688556995, + "grad_norm": 1.7566077709197998, + "learning_rate": 5e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7125849723815918, + "num_tokens": 136148571.0, + "step": 5263 + }, + { + "epoch": 0.5780803865583132, + "grad_norm": 1.8692429065704346, + "learning_rate": 5e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6818283796310425, + "num_tokens": 136174418.0, + "step": 5264 + }, + { + "epoch": 0.5781902042609268, + "grad_norm": 2.010781764984131, + "learning_rate": 5e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7075574398040771, + "num_tokens": 136197545.0, + "step": 5265 + }, + { + "epoch": 0.5783000219635406, + "grad_norm": 1.7260476350784302, + "learning_rate": 5e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6845869421958923, + "num_tokens": 136229695.0, + "step": 5266 + }, + { + "epoch": 0.5784098396661542, + "grad_norm": 1.7153422832489014, + "learning_rate": 5e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6943636536598206, + "num_tokens": 136259659.0, + "step": 5267 + }, + { + "epoch": 0.5785196573687679, + "grad_norm": 2.009373188018799, + "learning_rate": 5e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7450777292251587, + "num_tokens": 136279158.0, + "step": 5268 + }, + { + "epoch": 0.5786294750713815, + "grad_norm": 1.7279354333877563, + "learning_rate": 5e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7039854526519775, + "num_tokens": 136310109.0, + "step": 5269 + }, + { + "epoch": 0.5787392927739952, + "grad_norm": 1.9168955087661743, + "learning_rate": 5e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7119165658950806, + "num_tokens": 136335373.0, + "step": 5270 + }, + { + "epoch": 0.5788491104766088, + "grad_norm": 1.945539951324463, + "learning_rate": 5e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.704412043094635, + "num_tokens": 136358846.0, + "step": 5271 + }, + { + "epoch": 0.5789589281792225, + "grad_norm": 1.9322627782821655, + "learning_rate": 5e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7246523499488831, + "num_tokens": 136385544.0, + "step": 5272 + }, + { + "epoch": 0.5790687458818361, + "grad_norm": 1.7776048183441162, + "learning_rate": 5e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6874773502349854, + "num_tokens": 136414806.0, + "step": 5273 + }, + { + "epoch": 0.5791785635844499, + "grad_norm": 1.9091089963912964, + "learning_rate": 5e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6841135025024414, + "num_tokens": 136441099.0, + "step": 5274 + }, + { + "epoch": 0.5792883812870635, + "grad_norm": 1.5599206686019897, + "learning_rate": 5e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.710644543170929, + "num_tokens": 136476996.0, + "step": 5275 + }, + { + "epoch": 0.5793981989896771, + "grad_norm": 2.0249946117401123, + "learning_rate": 5e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6962259411811829, + "num_tokens": 136499121.0, + "step": 5276 + }, + { + "epoch": 0.5795080166922908, + "grad_norm": 1.9473536014556885, + "learning_rate": 5e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6791382431983948, + "num_tokens": 136526123.0, + "step": 5277 + }, + { + "epoch": 0.5796178343949044, + "grad_norm": 1.9380840063095093, + "learning_rate": 5e-06, + "loss": 1.072, + "mean_token_accuracy": 0.6851399540901184, + "num_tokens": 136553109.0, + "step": 5278 + }, + { + "epoch": 0.5797276520975181, + "grad_norm": 1.8846659660339355, + "learning_rate": 5e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7107743620872498, + "num_tokens": 136576784.0, + "step": 5279 + }, + { + "epoch": 0.5798374698001317, + "grad_norm": 2.0290281772613525, + "learning_rate": 5e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.709544837474823, + "num_tokens": 136597949.0, + "step": 5280 + }, + { + "epoch": 0.5799472875027455, + "grad_norm": 1.6908818483352661, + "learning_rate": 5e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6862221956253052, + "num_tokens": 136633343.0, + "step": 5281 + }, + { + "epoch": 0.5800571052053591, + "grad_norm": 1.545089602470398, + "learning_rate": 5e-06, + "loss": 1.1028, + "mean_token_accuracy": 0.6760746240615845, + "num_tokens": 136670828.0, + "step": 5282 + }, + { + "epoch": 0.5801669229079728, + "grad_norm": 1.7695449590682983, + "learning_rate": 5e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7090305685997009, + "num_tokens": 136697106.0, + "step": 5283 + }, + { + "epoch": 0.5802767406105864, + "grad_norm": 1.8827890157699585, + "learning_rate": 5e-06, + "loss": 1.0555, + "mean_token_accuracy": 0.6790083050727844, + "num_tokens": 136727619.0, + "step": 5284 + }, + { + "epoch": 0.5803865583132001, + "grad_norm": 1.6846331357955933, + "learning_rate": 5e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6873772144317627, + "num_tokens": 136758691.0, + "step": 5285 + }, + { + "epoch": 0.5804963760158137, + "grad_norm": 1.7973318099975586, + "learning_rate": 5e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7191301584243774, + "num_tokens": 136785648.0, + "step": 5286 + }, + { + "epoch": 0.5806061937184274, + "grad_norm": 2.1533286571502686, + "learning_rate": 5e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7149283289909363, + "num_tokens": 136805210.0, + "step": 5287 + }, + { + "epoch": 0.5807160114210411, + "grad_norm": 1.7206199169158936, + "learning_rate": 5e-06, + "loss": 1.0962, + "mean_token_accuracy": 0.6746503114700317, + "num_tokens": 136837614.0, + "step": 5288 + }, + { + "epoch": 0.5808258291236548, + "grad_norm": 1.950632095336914, + "learning_rate": 5e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6995136737823486, + "num_tokens": 136864338.0, + "step": 5289 + }, + { + "epoch": 0.5809356468262684, + "grad_norm": 1.9507648944854736, + "learning_rate": 5e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7203656435012817, + "num_tokens": 136887318.0, + "step": 5290 + }, + { + "epoch": 0.5810454645288821, + "grad_norm": 1.7640444040298462, + "learning_rate": 5e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7002158164978027, + "num_tokens": 136919215.0, + "step": 5291 + }, + { + "epoch": 0.5811552822314957, + "grad_norm": 1.951341152191162, + "learning_rate": 5e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7080528736114502, + "num_tokens": 136942584.0, + "step": 5292 + }, + { + "epoch": 0.5812650999341094, + "grad_norm": 1.6999962329864502, + "learning_rate": 5e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6971396207809448, + "num_tokens": 136970854.0, + "step": 5293 + }, + { + "epoch": 0.581374917636723, + "grad_norm": 1.9335755109786987, + "learning_rate": 5e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7035166025161743, + "num_tokens": 136995772.0, + "step": 5294 + }, + { + "epoch": 0.5814847353393368, + "grad_norm": 1.9352214336395264, + "learning_rate": 5e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7123621702194214, + "num_tokens": 137019289.0, + "step": 5295 + }, + { + "epoch": 0.5815945530419504, + "grad_norm": 1.7385923862457275, + "learning_rate": 5e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7087850570678711, + "num_tokens": 137047696.0, + "step": 5296 + }, + { + "epoch": 0.581704370744564, + "grad_norm": 1.7695244550704956, + "learning_rate": 5e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6850255727767944, + "num_tokens": 137075132.0, + "step": 5297 + }, + { + "epoch": 0.5818141884471777, + "grad_norm": 2.0668270587921143, + "learning_rate": 5e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7089979648590088, + "num_tokens": 137095441.0, + "step": 5298 + }, + { + "epoch": 0.5819240061497913, + "grad_norm": 1.9510176181793213, + "learning_rate": 5e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7054749131202698, + "num_tokens": 137122072.0, + "step": 5299 + }, + { + "epoch": 0.582033823852405, + "grad_norm": 1.7064017057418823, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7142595052719116, + "num_tokens": 137149624.0, + "step": 5300 + }, + { + "epoch": 0.5821436415550186, + "grad_norm": 1.7577381134033203, + "learning_rate": 5e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7115217447280884, + "num_tokens": 137176042.0, + "step": 5301 + }, + { + "epoch": 0.5822534592576323, + "grad_norm": 2.496272563934326, + "learning_rate": 5e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7111618518829346, + "num_tokens": 137193633.0, + "step": 5302 + }, + { + "epoch": 0.582363276960246, + "grad_norm": 2.07486891746521, + "learning_rate": 5e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.7001196146011353, + "num_tokens": 137215720.0, + "step": 5303 + }, + { + "epoch": 0.5824730946628597, + "grad_norm": 1.7851577997207642, + "learning_rate": 5e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7072535753250122, + "num_tokens": 137244229.0, + "step": 5304 + }, + { + "epoch": 0.5825829123654733, + "grad_norm": 2.2004263401031494, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7335699796676636, + "num_tokens": 137261668.0, + "step": 5305 + }, + { + "epoch": 0.582692730068087, + "grad_norm": 1.9043041467666626, + "learning_rate": 5e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7182866930961609, + "num_tokens": 137285715.0, + "step": 5306 + }, + { + "epoch": 0.5828025477707006, + "grad_norm": 2.026015043258667, + "learning_rate": 5e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7231866717338562, + "num_tokens": 137307653.0, + "step": 5307 + }, + { + "epoch": 0.5829123654733143, + "grad_norm": 1.8277068138122559, + "learning_rate": 5e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7136281132698059, + "num_tokens": 137332158.0, + "step": 5308 + }, + { + "epoch": 0.5830221831759279, + "grad_norm": 1.859824299812317, + "learning_rate": 5e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6950940489768982, + "num_tokens": 137356913.0, + "step": 5309 + }, + { + "epoch": 0.5831320008785417, + "grad_norm": 1.7420696020126343, + "learning_rate": 5e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7032490968704224, + "num_tokens": 137388069.0, + "step": 5310 + }, + { + "epoch": 0.5832418185811553, + "grad_norm": 2.3395919799804688, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7204599380493164, + "num_tokens": 137409861.0, + "step": 5311 + }, + { + "epoch": 0.583351636283769, + "grad_norm": 2.2008116245269775, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7321729063987732, + "num_tokens": 137428701.0, + "step": 5312 + }, + { + "epoch": 0.5834614539863826, + "grad_norm": 1.7894890308380127, + "learning_rate": 5e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7095754146575928, + "num_tokens": 137454181.0, + "step": 5313 + }, + { + "epoch": 0.5835712716889963, + "grad_norm": 1.8684970140457153, + "learning_rate": 5e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.6971396207809448, + "num_tokens": 137479375.0, + "step": 5314 + }, + { + "epoch": 0.5836810893916099, + "grad_norm": 1.9678748846054077, + "learning_rate": 5e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6893455386161804, + "num_tokens": 137504892.0, + "step": 5315 + }, + { + "epoch": 0.5837909070942235, + "grad_norm": 1.814869999885559, + "learning_rate": 5e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7307815551757812, + "num_tokens": 137528902.0, + "step": 5316 + }, + { + "epoch": 0.5839007247968373, + "grad_norm": 2.014183759689331, + "learning_rate": 5e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7219030857086182, + "num_tokens": 137549452.0, + "step": 5317 + }, + { + "epoch": 0.584010542499451, + "grad_norm": 1.885058045387268, + "learning_rate": 5e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6916199922561646, + "num_tokens": 137574591.0, + "step": 5318 + }, + { + "epoch": 0.5841203602020646, + "grad_norm": 1.8722935914993286, + "learning_rate": 5e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.692080020904541, + "num_tokens": 137600149.0, + "step": 5319 + }, + { + "epoch": 0.5842301779046782, + "grad_norm": 1.862424373626709, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7356867790222168, + "num_tokens": 137623730.0, + "step": 5320 + }, + { + "epoch": 0.5843399956072919, + "grad_norm": 1.751717448234558, + "learning_rate": 5e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6853121519088745, + "num_tokens": 137654896.0, + "step": 5321 + }, + { + "epoch": 0.5844498133099055, + "grad_norm": 1.7605466842651367, + "learning_rate": 5e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7117127180099487, + "num_tokens": 137682954.0, + "step": 5322 + }, + { + "epoch": 0.5845596310125192, + "grad_norm": 1.904881477355957, + "learning_rate": 5e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7218842506408691, + "num_tokens": 137707564.0, + "step": 5323 + }, + { + "epoch": 0.5846694487151329, + "grad_norm": 2.426720380783081, + "learning_rate": 5e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7039995193481445, + "num_tokens": 137725924.0, + "step": 5324 + }, + { + "epoch": 0.5847792664177466, + "grad_norm": 1.997040033340454, + "learning_rate": 5e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7122489809989929, + "num_tokens": 137748721.0, + "step": 5325 + }, + { + "epoch": 0.5848890841203602, + "grad_norm": 2.0046491622924805, + "learning_rate": 5e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7124527096748352, + "num_tokens": 137771633.0, + "step": 5326 + }, + { + "epoch": 0.5849989018229739, + "grad_norm": 2.304720878601074, + "learning_rate": 5e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7097117900848389, + "num_tokens": 137794389.0, + "step": 5327 + }, + { + "epoch": 0.5851087195255875, + "grad_norm": 1.8126695156097412, + "learning_rate": 5e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.694233775138855, + "num_tokens": 137823083.0, + "step": 5328 + }, + { + "epoch": 0.5852185372282012, + "grad_norm": 1.9711357355117798, + "learning_rate": 5e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.692040205001831, + "num_tokens": 137846277.0, + "step": 5329 + }, + { + "epoch": 0.5853283549308148, + "grad_norm": 1.6643905639648438, + "learning_rate": 5e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6820389628410339, + "num_tokens": 137878621.0, + "step": 5330 + }, + { + "epoch": 0.5854381726334285, + "grad_norm": 1.7314262390136719, + "learning_rate": 5e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6828328371047974, + "num_tokens": 137908916.0, + "step": 5331 + }, + { + "epoch": 0.5855479903360422, + "grad_norm": 2.1081440448760986, + "learning_rate": 5e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7202501893043518, + "num_tokens": 137927649.0, + "step": 5332 + }, + { + "epoch": 0.5856578080386559, + "grad_norm": 1.8279491662979126, + "learning_rate": 5e-06, + "loss": 1.1187, + "mean_token_accuracy": 0.6641958951950073, + "num_tokens": 137954940.0, + "step": 5333 + }, + { + "epoch": 0.5857676257412695, + "grad_norm": 1.7833346128463745, + "learning_rate": 5e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6733229160308838, + "num_tokens": 137984267.0, + "step": 5334 + }, + { + "epoch": 0.5858774434438832, + "grad_norm": 1.7716729640960693, + "learning_rate": 5e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7119894623756409, + "num_tokens": 138011282.0, + "step": 5335 + }, + { + "epoch": 0.5859872611464968, + "grad_norm": 1.8161991834640503, + "learning_rate": 5e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6946377158164978, + "num_tokens": 138036896.0, + "step": 5336 + }, + { + "epoch": 0.5860970788491104, + "grad_norm": 1.7557094097137451, + "learning_rate": 5e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6983320713043213, + "num_tokens": 138067352.0, + "step": 5337 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 2.0651192665100098, + "learning_rate": 5e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6955263018608093, + "num_tokens": 138089716.0, + "step": 5338 + }, + { + "epoch": 0.5863167142543378, + "grad_norm": 1.7870157957077026, + "learning_rate": 5e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7056047916412354, + "num_tokens": 138116870.0, + "step": 5339 + }, + { + "epoch": 0.5864265319569515, + "grad_norm": 1.9923170804977417, + "learning_rate": 5e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6879115700721741, + "num_tokens": 138141502.0, + "step": 5340 + }, + { + "epoch": 0.5865363496595651, + "grad_norm": 1.936879277229309, + "learning_rate": 5e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6898889541625977, + "num_tokens": 138166929.0, + "step": 5341 + }, + { + "epoch": 0.5866461673621788, + "grad_norm": 1.684200406074524, + "learning_rate": 5e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6786108016967773, + "num_tokens": 138201234.0, + "step": 5342 + }, + { + "epoch": 0.5867559850647924, + "grad_norm": 2.0272488594055176, + "learning_rate": 5e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7137762308120728, + "num_tokens": 138224622.0, + "step": 5343 + }, + { + "epoch": 0.5868658027674061, + "grad_norm": 1.8068028688430786, + "learning_rate": 5e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6821358799934387, + "num_tokens": 138250591.0, + "step": 5344 + }, + { + "epoch": 0.5869756204700197, + "grad_norm": 1.9584437608718872, + "learning_rate": 5e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6933836340904236, + "num_tokens": 138275280.0, + "step": 5345 + }, + { + "epoch": 0.5870854381726335, + "grad_norm": 2.109912395477295, + "learning_rate": 5e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7461007833480835, + "num_tokens": 138294107.0, + "step": 5346 + }, + { + "epoch": 0.5871952558752471, + "grad_norm": 1.747308373451233, + "learning_rate": 5e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7046321630477905, + "num_tokens": 138322530.0, + "step": 5347 + }, + { + "epoch": 0.5873050735778608, + "grad_norm": 1.8793739080429077, + "learning_rate": 5e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7146635055541992, + "num_tokens": 138347873.0, + "step": 5348 + }, + { + "epoch": 0.5874148912804744, + "grad_norm": 1.941788673400879, + "learning_rate": 5e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.697598397731781, + "num_tokens": 138370881.0, + "step": 5349 + }, + { + "epoch": 0.5875247089830881, + "grad_norm": 1.8653322458267212, + "learning_rate": 5e-06, + "loss": 1.1088, + "mean_token_accuracy": 0.6707195043563843, + "num_tokens": 138401434.0, + "step": 5350 + }, + { + "epoch": 0.5876345266857017, + "grad_norm": 2.0217223167419434, + "learning_rate": 5e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.699999213218689, + "num_tokens": 138422344.0, + "step": 5351 + }, + { + "epoch": 0.5877443443883154, + "grad_norm": 1.9211812019348145, + "learning_rate": 5e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7020071148872375, + "num_tokens": 138447733.0, + "step": 5352 + }, + { + "epoch": 0.5878541620909291, + "grad_norm": 1.9783340692520142, + "learning_rate": 5e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7002108097076416, + "num_tokens": 138471916.0, + "step": 5353 + }, + { + "epoch": 0.5879639797935428, + "grad_norm": 1.7552988529205322, + "learning_rate": 5e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6937922239303589, + "num_tokens": 138499023.0, + "step": 5354 + }, + { + "epoch": 0.5880737974961564, + "grad_norm": 1.7794109582901, + "learning_rate": 5e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6857620477676392, + "num_tokens": 138525443.0, + "step": 5355 + }, + { + "epoch": 0.58818361519877, + "grad_norm": 1.7295736074447632, + "learning_rate": 5e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7000661492347717, + "num_tokens": 138557709.0, + "step": 5356 + }, + { + "epoch": 0.5882934329013837, + "grad_norm": 1.9493699073791504, + "learning_rate": 5e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7119691371917725, + "num_tokens": 138579710.0, + "step": 5357 + }, + { + "epoch": 0.5884032506039973, + "grad_norm": 1.6329054832458496, + "learning_rate": 5e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7210267782211304, + "num_tokens": 138608068.0, + "step": 5358 + }, + { + "epoch": 0.588513068306611, + "grad_norm": 1.8947569131851196, + "learning_rate": 5e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7151677012443542, + "num_tokens": 138633250.0, + "step": 5359 + }, + { + "epoch": 0.5886228860092246, + "grad_norm": 1.9122345447540283, + "learning_rate": 5e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7022581696510315, + "num_tokens": 138660562.0, + "step": 5360 + }, + { + "epoch": 0.5887327037118384, + "grad_norm": 1.923410177230835, + "learning_rate": 5e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7130275368690491, + "num_tokens": 138685554.0, + "step": 5361 + }, + { + "epoch": 0.588842521414452, + "grad_norm": 1.9590696096420288, + "learning_rate": 5e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7102860808372498, + "num_tokens": 138711960.0, + "step": 5362 + }, + { + "epoch": 0.5889523391170657, + "grad_norm": 1.9961769580841064, + "learning_rate": 5e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.6998058557510376, + "num_tokens": 138738635.0, + "step": 5363 + }, + { + "epoch": 0.5890621568196793, + "grad_norm": 1.8679643869400024, + "learning_rate": 5e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7004340291023254, + "num_tokens": 138764465.0, + "step": 5364 + }, + { + "epoch": 0.589171974522293, + "grad_norm": 1.767099380493164, + "learning_rate": 5e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7046963572502136, + "num_tokens": 138793111.0, + "step": 5365 + }, + { + "epoch": 0.5892817922249066, + "grad_norm": 1.8207039833068848, + "learning_rate": 5e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7071987390518188, + "num_tokens": 138820307.0, + "step": 5366 + }, + { + "epoch": 0.5893916099275203, + "grad_norm": 1.8000985383987427, + "learning_rate": 5e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6895673274993896, + "num_tokens": 138848147.0, + "step": 5367 + }, + { + "epoch": 0.589501427630134, + "grad_norm": 1.6303311586380005, + "learning_rate": 5e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7161592245101929, + "num_tokens": 138877059.0, + "step": 5368 + }, + { + "epoch": 0.5896112453327477, + "grad_norm": 1.9388134479522705, + "learning_rate": 5e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7084759473800659, + "num_tokens": 138900828.0, + "step": 5369 + }, + { + "epoch": 0.5897210630353613, + "grad_norm": 1.9177438020706177, + "learning_rate": 5e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6994192004203796, + "num_tokens": 138924726.0, + "step": 5370 + }, + { + "epoch": 0.589830880737975, + "grad_norm": 1.8077633380889893, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7219769358634949, + "num_tokens": 138949602.0, + "step": 5371 + }, + { + "epoch": 0.5899406984405886, + "grad_norm": 2.0036604404449463, + "learning_rate": 5e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7063940763473511, + "num_tokens": 138972070.0, + "step": 5372 + }, + { + "epoch": 0.5900505161432023, + "grad_norm": 1.9779053926467896, + "learning_rate": 5e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7016162872314453, + "num_tokens": 138997435.0, + "step": 5373 + }, + { + "epoch": 0.5901603338458159, + "grad_norm": 2.0063254833221436, + "learning_rate": 5e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7069545984268188, + "num_tokens": 139022149.0, + "step": 5374 + }, + { + "epoch": 0.5902701515484297, + "grad_norm": 1.854762315750122, + "learning_rate": 5e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7017673254013062, + "num_tokens": 139048486.0, + "step": 5375 + }, + { + "epoch": 0.5903799692510433, + "grad_norm": 1.784854531288147, + "learning_rate": 5e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7110331058502197, + "num_tokens": 139077597.0, + "step": 5376 + }, + { + "epoch": 0.590489786953657, + "grad_norm": 1.8126434087753296, + "learning_rate": 5e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7049403190612793, + "num_tokens": 139103210.0, + "step": 5377 + }, + { + "epoch": 0.5905996046562706, + "grad_norm": 1.8899595737457275, + "learning_rate": 5e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6901111602783203, + "num_tokens": 139127421.0, + "step": 5378 + }, + { + "epoch": 0.5907094223588842, + "grad_norm": 2.1003787517547607, + "learning_rate": 5e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.689830482006073, + "num_tokens": 139148294.0, + "step": 5379 + }, + { + "epoch": 0.5908192400614979, + "grad_norm": 2.0261380672454834, + "learning_rate": 5e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7022213935852051, + "num_tokens": 139172785.0, + "step": 5380 + }, + { + "epoch": 0.5909290577641115, + "grad_norm": 1.784280776977539, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7395117282867432, + "num_tokens": 139196484.0, + "step": 5381 + }, + { + "epoch": 0.5910388754667253, + "grad_norm": 1.7837640047073364, + "learning_rate": 5e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7079717516899109, + "num_tokens": 139224433.0, + "step": 5382 + }, + { + "epoch": 0.5911486931693389, + "grad_norm": 1.7493445873260498, + "learning_rate": 5e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6991444826126099, + "num_tokens": 139250496.0, + "step": 5383 + }, + { + "epoch": 0.5912585108719526, + "grad_norm": 2.0755627155303955, + "learning_rate": 5e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6887797713279724, + "num_tokens": 139272429.0, + "step": 5384 + }, + { + "epoch": 0.5913683285745662, + "grad_norm": 1.8931641578674316, + "learning_rate": 5e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7179632782936096, + "num_tokens": 139298077.0, + "step": 5385 + }, + { + "epoch": 0.5914781462771799, + "grad_norm": 1.7745908498764038, + "learning_rate": 5e-06, + "loss": 1.085, + "mean_token_accuracy": 0.6725304126739502, + "num_tokens": 139325640.0, + "step": 5386 + }, + { + "epoch": 0.5915879639797935, + "grad_norm": 2.0540645122528076, + "learning_rate": 5e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7078480124473572, + "num_tokens": 139348975.0, + "step": 5387 + }, + { + "epoch": 0.5916977816824072, + "grad_norm": 1.7884936332702637, + "learning_rate": 5e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6985820531845093, + "num_tokens": 139377568.0, + "step": 5388 + }, + { + "epoch": 0.5918075993850208, + "grad_norm": 1.8864752054214478, + "learning_rate": 5e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6892058849334717, + "num_tokens": 139403333.0, + "step": 5389 + }, + { + "epoch": 0.5919174170876346, + "grad_norm": 1.8055874109268188, + "learning_rate": 5e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6995365023612976, + "num_tokens": 139433607.0, + "step": 5390 + }, + { + "epoch": 0.5920272347902482, + "grad_norm": 1.8643550872802734, + "learning_rate": 5e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7047433257102966, + "num_tokens": 139459570.0, + "step": 5391 + }, + { + "epoch": 0.5921370524928619, + "grad_norm": 2.0897281169891357, + "learning_rate": 5e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6870020627975464, + "num_tokens": 139482698.0, + "step": 5392 + }, + { + "epoch": 0.5922468701954755, + "grad_norm": 2.17755126953125, + "learning_rate": 5e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7114778757095337, + "num_tokens": 139500397.0, + "step": 5393 + }, + { + "epoch": 0.5923566878980892, + "grad_norm": 1.7216352224349976, + "learning_rate": 5e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6835665702819824, + "num_tokens": 139529154.0, + "step": 5394 + }, + { + "epoch": 0.5924665056007028, + "grad_norm": 1.8960405588150024, + "learning_rate": 5e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6989222764968872, + "num_tokens": 139554094.0, + "step": 5395 + }, + { + "epoch": 0.5925763233033164, + "grad_norm": 1.6778169870376587, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7000761032104492, + "num_tokens": 139584986.0, + "step": 5396 + }, + { + "epoch": 0.5926861410059302, + "grad_norm": 2.3601527214050293, + "learning_rate": 5e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7025418877601624, + "num_tokens": 139602389.0, + "step": 5397 + }, + { + "epoch": 0.5927959587085438, + "grad_norm": 1.9360620975494385, + "learning_rate": 5e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6939554214477539, + "num_tokens": 139631089.0, + "step": 5398 + }, + { + "epoch": 0.5929057764111575, + "grad_norm": 1.5963964462280273, + "learning_rate": 5e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7053223848342896, + "num_tokens": 139661511.0, + "step": 5399 + }, + { + "epoch": 0.5930155941137711, + "grad_norm": 1.6413071155548096, + "learning_rate": 5e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7108711004257202, + "num_tokens": 139692176.0, + "step": 5400 + }, + { + "epoch": 0.5931254118163848, + "grad_norm": 1.8857704401016235, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7246909141540527, + "num_tokens": 139716777.0, + "step": 5401 + }, + { + "epoch": 0.5932352295189984, + "grad_norm": 1.9088431596755981, + "learning_rate": 5e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6913819909095764, + "num_tokens": 139744636.0, + "step": 5402 + }, + { + "epoch": 0.5933450472216121, + "grad_norm": 1.753902792930603, + "learning_rate": 5e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7084371447563171, + "num_tokens": 139774432.0, + "step": 5403 + }, + { + "epoch": 0.5934548649242258, + "grad_norm": 1.7359802722930908, + "learning_rate": 5e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6971192359924316, + "num_tokens": 139804777.0, + "step": 5404 + }, + { + "epoch": 0.5935646826268395, + "grad_norm": 2.2015671730041504, + "learning_rate": 5e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.702865481376648, + "num_tokens": 139823791.0, + "step": 5405 + }, + { + "epoch": 0.5936745003294531, + "grad_norm": 1.8389577865600586, + "learning_rate": 5e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6946926116943359, + "num_tokens": 139850718.0, + "step": 5406 + }, + { + "epoch": 0.5937843180320668, + "grad_norm": 1.7429386377334595, + "learning_rate": 5e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7005257606506348, + "num_tokens": 139876723.0, + "step": 5407 + }, + { + "epoch": 0.5938941357346804, + "grad_norm": 1.7011533975601196, + "learning_rate": 5e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6824572086334229, + "num_tokens": 139906177.0, + "step": 5408 + }, + { + "epoch": 0.5940039534372941, + "grad_norm": 1.734269380569458, + "learning_rate": 5e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7055889368057251, + "num_tokens": 139933528.0, + "step": 5409 + }, + { + "epoch": 0.5941137711399077, + "grad_norm": 1.937156081199646, + "learning_rate": 5e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6908508539199829, + "num_tokens": 139956693.0, + "step": 5410 + }, + { + "epoch": 0.5942235888425215, + "grad_norm": 1.796590805053711, + "learning_rate": 5e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.6994650363922119, + "num_tokens": 139982235.0, + "step": 5411 + }, + { + "epoch": 0.5943334065451351, + "grad_norm": 1.7797890901565552, + "learning_rate": 5e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7056902647018433, + "num_tokens": 140010201.0, + "step": 5412 + }, + { + "epoch": 0.5944432242477488, + "grad_norm": 1.9697819948196411, + "learning_rate": 5e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7007267475128174, + "num_tokens": 140032744.0, + "step": 5413 + }, + { + "epoch": 0.5945530419503624, + "grad_norm": 1.8972325325012207, + "learning_rate": 5e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7004864811897278, + "num_tokens": 140056067.0, + "step": 5414 + }, + { + "epoch": 0.594662859652976, + "grad_norm": 2.0226492881774902, + "learning_rate": 5e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7115668058395386, + "num_tokens": 140078012.0, + "step": 5415 + }, + { + "epoch": 0.5947726773555897, + "grad_norm": 1.8233187198638916, + "learning_rate": 5e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7056474685668945, + "num_tokens": 140103777.0, + "step": 5416 + }, + { + "epoch": 0.5948824950582033, + "grad_norm": 1.8969051837921143, + "learning_rate": 5e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6874253153800964, + "num_tokens": 140130348.0, + "step": 5417 + }, + { + "epoch": 0.5949923127608171, + "grad_norm": 1.9401021003723145, + "learning_rate": 5e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7106524109840393, + "num_tokens": 140154510.0, + "step": 5418 + }, + { + "epoch": 0.5951021304634307, + "grad_norm": 1.778246521949768, + "learning_rate": 5e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6955928206443787, + "num_tokens": 140183015.0, + "step": 5419 + }, + { + "epoch": 0.5952119481660444, + "grad_norm": 1.6943535804748535, + "learning_rate": 5e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7001609802246094, + "num_tokens": 140213739.0, + "step": 5420 + }, + { + "epoch": 0.595321765868658, + "grad_norm": 1.7134820222854614, + "learning_rate": 5e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7033714652061462, + "num_tokens": 140243370.0, + "step": 5421 + }, + { + "epoch": 0.5954315835712717, + "grad_norm": 2.044125556945801, + "learning_rate": 5e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7199721336364746, + "num_tokens": 140262707.0, + "step": 5422 + }, + { + "epoch": 0.5955414012738853, + "grad_norm": 1.847465991973877, + "learning_rate": 5e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7001997232437134, + "num_tokens": 140292442.0, + "step": 5423 + }, + { + "epoch": 0.595651218976499, + "grad_norm": 1.8034392595291138, + "learning_rate": 5e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6914523839950562, + "num_tokens": 140319741.0, + "step": 5424 + }, + { + "epoch": 0.5957610366791126, + "grad_norm": 2.032702684402466, + "learning_rate": 5e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7412745356559753, + "num_tokens": 140339599.0, + "step": 5425 + }, + { + "epoch": 0.5958708543817264, + "grad_norm": 1.870711088180542, + "learning_rate": 5e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7082387804985046, + "num_tokens": 140362419.0, + "step": 5426 + }, + { + "epoch": 0.59598067208434, + "grad_norm": 1.9307091236114502, + "learning_rate": 5e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7059540152549744, + "num_tokens": 140387242.0, + "step": 5427 + }, + { + "epoch": 0.5960904897869537, + "grad_norm": 1.7818964719772339, + "learning_rate": 5e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.720876932144165, + "num_tokens": 140413750.0, + "step": 5428 + }, + { + "epoch": 0.5962003074895673, + "grad_norm": 2.237792491912842, + "learning_rate": 5e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7144684791564941, + "num_tokens": 140432689.0, + "step": 5429 + }, + { + "epoch": 0.596310125192181, + "grad_norm": 2.165149211883545, + "learning_rate": 5e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6922853589057922, + "num_tokens": 140455397.0, + "step": 5430 + }, + { + "epoch": 0.5964199428947946, + "grad_norm": 1.8399094343185425, + "learning_rate": 5e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.6804910898208618, + "num_tokens": 140482851.0, + "step": 5431 + }, + { + "epoch": 0.5965297605974083, + "grad_norm": 1.877091407775879, + "learning_rate": 5e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7036761045455933, + "num_tokens": 140507190.0, + "step": 5432 + }, + { + "epoch": 0.596639578300022, + "grad_norm": 1.8463819026947021, + "learning_rate": 5e-06, + "loss": 0.935, + "mean_token_accuracy": 0.713994026184082, + "num_tokens": 140532621.0, + "step": 5433 + }, + { + "epoch": 0.5967493960026357, + "grad_norm": 2.069753408432007, + "learning_rate": 5e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7079998850822449, + "num_tokens": 140552706.0, + "step": 5434 + }, + { + "epoch": 0.5968592137052493, + "grad_norm": 1.6741161346435547, + "learning_rate": 5e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6889563202857971, + "num_tokens": 140583577.0, + "step": 5435 + }, + { + "epoch": 0.596969031407863, + "grad_norm": 1.6180508136749268, + "learning_rate": 5e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7054734230041504, + "num_tokens": 140616162.0, + "step": 5436 + }, + { + "epoch": 0.5970788491104766, + "grad_norm": 1.822282075881958, + "learning_rate": 5e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7015957236289978, + "num_tokens": 140642103.0, + "step": 5437 + }, + { + "epoch": 0.5971886668130902, + "grad_norm": 1.6423226594924927, + "learning_rate": 5e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6883949041366577, + "num_tokens": 140673486.0, + "step": 5438 + }, + { + "epoch": 0.5972984845157039, + "grad_norm": 1.8210804462432861, + "learning_rate": 5e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6973681449890137, + "num_tokens": 140699999.0, + "step": 5439 + }, + { + "epoch": 0.5974083022183176, + "grad_norm": 1.585256814956665, + "learning_rate": 5e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6937165856361389, + "num_tokens": 140731809.0, + "step": 5440 + }, + { + "epoch": 0.5975181199209313, + "grad_norm": 2.195202350616455, + "learning_rate": 5e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7017391920089722, + "num_tokens": 140752277.0, + "step": 5441 + }, + { + "epoch": 0.5976279376235449, + "grad_norm": 1.9803303480148315, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7336947917938232, + "num_tokens": 140771924.0, + "step": 5442 + }, + { + "epoch": 0.5977377553261586, + "grad_norm": 2.0612149238586426, + "learning_rate": 5e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7100090980529785, + "num_tokens": 140794522.0, + "step": 5443 + }, + { + "epoch": 0.5978475730287722, + "grad_norm": 2.146094799041748, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7310033440589905, + "num_tokens": 140813986.0, + "step": 5444 + }, + { + "epoch": 0.5979573907313859, + "grad_norm": 1.7201776504516602, + "learning_rate": 5e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7039331793785095, + "num_tokens": 140845601.0, + "step": 5445 + }, + { + "epoch": 0.5980672084339995, + "grad_norm": 1.9280402660369873, + "learning_rate": 5e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7054325342178345, + "num_tokens": 140868088.0, + "step": 5446 + }, + { + "epoch": 0.5981770261366133, + "grad_norm": 1.6932722330093384, + "learning_rate": 5e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.682732343673706, + "num_tokens": 140901735.0, + "step": 5447 + }, + { + "epoch": 0.5982868438392269, + "grad_norm": 1.811000943183899, + "learning_rate": 5e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7136045694351196, + "num_tokens": 140928602.0, + "step": 5448 + }, + { + "epoch": 0.5983966615418406, + "grad_norm": 1.7134422063827515, + "learning_rate": 5e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7078720331192017, + "num_tokens": 140960908.0, + "step": 5449 + }, + { + "epoch": 0.5985064792444542, + "grad_norm": 1.9310985803604126, + "learning_rate": 5e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6950254440307617, + "num_tokens": 140986489.0, + "step": 5450 + }, + { + "epoch": 0.5986162969470679, + "grad_norm": 1.9887073040008545, + "learning_rate": 5e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7144671678543091, + "num_tokens": 141008480.0, + "step": 5451 + }, + { + "epoch": 0.5987261146496815, + "grad_norm": 1.8921926021575928, + "learning_rate": 5e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7054749727249146, + "num_tokens": 141033405.0, + "step": 5452 + }, + { + "epoch": 0.5988359323522952, + "grad_norm": 2.3390355110168457, + "learning_rate": 5e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7070840001106262, + "num_tokens": 141050748.0, + "step": 5453 + }, + { + "epoch": 0.5989457500549088, + "grad_norm": 2.147347927093506, + "learning_rate": 5e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7299603223800659, + "num_tokens": 141068711.0, + "step": 5454 + }, + { + "epoch": 0.5990555677575226, + "grad_norm": 1.658698558807373, + "learning_rate": 5e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7082417011260986, + "num_tokens": 141099862.0, + "step": 5455 + }, + { + "epoch": 0.5991653854601362, + "grad_norm": 1.78751540184021, + "learning_rate": 5e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6873325109481812, + "num_tokens": 141126659.0, + "step": 5456 + }, + { + "epoch": 0.5992752031627498, + "grad_norm": 2.0951173305511475, + "learning_rate": 5e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7107903957366943, + "num_tokens": 141147184.0, + "step": 5457 + }, + { + "epoch": 0.5993850208653635, + "grad_norm": 1.994615077972412, + "learning_rate": 5e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6949402689933777, + "num_tokens": 141171554.0, + "step": 5458 + }, + { + "epoch": 0.5994948385679771, + "grad_norm": 1.9682328701019287, + "learning_rate": 5e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6996581554412842, + "num_tokens": 141195577.0, + "step": 5459 + }, + { + "epoch": 0.5996046562705908, + "grad_norm": 1.7320668697357178, + "learning_rate": 5e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7085689306259155, + "num_tokens": 141223725.0, + "step": 5460 + }, + { + "epoch": 0.5997144739732044, + "grad_norm": 1.684198021888733, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.712533175945282, + "num_tokens": 141252311.0, + "step": 5461 + }, + { + "epoch": 0.5998242916758182, + "grad_norm": 1.6846057176589966, + "learning_rate": 5e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7025645971298218, + "num_tokens": 141282456.0, + "step": 5462 + }, + { + "epoch": 0.5999341093784318, + "grad_norm": 1.5932899713516235, + "learning_rate": 5e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7037215232849121, + "num_tokens": 141313806.0, + "step": 5463 + }, + { + "epoch": 0.6000439270810455, + "grad_norm": 1.6271569728851318, + "learning_rate": 5e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7267707586288452, + "num_tokens": 141344470.0, + "step": 5464 + }, + { + "epoch": 0.6001537447836591, + "grad_norm": 1.769229531288147, + "learning_rate": 5e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6953136920928955, + "num_tokens": 141374912.0, + "step": 5465 + }, + { + "epoch": 0.6002635624862728, + "grad_norm": 1.9739758968353271, + "learning_rate": 5e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6971412897109985, + "num_tokens": 141399400.0, + "step": 5466 + }, + { + "epoch": 0.6003733801888864, + "grad_norm": 1.912805438041687, + "learning_rate": 5e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6956017017364502, + "num_tokens": 141422538.0, + "step": 5467 + }, + { + "epoch": 0.6004831978915001, + "grad_norm": 1.7955548763275146, + "learning_rate": 5e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6799529790878296, + "num_tokens": 141449906.0, + "step": 5468 + }, + { + "epoch": 0.6005930155941138, + "grad_norm": 1.8650199174880981, + "learning_rate": 5e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6993754506111145, + "num_tokens": 141474921.0, + "step": 5469 + }, + { + "epoch": 0.6007028332967275, + "grad_norm": 1.8595614433288574, + "learning_rate": 5e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7078297138214111, + "num_tokens": 141500616.0, + "step": 5470 + }, + { + "epoch": 0.6008126509993411, + "grad_norm": 2.0294978618621826, + "learning_rate": 5e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6998756527900696, + "num_tokens": 141523854.0, + "step": 5471 + }, + { + "epoch": 0.6009224687019548, + "grad_norm": 1.9713876247406006, + "learning_rate": 5e-06, + "loss": 1.013, + "mean_token_accuracy": 0.690003514289856, + "num_tokens": 141551619.0, + "step": 5472 + }, + { + "epoch": 0.6010322864045684, + "grad_norm": 1.8507297039031982, + "learning_rate": 5e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7028133273124695, + "num_tokens": 141577272.0, + "step": 5473 + }, + { + "epoch": 0.601142104107182, + "grad_norm": 1.633565902709961, + "learning_rate": 5e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7076433300971985, + "num_tokens": 141608411.0, + "step": 5474 + }, + { + "epoch": 0.6012519218097957, + "grad_norm": 1.918918490409851, + "learning_rate": 5e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6977719068527222, + "num_tokens": 141632134.0, + "step": 5475 + }, + { + "epoch": 0.6013617395124095, + "grad_norm": 1.8597087860107422, + "learning_rate": 5e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.727672815322876, + "num_tokens": 141656138.0, + "step": 5476 + }, + { + "epoch": 0.6014715572150231, + "grad_norm": 2.100217819213867, + "learning_rate": 5e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6856292486190796, + "num_tokens": 141679695.0, + "step": 5477 + }, + { + "epoch": 0.6015813749176367, + "grad_norm": 2.001919984817505, + "learning_rate": 5e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7316499352455139, + "num_tokens": 141699171.0, + "step": 5478 + }, + { + "epoch": 0.6016911926202504, + "grad_norm": 1.9449996948242188, + "learning_rate": 5e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7105074524879456, + "num_tokens": 141723389.0, + "step": 5479 + }, + { + "epoch": 0.601801010322864, + "grad_norm": 1.9693633317947388, + "learning_rate": 5e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.721096396446228, + "num_tokens": 141746437.0, + "step": 5480 + }, + { + "epoch": 0.6019108280254777, + "grad_norm": 2.000056743621826, + "learning_rate": 5e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7410387992858887, + "num_tokens": 141766635.0, + "step": 5481 + }, + { + "epoch": 0.6020206457280913, + "grad_norm": 1.7859817743301392, + "learning_rate": 5e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6903952360153198, + "num_tokens": 141794396.0, + "step": 5482 + }, + { + "epoch": 0.602130463430705, + "grad_norm": 1.6430633068084717, + "learning_rate": 5e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6859214305877686, + "num_tokens": 141826725.0, + "step": 5483 + }, + { + "epoch": 0.6022402811333187, + "grad_norm": 2.034390926361084, + "learning_rate": 5e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7093542218208313, + "num_tokens": 141848194.0, + "step": 5484 + }, + { + "epoch": 0.6023500988359324, + "grad_norm": 1.786603331565857, + "learning_rate": 5e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6902817487716675, + "num_tokens": 141875675.0, + "step": 5485 + }, + { + "epoch": 0.602459916538546, + "grad_norm": 1.7232894897460938, + "learning_rate": 5e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7177059650421143, + "num_tokens": 141901603.0, + "step": 5486 + }, + { + "epoch": 0.6025697342411597, + "grad_norm": 1.648248314857483, + "learning_rate": 5e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6765589714050293, + "num_tokens": 141934874.0, + "step": 5487 + }, + { + "epoch": 0.6026795519437733, + "grad_norm": 1.5469582080841064, + "learning_rate": 5e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.698616087436676, + "num_tokens": 141970744.0, + "step": 5488 + }, + { + "epoch": 0.602789369646387, + "grad_norm": 1.7180795669555664, + "learning_rate": 5e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7000612616539001, + "num_tokens": 141997906.0, + "step": 5489 + }, + { + "epoch": 0.6028991873490006, + "grad_norm": 1.7505857944488525, + "learning_rate": 5e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7097278833389282, + "num_tokens": 142026843.0, + "step": 5490 + }, + { + "epoch": 0.6030090050516144, + "grad_norm": 1.691527247428894, + "learning_rate": 5e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7172807455062866, + "num_tokens": 142054205.0, + "step": 5491 + }, + { + "epoch": 0.603118822754228, + "grad_norm": 1.8417354822158813, + "learning_rate": 5e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6979340314865112, + "num_tokens": 142079485.0, + "step": 5492 + }, + { + "epoch": 0.6032286404568417, + "grad_norm": 1.9432369470596313, + "learning_rate": 5e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7232307195663452, + "num_tokens": 142102485.0, + "step": 5493 + }, + { + "epoch": 0.6033384581594553, + "grad_norm": 1.7919225692749023, + "learning_rate": 5e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.6956015229225159, + "num_tokens": 142129468.0, + "step": 5494 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 1.6637483835220337, + "learning_rate": 5e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6960437297821045, + "num_tokens": 142164040.0, + "step": 5495 + }, + { + "epoch": 0.6035580935646826, + "grad_norm": 1.9927712678909302, + "learning_rate": 5e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.6995007991790771, + "num_tokens": 142185846.0, + "step": 5496 + }, + { + "epoch": 0.6036679112672962, + "grad_norm": 1.9163721799850464, + "learning_rate": 5e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7131246328353882, + "num_tokens": 142210501.0, + "step": 5497 + }, + { + "epoch": 0.60377772896991, + "grad_norm": 1.883779525756836, + "learning_rate": 5e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7191766500473022, + "num_tokens": 142236124.0, + "step": 5498 + }, + { + "epoch": 0.6038875466725236, + "grad_norm": 1.7123748064041138, + "learning_rate": 5e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7120411396026611, + "num_tokens": 142268249.0, + "step": 5499 + }, + { + "epoch": 0.6039973643751373, + "grad_norm": 2.061614513397217, + "learning_rate": 5e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7227182388305664, + "num_tokens": 142290278.0, + "step": 5500 + }, + { + "epoch": 0.6041071820777509, + "grad_norm": 1.8069252967834473, + "learning_rate": 5e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.689018726348877, + "num_tokens": 142315494.0, + "step": 5501 + }, + { + "epoch": 0.6042169997803646, + "grad_norm": 1.7888553142547607, + "learning_rate": 5e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.692091703414917, + "num_tokens": 142343474.0, + "step": 5502 + }, + { + "epoch": 0.6043268174829782, + "grad_norm": 1.8466973304748535, + "learning_rate": 5e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7031183242797852, + "num_tokens": 142367635.0, + "step": 5503 + }, + { + "epoch": 0.6044366351855919, + "grad_norm": 1.7311755418777466, + "learning_rate": 5e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6934393644332886, + "num_tokens": 142395401.0, + "step": 5504 + }, + { + "epoch": 0.6045464528882056, + "grad_norm": 1.8265280723571777, + "learning_rate": 5e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6887422800064087, + "num_tokens": 142423768.0, + "step": 5505 + }, + { + "epoch": 0.6046562705908193, + "grad_norm": 1.7568283081054688, + "learning_rate": 5e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7024188041687012, + "num_tokens": 142452009.0, + "step": 5506 + }, + { + "epoch": 0.6047660882934329, + "grad_norm": 1.8342458009719849, + "learning_rate": 5e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7276742458343506, + "num_tokens": 142475349.0, + "step": 5507 + }, + { + "epoch": 0.6048759059960466, + "grad_norm": 1.8207277059555054, + "learning_rate": 5e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.706113338470459, + "num_tokens": 142501881.0, + "step": 5508 + }, + { + "epoch": 0.6049857236986602, + "grad_norm": 1.6471319198608398, + "learning_rate": 5e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.6967694759368896, + "num_tokens": 142532409.0, + "step": 5509 + }, + { + "epoch": 0.6050955414012739, + "grad_norm": 1.9062283039093018, + "learning_rate": 5e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7268694639205933, + "num_tokens": 142553812.0, + "step": 5510 + }, + { + "epoch": 0.6052053591038875, + "grad_norm": 1.7488542795181274, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7066258788108826, + "num_tokens": 142580779.0, + "step": 5511 + }, + { + "epoch": 0.6053151768065012, + "grad_norm": 1.6512067317962646, + "learning_rate": 5e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6779792904853821, + "num_tokens": 142614950.0, + "step": 5512 + }, + { + "epoch": 0.6054249945091149, + "grad_norm": 1.678416132926941, + "learning_rate": 5e-06, + "loss": 1.0781, + "mean_token_accuracy": 0.677119255065918, + "num_tokens": 142648541.0, + "step": 5513 + }, + { + "epoch": 0.6055348122117286, + "grad_norm": 1.8751696348190308, + "learning_rate": 5e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7101140022277832, + "num_tokens": 142671362.0, + "step": 5514 + }, + { + "epoch": 0.6056446299143422, + "grad_norm": 1.8203591108322144, + "learning_rate": 5e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.698581874370575, + "num_tokens": 142698038.0, + "step": 5515 + }, + { + "epoch": 0.6057544476169558, + "grad_norm": 1.895491361618042, + "learning_rate": 5e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.719897985458374, + "num_tokens": 142722221.0, + "step": 5516 + }, + { + "epoch": 0.6058642653195695, + "grad_norm": 1.7732070684432983, + "learning_rate": 5e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7034879922866821, + "num_tokens": 142751508.0, + "step": 5517 + }, + { + "epoch": 0.6059740830221831, + "grad_norm": 1.862758755683899, + "learning_rate": 5e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.701507568359375, + "num_tokens": 142778512.0, + "step": 5518 + }, + { + "epoch": 0.6060839007247968, + "grad_norm": 1.8440485000610352, + "learning_rate": 5e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7128838300704956, + "num_tokens": 142802955.0, + "step": 5519 + }, + { + "epoch": 0.6061937184274105, + "grad_norm": 1.8856511116027832, + "learning_rate": 5e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7102411389350891, + "num_tokens": 142827910.0, + "step": 5520 + }, + { + "epoch": 0.6063035361300242, + "grad_norm": 1.7669312953948975, + "learning_rate": 5e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7017816305160522, + "num_tokens": 142856835.0, + "step": 5521 + }, + { + "epoch": 0.6064133538326378, + "grad_norm": 1.7650588750839233, + "learning_rate": 5e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7077218294143677, + "num_tokens": 142885214.0, + "step": 5522 + }, + { + "epoch": 0.6065231715352515, + "grad_norm": 2.0645558834075928, + "learning_rate": 5e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.6958508491516113, + "num_tokens": 142907349.0, + "step": 5523 + }, + { + "epoch": 0.6066329892378651, + "grad_norm": 1.6632453203201294, + "learning_rate": 5e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6847033500671387, + "num_tokens": 142940423.0, + "step": 5524 + }, + { + "epoch": 0.6067428069404788, + "grad_norm": 1.8604663610458374, + "learning_rate": 5e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7179403305053711, + "num_tokens": 142965407.0, + "step": 5525 + }, + { + "epoch": 0.6068526246430924, + "grad_norm": 1.8502061367034912, + "learning_rate": 5e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7135148644447327, + "num_tokens": 142991699.0, + "step": 5526 + }, + { + "epoch": 0.6069624423457062, + "grad_norm": 1.920541524887085, + "learning_rate": 5e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7044692039489746, + "num_tokens": 143014773.0, + "step": 5527 + }, + { + "epoch": 0.6070722600483198, + "grad_norm": 1.9416941404342651, + "learning_rate": 5e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6981394290924072, + "num_tokens": 143040949.0, + "step": 5528 + }, + { + "epoch": 0.6071820777509335, + "grad_norm": 1.8040027618408203, + "learning_rate": 5e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7029061317443848, + "num_tokens": 143070567.0, + "step": 5529 + }, + { + "epoch": 0.6072918954535471, + "grad_norm": 1.8950849771499634, + "learning_rate": 5e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6962556838989258, + "num_tokens": 143095850.0, + "step": 5530 + }, + { + "epoch": 0.6074017131561608, + "grad_norm": 1.7638888359069824, + "learning_rate": 5e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7007871866226196, + "num_tokens": 143123622.0, + "step": 5531 + }, + { + "epoch": 0.6075115308587744, + "grad_norm": 1.8587603569030762, + "learning_rate": 5e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6832948923110962, + "num_tokens": 143151933.0, + "step": 5532 + }, + { + "epoch": 0.607621348561388, + "grad_norm": 1.8452516794204712, + "learning_rate": 5e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7135943174362183, + "num_tokens": 143178995.0, + "step": 5533 + }, + { + "epoch": 0.6077311662640018, + "grad_norm": 1.7422550916671753, + "learning_rate": 5e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7116349935531616, + "num_tokens": 143203734.0, + "step": 5534 + }, + { + "epoch": 0.6078409839666155, + "grad_norm": 2.1206185817718506, + "learning_rate": 5e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7155063152313232, + "num_tokens": 143227149.0, + "step": 5535 + }, + { + "epoch": 0.6079508016692291, + "grad_norm": 1.7335889339447021, + "learning_rate": 5e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6826620101928711, + "num_tokens": 143256693.0, + "step": 5536 + }, + { + "epoch": 0.6080606193718427, + "grad_norm": 1.969838261604309, + "learning_rate": 5e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7058922052383423, + "num_tokens": 143279771.0, + "step": 5537 + }, + { + "epoch": 0.6081704370744564, + "grad_norm": 1.6779142618179321, + "learning_rate": 5e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7117280960083008, + "num_tokens": 143307941.0, + "step": 5538 + }, + { + "epoch": 0.60828025477707, + "grad_norm": 2.043869733810425, + "learning_rate": 5e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7118349075317383, + "num_tokens": 143329747.0, + "step": 5539 + }, + { + "epoch": 0.6083900724796837, + "grad_norm": 1.8303697109222412, + "learning_rate": 5e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7088054418563843, + "num_tokens": 143356459.0, + "step": 5540 + }, + { + "epoch": 0.6084998901822973, + "grad_norm": 1.7602671384811401, + "learning_rate": 5e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7139714360237122, + "num_tokens": 143385391.0, + "step": 5541 + }, + { + "epoch": 0.6086097078849111, + "grad_norm": 1.9984989166259766, + "learning_rate": 5e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7147508859634399, + "num_tokens": 143408603.0, + "step": 5542 + }, + { + "epoch": 0.6087195255875247, + "grad_norm": 1.8707844018936157, + "learning_rate": 5e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7117440700531006, + "num_tokens": 143433747.0, + "step": 5543 + }, + { + "epoch": 0.6088293432901384, + "grad_norm": 2.109880208969116, + "learning_rate": 5e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7007331252098083, + "num_tokens": 143457517.0, + "step": 5544 + }, + { + "epoch": 0.608939160992752, + "grad_norm": 1.8169128894805908, + "learning_rate": 5e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.687260627746582, + "num_tokens": 143487975.0, + "step": 5545 + }, + { + "epoch": 0.6090489786953657, + "grad_norm": 1.7704353332519531, + "learning_rate": 5e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7156803607940674, + "num_tokens": 143514935.0, + "step": 5546 + }, + { + "epoch": 0.6091587963979793, + "grad_norm": 1.8125553131103516, + "learning_rate": 5e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7068932056427002, + "num_tokens": 143538835.0, + "step": 5547 + }, + { + "epoch": 0.609268614100593, + "grad_norm": 1.849226474761963, + "learning_rate": 5e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7025341987609863, + "num_tokens": 143565863.0, + "step": 5548 + }, + { + "epoch": 0.6093784318032067, + "grad_norm": 1.8713970184326172, + "learning_rate": 5e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7030987739562988, + "num_tokens": 143591036.0, + "step": 5549 + }, + { + "epoch": 0.6094882495058204, + "grad_norm": 1.6939927339553833, + "learning_rate": 5e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.6967563629150391, + "num_tokens": 143621976.0, + "step": 5550 + }, + { + "epoch": 0.609598067208434, + "grad_norm": 1.756652593612671, + "learning_rate": 5e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.700943648815155, + "num_tokens": 143649637.0, + "step": 5551 + }, + { + "epoch": 0.6097078849110477, + "grad_norm": 2.0310304164886475, + "learning_rate": 5e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7202773094177246, + "num_tokens": 143670113.0, + "step": 5552 + }, + { + "epoch": 0.6098177026136613, + "grad_norm": 1.740708589553833, + "learning_rate": 5e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7102845907211304, + "num_tokens": 143700756.0, + "step": 5553 + }, + { + "epoch": 0.609927520316275, + "grad_norm": 2.1129534244537354, + "learning_rate": 5e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7344982624053955, + "num_tokens": 143720982.0, + "step": 5554 + }, + { + "epoch": 0.6100373380188886, + "grad_norm": 1.934179663658142, + "learning_rate": 5e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.71431565284729, + "num_tokens": 143746854.0, + "step": 5555 + }, + { + "epoch": 0.6101471557215024, + "grad_norm": 1.7983901500701904, + "learning_rate": 5e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6989012956619263, + "num_tokens": 143773395.0, + "step": 5556 + }, + { + "epoch": 0.610256973424116, + "grad_norm": 1.73501718044281, + "learning_rate": 5e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6943631172180176, + "num_tokens": 143801406.0, + "step": 5557 + }, + { + "epoch": 0.6103667911267296, + "grad_norm": 1.8673641681671143, + "learning_rate": 5e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7241353392601013, + "num_tokens": 143826545.0, + "step": 5558 + }, + { + "epoch": 0.6104766088293433, + "grad_norm": 2.116220474243164, + "learning_rate": 5e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7162559628486633, + "num_tokens": 143848287.0, + "step": 5559 + }, + { + "epoch": 0.6105864265319569, + "grad_norm": 1.9805247783660889, + "learning_rate": 5e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.703426718711853, + "num_tokens": 143871572.0, + "step": 5560 + }, + { + "epoch": 0.6106962442345706, + "grad_norm": 2.0370941162109375, + "learning_rate": 5e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6893224716186523, + "num_tokens": 143896573.0, + "step": 5561 + }, + { + "epoch": 0.6108060619371842, + "grad_norm": 1.9326801300048828, + "learning_rate": 5e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6972132921218872, + "num_tokens": 143921924.0, + "step": 5562 + }, + { + "epoch": 0.610915879639798, + "grad_norm": 1.955731987953186, + "learning_rate": 5e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7099825143814087, + "num_tokens": 143945896.0, + "step": 5563 + }, + { + "epoch": 0.6110256973424116, + "grad_norm": 1.6914604902267456, + "learning_rate": 5e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7051348090171814, + "num_tokens": 143976156.0, + "step": 5564 + }, + { + "epoch": 0.6111355150450253, + "grad_norm": 1.788210153579712, + "learning_rate": 5e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6857955455780029, + "num_tokens": 144007254.0, + "step": 5565 + }, + { + "epoch": 0.6112453327476389, + "grad_norm": 1.8995689153671265, + "learning_rate": 5e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.701880156993866, + "num_tokens": 144032213.0, + "step": 5566 + }, + { + "epoch": 0.6113551504502526, + "grad_norm": 1.7712008953094482, + "learning_rate": 5e-06, + "loss": 1.0944, + "mean_token_accuracy": 0.668962299823761, + "num_tokens": 144063315.0, + "step": 5567 + }, + { + "epoch": 0.6114649681528662, + "grad_norm": 2.034752607345581, + "learning_rate": 5e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.6987764835357666, + "num_tokens": 144085012.0, + "step": 5568 + }, + { + "epoch": 0.6115747858554799, + "grad_norm": 1.8422174453735352, + "learning_rate": 5e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.6825242042541504, + "num_tokens": 144114668.0, + "step": 5569 + }, + { + "epoch": 0.6116846035580935, + "grad_norm": 1.7840017080307007, + "learning_rate": 5e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7151931524276733, + "num_tokens": 144140296.0, + "step": 5570 + }, + { + "epoch": 0.6117944212607073, + "grad_norm": 1.946866750717163, + "learning_rate": 5e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6881556510925293, + "num_tokens": 144162759.0, + "step": 5571 + }, + { + "epoch": 0.6119042389633209, + "grad_norm": 1.6716125011444092, + "learning_rate": 5e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6940586566925049, + "num_tokens": 144191499.0, + "step": 5572 + }, + { + "epoch": 0.6120140566659346, + "grad_norm": 1.6730871200561523, + "learning_rate": 5e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.721131443977356, + "num_tokens": 144219568.0, + "step": 5573 + }, + { + "epoch": 0.6121238743685482, + "grad_norm": 1.864495873451233, + "learning_rate": 5e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.717406153678894, + "num_tokens": 144243124.0, + "step": 5574 + }, + { + "epoch": 0.6122336920711619, + "grad_norm": 1.6979929208755493, + "learning_rate": 5e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6888687014579773, + "num_tokens": 144274200.0, + "step": 5575 + }, + { + "epoch": 0.6123435097737755, + "grad_norm": 1.9377950429916382, + "learning_rate": 5e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6901181936264038, + "num_tokens": 144299323.0, + "step": 5576 + }, + { + "epoch": 0.6124533274763891, + "grad_norm": 1.8410675525665283, + "learning_rate": 5e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7037217617034912, + "num_tokens": 144324641.0, + "step": 5577 + }, + { + "epoch": 0.6125631451790029, + "grad_norm": 1.6737968921661377, + "learning_rate": 5e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7066627144813538, + "num_tokens": 144355902.0, + "step": 5578 + }, + { + "epoch": 0.6126729628816165, + "grad_norm": 1.7614517211914062, + "learning_rate": 5e-06, + "loss": 1.0853, + "mean_token_accuracy": 0.6777637004852295, + "num_tokens": 144384799.0, + "step": 5579 + }, + { + "epoch": 0.6127827805842302, + "grad_norm": 1.859699010848999, + "learning_rate": 5e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7010072469711304, + "num_tokens": 144410042.0, + "step": 5580 + }, + { + "epoch": 0.6128925982868438, + "grad_norm": 1.797520637512207, + "learning_rate": 5e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6968927383422852, + "num_tokens": 144437529.0, + "step": 5581 + }, + { + "epoch": 0.6130024159894575, + "grad_norm": 1.8854663372039795, + "learning_rate": 5e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.6985194683074951, + "num_tokens": 144461484.0, + "step": 5582 + }, + { + "epoch": 0.6131122336920711, + "grad_norm": 1.8127254247665405, + "learning_rate": 5e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6864736080169678, + "num_tokens": 144487794.0, + "step": 5583 + }, + { + "epoch": 0.6132220513946848, + "grad_norm": 1.8129626512527466, + "learning_rate": 5e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7448176145553589, + "num_tokens": 144511304.0, + "step": 5584 + }, + { + "epoch": 0.6133318690972985, + "grad_norm": 2.034108877182007, + "learning_rate": 5e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.696169376373291, + "num_tokens": 144533149.0, + "step": 5585 + }, + { + "epoch": 0.6134416867999122, + "grad_norm": 2.100687026977539, + "learning_rate": 5e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7089099884033203, + "num_tokens": 144553481.0, + "step": 5586 + }, + { + "epoch": 0.6135515045025258, + "grad_norm": 1.9665186405181885, + "learning_rate": 5e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7272394299507141, + "num_tokens": 144576636.0, + "step": 5587 + }, + { + "epoch": 0.6136613222051395, + "grad_norm": 1.9373109340667725, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7213391065597534, + "num_tokens": 144599277.0, + "step": 5588 + }, + { + "epoch": 0.6137711399077531, + "grad_norm": 1.6270854473114014, + "learning_rate": 5e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7185395956039429, + "num_tokens": 144631250.0, + "step": 5589 + }, + { + "epoch": 0.6138809576103668, + "grad_norm": 1.9539438486099243, + "learning_rate": 5e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7198014259338379, + "num_tokens": 144653351.0, + "step": 5590 + }, + { + "epoch": 0.6139907753129804, + "grad_norm": 1.981245994567871, + "learning_rate": 5e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.712775707244873, + "num_tokens": 144675214.0, + "step": 5591 + }, + { + "epoch": 0.6141005930155942, + "grad_norm": 1.7260578870773315, + "learning_rate": 5e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7187875509262085, + "num_tokens": 144703809.0, + "step": 5592 + }, + { + "epoch": 0.6142104107182078, + "grad_norm": 1.978411316871643, + "learning_rate": 5e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.691760778427124, + "num_tokens": 144730736.0, + "step": 5593 + }, + { + "epoch": 0.6143202284208215, + "grad_norm": 1.9493975639343262, + "learning_rate": 5e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7177762985229492, + "num_tokens": 144752899.0, + "step": 5594 + }, + { + "epoch": 0.6144300461234351, + "grad_norm": 1.8358157873153687, + "learning_rate": 5e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7130379676818848, + "num_tokens": 144778339.0, + "step": 5595 + }, + { + "epoch": 0.6145398638260487, + "grad_norm": 1.736380934715271, + "learning_rate": 5e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7203468084335327, + "num_tokens": 144801813.0, + "step": 5596 + }, + { + "epoch": 0.6146496815286624, + "grad_norm": 1.8510122299194336, + "learning_rate": 5e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6867706775665283, + "num_tokens": 144828988.0, + "step": 5597 + }, + { + "epoch": 0.614759499231276, + "grad_norm": 1.9709104299545288, + "learning_rate": 5e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.697486937046051, + "num_tokens": 144853054.0, + "step": 5598 + }, + { + "epoch": 0.6148693169338898, + "grad_norm": 1.9086023569107056, + "learning_rate": 5e-06, + "loss": 1.0618, + "mean_token_accuracy": 0.6833181381225586, + "num_tokens": 144880402.0, + "step": 5599 + }, + { + "epoch": 0.6149791346365034, + "grad_norm": 1.9183781147003174, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7363316416740417, + "num_tokens": 144903389.0, + "step": 5600 + }, + { + "epoch": 0.6150889523391171, + "grad_norm": 1.9025697708129883, + "learning_rate": 5e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7120852470397949, + "num_tokens": 144927699.0, + "step": 5601 + }, + { + "epoch": 0.6151987700417307, + "grad_norm": 1.977475881576538, + "learning_rate": 5e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6798790693283081, + "num_tokens": 144954708.0, + "step": 5602 + }, + { + "epoch": 0.6153085877443444, + "grad_norm": 1.795078158378601, + "learning_rate": 5e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6968204379081726, + "num_tokens": 144983477.0, + "step": 5603 + }, + { + "epoch": 0.615418405446958, + "grad_norm": 1.8502060174942017, + "learning_rate": 5e-06, + "loss": 1.0784, + "mean_token_accuracy": 0.6767832040786743, + "num_tokens": 145012095.0, + "step": 5604 + }, + { + "epoch": 0.6155282231495717, + "grad_norm": 1.8828492164611816, + "learning_rate": 5e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6985294818878174, + "num_tokens": 145037230.0, + "step": 5605 + }, + { + "epoch": 0.6156380408521853, + "grad_norm": 1.7717249393463135, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.731415867805481, + "num_tokens": 145064621.0, + "step": 5606 + }, + { + "epoch": 0.6157478585547991, + "grad_norm": 1.8775522708892822, + "learning_rate": 5e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6896318197250366, + "num_tokens": 145091398.0, + "step": 5607 + }, + { + "epoch": 0.6158576762574127, + "grad_norm": 1.8257101774215698, + "learning_rate": 5e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6950209140777588, + "num_tokens": 145119018.0, + "step": 5608 + }, + { + "epoch": 0.6159674939600264, + "grad_norm": 1.8753163814544678, + "learning_rate": 5e-06, + "loss": 1.066, + "mean_token_accuracy": 0.6773715019226074, + "num_tokens": 145146020.0, + "step": 5609 + }, + { + "epoch": 0.61607731166264, + "grad_norm": 1.8901557922363281, + "learning_rate": 5e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7041707038879395, + "num_tokens": 145170097.0, + "step": 5610 + }, + { + "epoch": 0.6161871293652537, + "grad_norm": 1.8755567073822021, + "learning_rate": 5e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7026163339614868, + "num_tokens": 145194322.0, + "step": 5611 + }, + { + "epoch": 0.6162969470678673, + "grad_norm": 1.7758674621582031, + "learning_rate": 5e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7343952655792236, + "num_tokens": 145219858.0, + "step": 5612 + }, + { + "epoch": 0.616406764770481, + "grad_norm": 1.7206461429595947, + "learning_rate": 5e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7274860143661499, + "num_tokens": 145246215.0, + "step": 5613 + }, + { + "epoch": 0.6165165824730947, + "grad_norm": 1.713202714920044, + "learning_rate": 5e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7345105409622192, + "num_tokens": 145274326.0, + "step": 5614 + }, + { + "epoch": 0.6166264001757084, + "grad_norm": 2.039591073989868, + "learning_rate": 5e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.713595986366272, + "num_tokens": 145295399.0, + "step": 5615 + }, + { + "epoch": 0.616736217878322, + "grad_norm": 1.9962822198867798, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.718208372592926, + "num_tokens": 145317796.0, + "step": 5616 + }, + { + "epoch": 0.6168460355809356, + "grad_norm": 1.781469464302063, + "learning_rate": 5e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.694115400314331, + "num_tokens": 145347683.0, + "step": 5617 + }, + { + "epoch": 0.6169558532835493, + "grad_norm": 2.099240303039551, + "learning_rate": 5e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7361688017845154, + "num_tokens": 145367264.0, + "step": 5618 + }, + { + "epoch": 0.6170656709861629, + "grad_norm": 1.9836671352386475, + "learning_rate": 5e-06, + "loss": 1.0541, + "mean_token_accuracy": 0.67821204662323, + "num_tokens": 145391669.0, + "step": 5619 + }, + { + "epoch": 0.6171754886887766, + "grad_norm": 1.8117296695709229, + "learning_rate": 5e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6811395883560181, + "num_tokens": 145420432.0, + "step": 5620 + }, + { + "epoch": 0.6172853063913903, + "grad_norm": 1.6407687664031982, + "learning_rate": 5e-06, + "loss": 1.0707, + "mean_token_accuracy": 0.6832002401351929, + "num_tokens": 145451780.0, + "step": 5621 + }, + { + "epoch": 0.617395124094004, + "grad_norm": 1.8013442754745483, + "learning_rate": 5e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.710384726524353, + "num_tokens": 145477261.0, + "step": 5622 + }, + { + "epoch": 0.6175049417966176, + "grad_norm": 1.8363213539123535, + "learning_rate": 5e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7287101745605469, + "num_tokens": 145500054.0, + "step": 5623 + }, + { + "epoch": 0.6176147594992313, + "grad_norm": 2.061643362045288, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7054204940795898, + "num_tokens": 145524164.0, + "step": 5624 + }, + { + "epoch": 0.6177245772018449, + "grad_norm": 2.1194190979003906, + "learning_rate": 5e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7195949554443359, + "num_tokens": 145542129.0, + "step": 5625 + }, + { + "epoch": 0.6178343949044586, + "grad_norm": 1.8072561025619507, + "learning_rate": 5e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6930128931999207, + "num_tokens": 145572486.0, + "step": 5626 + }, + { + "epoch": 0.6179442126070722, + "grad_norm": 1.9667750597000122, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7122358083724976, + "num_tokens": 145594389.0, + "step": 5627 + }, + { + "epoch": 0.618054030309686, + "grad_norm": 1.966597080230713, + "learning_rate": 5e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7053570747375488, + "num_tokens": 145617777.0, + "step": 5628 + }, + { + "epoch": 0.6181638480122996, + "grad_norm": 1.7483789920806885, + "learning_rate": 5e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6934890747070312, + "num_tokens": 145649080.0, + "step": 5629 + }, + { + "epoch": 0.6182736657149133, + "grad_norm": 1.925046682357788, + "learning_rate": 5e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7133221626281738, + "num_tokens": 145674263.0, + "step": 5630 + }, + { + "epoch": 0.6183834834175269, + "grad_norm": 2.111196994781494, + "learning_rate": 5e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6909164190292358, + "num_tokens": 145695183.0, + "step": 5631 + }, + { + "epoch": 0.6184933011201406, + "grad_norm": 1.7772271633148193, + "learning_rate": 5e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6912623047828674, + "num_tokens": 145728692.0, + "step": 5632 + }, + { + "epoch": 0.6186031188227542, + "grad_norm": 1.8058066368103027, + "learning_rate": 5e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.726276159286499, + "num_tokens": 145752382.0, + "step": 5633 + }, + { + "epoch": 0.6187129365253679, + "grad_norm": 1.9688231945037842, + "learning_rate": 5e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.7004848718643188, + "num_tokens": 145775642.0, + "step": 5634 + }, + { + "epoch": 0.6188227542279815, + "grad_norm": 1.9518128633499146, + "learning_rate": 5e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6973059773445129, + "num_tokens": 145801297.0, + "step": 5635 + }, + { + "epoch": 0.6189325719305953, + "grad_norm": 2.0069823265075684, + "learning_rate": 5e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7042381763458252, + "num_tokens": 145826132.0, + "step": 5636 + }, + { + "epoch": 0.6190423896332089, + "grad_norm": 1.8837766647338867, + "learning_rate": 5e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7064571380615234, + "num_tokens": 145852114.0, + "step": 5637 + }, + { + "epoch": 0.6191522073358225, + "grad_norm": 1.7823286056518555, + "learning_rate": 5e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7030715942382812, + "num_tokens": 145882029.0, + "step": 5638 + }, + { + "epoch": 0.6192620250384362, + "grad_norm": 1.7648115158081055, + "learning_rate": 5e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7003307342529297, + "num_tokens": 145911806.0, + "step": 5639 + }, + { + "epoch": 0.6193718427410498, + "grad_norm": 1.6853302717208862, + "learning_rate": 5e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6841288208961487, + "num_tokens": 145942131.0, + "step": 5640 + }, + { + "epoch": 0.6194816604436635, + "grad_norm": 1.9919673204421997, + "learning_rate": 5e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7070137858390808, + "num_tokens": 145966556.0, + "step": 5641 + }, + { + "epoch": 0.6195914781462771, + "grad_norm": 2.0783607959747314, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7271078824996948, + "num_tokens": 145985922.0, + "step": 5642 + }, + { + "epoch": 0.6197012958488909, + "grad_norm": 1.8147433996200562, + "learning_rate": 5e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.709355354309082, + "num_tokens": 146011027.0, + "step": 5643 + }, + { + "epoch": 0.6198111135515045, + "grad_norm": 1.9443308115005493, + "learning_rate": 5e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7098656892776489, + "num_tokens": 146037586.0, + "step": 5644 + }, + { + "epoch": 0.6199209312541182, + "grad_norm": 1.8626035451889038, + "learning_rate": 5e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7374864816665649, + "num_tokens": 146062229.0, + "step": 5645 + }, + { + "epoch": 0.6200307489567318, + "grad_norm": 1.8448041677474976, + "learning_rate": 5e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6940355896949768, + "num_tokens": 146092840.0, + "step": 5646 + }, + { + "epoch": 0.6201405666593455, + "grad_norm": 1.9823460578918457, + "learning_rate": 5e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6952261924743652, + "num_tokens": 146118266.0, + "step": 5647 + }, + { + "epoch": 0.6202503843619591, + "grad_norm": 1.7600491046905518, + "learning_rate": 5e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6873773336410522, + "num_tokens": 146149428.0, + "step": 5648 + }, + { + "epoch": 0.6203602020645728, + "grad_norm": 1.9517697095870972, + "learning_rate": 5e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7036393284797668, + "num_tokens": 146174036.0, + "step": 5649 + }, + { + "epoch": 0.6204700197671865, + "grad_norm": 1.8958015441894531, + "learning_rate": 5e-06, + "loss": 0.967, + "mean_token_accuracy": 0.704338550567627, + "num_tokens": 146201549.0, + "step": 5650 + }, + { + "epoch": 0.6205798374698002, + "grad_norm": 1.8500200510025024, + "learning_rate": 5e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.6996814012527466, + "num_tokens": 146228905.0, + "step": 5651 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 1.893261194229126, + "learning_rate": 5e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7182268500328064, + "num_tokens": 146253126.0, + "step": 5652 + }, + { + "epoch": 0.6207994728750275, + "grad_norm": 2.08251953125, + "learning_rate": 5e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6955043077468872, + "num_tokens": 146276457.0, + "step": 5653 + }, + { + "epoch": 0.6209092905776411, + "grad_norm": 1.6957095861434937, + "learning_rate": 5e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6871453523635864, + "num_tokens": 146306072.0, + "step": 5654 + }, + { + "epoch": 0.6210191082802548, + "grad_norm": 1.98270845413208, + "learning_rate": 5e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7019034624099731, + "num_tokens": 146330418.0, + "step": 5655 + }, + { + "epoch": 0.6211289259828684, + "grad_norm": 1.868430495262146, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7162121534347534, + "num_tokens": 146357494.0, + "step": 5656 + }, + { + "epoch": 0.6212387436854822, + "grad_norm": 1.7519217729568481, + "learning_rate": 5e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6896231174468994, + "num_tokens": 146386123.0, + "step": 5657 + }, + { + "epoch": 0.6213485613880958, + "grad_norm": 1.9231048822402954, + "learning_rate": 5e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7258222699165344, + "num_tokens": 146410236.0, + "step": 5658 + }, + { + "epoch": 0.6214583790907094, + "grad_norm": 2.068244457244873, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7219469547271729, + "num_tokens": 146430131.0, + "step": 5659 + }, + { + "epoch": 0.6215681967933231, + "grad_norm": 1.7913010120391846, + "learning_rate": 5e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7185876369476318, + "num_tokens": 146455584.0, + "step": 5660 + }, + { + "epoch": 0.6216780144959367, + "grad_norm": 1.7296913862228394, + "learning_rate": 5e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.688654363155365, + "num_tokens": 146484113.0, + "step": 5661 + }, + { + "epoch": 0.6217878321985504, + "grad_norm": 1.6847652196884155, + "learning_rate": 5e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6805882453918457, + "num_tokens": 146514785.0, + "step": 5662 + }, + { + "epoch": 0.621897649901164, + "grad_norm": 1.625657320022583, + "learning_rate": 5e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6880068778991699, + "num_tokens": 146547557.0, + "step": 5663 + }, + { + "epoch": 0.6220074676037777, + "grad_norm": 2.0978989601135254, + "learning_rate": 5e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6912039518356323, + "num_tokens": 146567803.0, + "step": 5664 + }, + { + "epoch": 0.6221172853063914, + "grad_norm": 1.788865089416504, + "learning_rate": 5e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6939429640769958, + "num_tokens": 146595897.0, + "step": 5665 + }, + { + "epoch": 0.6222271030090051, + "grad_norm": 1.7542359828948975, + "learning_rate": 5e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6981314420700073, + "num_tokens": 146626517.0, + "step": 5666 + }, + { + "epoch": 0.6223369207116187, + "grad_norm": 1.9326813220977783, + "learning_rate": 5e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7027866840362549, + "num_tokens": 146648673.0, + "step": 5667 + }, + { + "epoch": 0.6224467384142324, + "grad_norm": 1.9853382110595703, + "learning_rate": 5e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6819497346878052, + "num_tokens": 146671872.0, + "step": 5668 + }, + { + "epoch": 0.622556556116846, + "grad_norm": 1.7284797430038452, + "learning_rate": 5e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6960927248001099, + "num_tokens": 146700541.0, + "step": 5669 + }, + { + "epoch": 0.6226663738194597, + "grad_norm": 1.7595144510269165, + "learning_rate": 5e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7185826301574707, + "num_tokens": 146725156.0, + "step": 5670 + }, + { + "epoch": 0.6227761915220733, + "grad_norm": 1.8399481773376465, + "learning_rate": 5e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.699559211730957, + "num_tokens": 146748011.0, + "step": 5671 + }, + { + "epoch": 0.6228860092246871, + "grad_norm": 1.7829878330230713, + "learning_rate": 5e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6876938343048096, + "num_tokens": 146779035.0, + "step": 5672 + }, + { + "epoch": 0.6229958269273007, + "grad_norm": 1.8607460260391235, + "learning_rate": 5e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.696043848991394, + "num_tokens": 146805918.0, + "step": 5673 + }, + { + "epoch": 0.6231056446299144, + "grad_norm": 1.9167311191558838, + "learning_rate": 5e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7203090786933899, + "num_tokens": 146828464.0, + "step": 5674 + }, + { + "epoch": 0.623215462332528, + "grad_norm": 1.7821576595306396, + "learning_rate": 5e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6915638446807861, + "num_tokens": 146855663.0, + "step": 5675 + }, + { + "epoch": 0.6233252800351416, + "grad_norm": 1.9839556217193604, + "learning_rate": 5e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7091670632362366, + "num_tokens": 146877413.0, + "step": 5676 + }, + { + "epoch": 0.6234350977377553, + "grad_norm": 1.7941153049468994, + "learning_rate": 5e-06, + "loss": 1.08, + "mean_token_accuracy": 0.67911696434021, + "num_tokens": 146906148.0, + "step": 5677 + }, + { + "epoch": 0.6235449154403689, + "grad_norm": 1.5858564376831055, + "learning_rate": 5e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6883363127708435, + "num_tokens": 146941721.0, + "step": 5678 + }, + { + "epoch": 0.6236547331429827, + "grad_norm": 1.9313141107559204, + "learning_rate": 5e-06, + "loss": 1.0852, + "mean_token_accuracy": 0.6776633262634277, + "num_tokens": 146968045.0, + "step": 5679 + }, + { + "epoch": 0.6237645508455963, + "grad_norm": 1.6957402229309082, + "learning_rate": 5e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6897467374801636, + "num_tokens": 146996287.0, + "step": 5680 + }, + { + "epoch": 0.62387436854821, + "grad_norm": 1.914899468421936, + "learning_rate": 5e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6948076486587524, + "num_tokens": 147021526.0, + "step": 5681 + }, + { + "epoch": 0.6239841862508236, + "grad_norm": 1.9927538633346558, + "learning_rate": 5e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7295931577682495, + "num_tokens": 147042860.0, + "step": 5682 + }, + { + "epoch": 0.6240940039534373, + "grad_norm": 1.750793695449829, + "learning_rate": 5e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6925613880157471, + "num_tokens": 147069723.0, + "step": 5683 + }, + { + "epoch": 0.6242038216560509, + "grad_norm": 1.8386090993881226, + "learning_rate": 5e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7078971862792969, + "num_tokens": 147095390.0, + "step": 5684 + }, + { + "epoch": 0.6243136393586646, + "grad_norm": 1.8845645189285278, + "learning_rate": 5e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6954128742218018, + "num_tokens": 147123097.0, + "step": 5685 + }, + { + "epoch": 0.6244234570612783, + "grad_norm": 1.8522638082504272, + "learning_rate": 5e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7209625840187073, + "num_tokens": 147147183.0, + "step": 5686 + }, + { + "epoch": 0.624533274763892, + "grad_norm": 1.5309444665908813, + "learning_rate": 5e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6828325986862183, + "num_tokens": 147183404.0, + "step": 5687 + }, + { + "epoch": 0.6246430924665056, + "grad_norm": 2.0179970264434814, + "learning_rate": 5e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7022432684898376, + "num_tokens": 147203345.0, + "step": 5688 + }, + { + "epoch": 0.6247529101691193, + "grad_norm": 1.9775909185409546, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7200301885604858, + "num_tokens": 147224469.0, + "step": 5689 + }, + { + "epoch": 0.6248627278717329, + "grad_norm": 2.004526138305664, + "learning_rate": 5e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6929723024368286, + "num_tokens": 147247065.0, + "step": 5690 + }, + { + "epoch": 0.6249725455743466, + "grad_norm": 1.7432681322097778, + "learning_rate": 5e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7027984857559204, + "num_tokens": 147275879.0, + "step": 5691 + }, + { + "epoch": 0.6250823632769602, + "grad_norm": 2.094459056854248, + "learning_rate": 5e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7050657272338867, + "num_tokens": 147296750.0, + "step": 5692 + }, + { + "epoch": 0.6251921809795739, + "grad_norm": 1.9737353324890137, + "learning_rate": 5e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7129843235015869, + "num_tokens": 147322015.0, + "step": 5693 + }, + { + "epoch": 0.6253019986821876, + "grad_norm": 1.603757619857788, + "learning_rate": 5e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7176693677902222, + "num_tokens": 147356464.0, + "step": 5694 + }, + { + "epoch": 0.6254118163848013, + "grad_norm": 1.9547351598739624, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7141646146774292, + "num_tokens": 147378665.0, + "step": 5695 + }, + { + "epoch": 0.6255216340874149, + "grad_norm": 1.82142972946167, + "learning_rate": 5e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7173714637756348, + "num_tokens": 147406032.0, + "step": 5696 + }, + { + "epoch": 0.6256314517900285, + "grad_norm": 1.8145779371261597, + "learning_rate": 5e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7072065472602844, + "num_tokens": 147431489.0, + "step": 5697 + }, + { + "epoch": 0.6257412694926422, + "grad_norm": 2.0548858642578125, + "learning_rate": 5e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.70548415184021, + "num_tokens": 147457186.0, + "step": 5698 + }, + { + "epoch": 0.6258510871952558, + "grad_norm": 1.9492031335830688, + "learning_rate": 5e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.6965223550796509, + "num_tokens": 147482877.0, + "step": 5699 + }, + { + "epoch": 0.6259609048978695, + "grad_norm": 1.920424461364746, + "learning_rate": 5e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7030270099639893, + "num_tokens": 147506963.0, + "step": 5700 + }, + { + "epoch": 0.6260707226004832, + "grad_norm": 1.8950837850570679, + "learning_rate": 5e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.69344162940979, + "num_tokens": 147534077.0, + "step": 5701 + }, + { + "epoch": 0.6261805403030969, + "grad_norm": 2.1037652492523193, + "learning_rate": 5e-06, + "loss": 1.1087, + "mean_token_accuracy": 0.674737811088562, + "num_tokens": 147558368.0, + "step": 5702 + }, + { + "epoch": 0.6262903580057105, + "grad_norm": 1.7454248666763306, + "learning_rate": 5e-06, + "loss": 1.029, + "mean_token_accuracy": 0.689497709274292, + "num_tokens": 147587150.0, + "step": 5703 + }, + { + "epoch": 0.6264001757083242, + "grad_norm": 1.6947122812271118, + "learning_rate": 5e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6844956874847412, + "num_tokens": 147615168.0, + "step": 5704 + }, + { + "epoch": 0.6265099934109378, + "grad_norm": 1.8680641651153564, + "learning_rate": 5e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6926775574684143, + "num_tokens": 147641190.0, + "step": 5705 + }, + { + "epoch": 0.6266198111135515, + "grad_norm": 1.8973859548568726, + "learning_rate": 5e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7207159399986267, + "num_tokens": 147663008.0, + "step": 5706 + }, + { + "epoch": 0.6267296288161651, + "grad_norm": 1.7712641954421997, + "learning_rate": 5e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6867038607597351, + "num_tokens": 147691571.0, + "step": 5707 + }, + { + "epoch": 0.6268394465187789, + "grad_norm": 1.7984970808029175, + "learning_rate": 5e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7093427181243896, + "num_tokens": 147718645.0, + "step": 5708 + }, + { + "epoch": 0.6269492642213925, + "grad_norm": 1.9581049680709839, + "learning_rate": 5e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.6999580264091492, + "num_tokens": 147743571.0, + "step": 5709 + }, + { + "epoch": 0.6270590819240062, + "grad_norm": 2.022031784057617, + "learning_rate": 5e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7014062404632568, + "num_tokens": 147768628.0, + "step": 5710 + }, + { + "epoch": 0.6271688996266198, + "grad_norm": 1.87026846408844, + "learning_rate": 5e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7050628662109375, + "num_tokens": 147796921.0, + "step": 5711 + }, + { + "epoch": 0.6272787173292335, + "grad_norm": 1.9765468835830688, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7302941083908081, + "num_tokens": 147818711.0, + "step": 5712 + }, + { + "epoch": 0.6273885350318471, + "grad_norm": 1.861090898513794, + "learning_rate": 5e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7111237049102783, + "num_tokens": 147846705.0, + "step": 5713 + }, + { + "epoch": 0.6274983527344608, + "grad_norm": 2.016277313232422, + "learning_rate": 5e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6928709745407104, + "num_tokens": 147871044.0, + "step": 5714 + }, + { + "epoch": 0.6276081704370745, + "grad_norm": 1.6848022937774658, + "learning_rate": 5e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7201945781707764, + "num_tokens": 147902690.0, + "step": 5715 + }, + { + "epoch": 0.6277179881396882, + "grad_norm": 1.6790398359298706, + "learning_rate": 5e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.711432933807373, + "num_tokens": 147933342.0, + "step": 5716 + }, + { + "epoch": 0.6278278058423018, + "grad_norm": 1.7944291830062866, + "learning_rate": 5e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7041424512863159, + "num_tokens": 147959308.0, + "step": 5717 + }, + { + "epoch": 0.6279376235449154, + "grad_norm": 1.9416923522949219, + "learning_rate": 5e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6884254217147827, + "num_tokens": 147983982.0, + "step": 5718 + }, + { + "epoch": 0.6280474412475291, + "grad_norm": 1.8681838512420654, + "learning_rate": 5e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7319967150688171, + "num_tokens": 148005577.0, + "step": 5719 + }, + { + "epoch": 0.6281572589501427, + "grad_norm": 1.9735348224639893, + "learning_rate": 5e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6956559419631958, + "num_tokens": 148028100.0, + "step": 5720 + }, + { + "epoch": 0.6282670766527564, + "grad_norm": 1.9596936702728271, + "learning_rate": 5e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6982817649841309, + "num_tokens": 148051499.0, + "step": 5721 + }, + { + "epoch": 0.62837689435537, + "grad_norm": 2.0498971939086914, + "learning_rate": 5e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7259956002235413, + "num_tokens": 148072581.0, + "step": 5722 + }, + { + "epoch": 0.6284867120579838, + "grad_norm": 1.9626131057739258, + "learning_rate": 5e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6895277500152588, + "num_tokens": 148097368.0, + "step": 5723 + }, + { + "epoch": 0.6285965297605974, + "grad_norm": 1.9017493724822998, + "learning_rate": 5e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7089862823486328, + "num_tokens": 148123237.0, + "step": 5724 + }, + { + "epoch": 0.6287063474632111, + "grad_norm": 2.22123646736145, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6975795030593872, + "num_tokens": 148144594.0, + "step": 5725 + }, + { + "epoch": 0.6288161651658247, + "grad_norm": 1.7573884725570679, + "learning_rate": 5e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6805041432380676, + "num_tokens": 148174005.0, + "step": 5726 + }, + { + "epoch": 0.6289259828684384, + "grad_norm": 2.161179780960083, + "learning_rate": 5e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7061617374420166, + "num_tokens": 148194195.0, + "step": 5727 + }, + { + "epoch": 0.629035800571052, + "grad_norm": 1.633742094039917, + "learning_rate": 5e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7118560075759888, + "num_tokens": 148224782.0, + "step": 5728 + }, + { + "epoch": 0.6291456182736657, + "grad_norm": 1.6401535272598267, + "learning_rate": 5e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6940613389015198, + "num_tokens": 148255325.0, + "step": 5729 + }, + { + "epoch": 0.6292554359762794, + "grad_norm": 2.093447685241699, + "learning_rate": 5e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7230032682418823, + "num_tokens": 148275624.0, + "step": 5730 + }, + { + "epoch": 0.6293652536788931, + "grad_norm": 1.6175044775009155, + "learning_rate": 5e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6950252056121826, + "num_tokens": 148308418.0, + "step": 5731 + }, + { + "epoch": 0.6294750713815067, + "grad_norm": 1.7067961692810059, + "learning_rate": 5e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6956934928894043, + "num_tokens": 148337373.0, + "step": 5732 + }, + { + "epoch": 0.6295848890841204, + "grad_norm": 1.7655515670776367, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7377855777740479, + "num_tokens": 148363073.0, + "step": 5733 + }, + { + "epoch": 0.629694706786734, + "grad_norm": 1.7804292440414429, + "learning_rate": 5e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6893197298049927, + "num_tokens": 148390096.0, + "step": 5734 + }, + { + "epoch": 0.6298045244893477, + "grad_norm": 1.7637836933135986, + "learning_rate": 5e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.683276891708374, + "num_tokens": 148419027.0, + "step": 5735 + }, + { + "epoch": 0.6299143421919613, + "grad_norm": 1.7482025623321533, + "learning_rate": 5e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7114164233207703, + "num_tokens": 148445455.0, + "step": 5736 + }, + { + "epoch": 0.630024159894575, + "grad_norm": 2.0726096630096436, + "learning_rate": 5e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7325789332389832, + "num_tokens": 148464312.0, + "step": 5737 + }, + { + "epoch": 0.6301339775971887, + "grad_norm": 1.7286920547485352, + "learning_rate": 5e-06, + "loss": 1.0778, + "mean_token_accuracy": 0.6764376163482666, + "num_tokens": 148497003.0, + "step": 5738 + }, + { + "epoch": 0.6302437952998023, + "grad_norm": 1.6647191047668457, + "learning_rate": 5e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6961352825164795, + "num_tokens": 148524419.0, + "step": 5739 + }, + { + "epoch": 0.630353613002416, + "grad_norm": 2.1162545680999756, + "learning_rate": 5e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7126452326774597, + "num_tokens": 148544507.0, + "step": 5740 + }, + { + "epoch": 0.6304634307050296, + "grad_norm": 1.9519603252410889, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7239318490028381, + "num_tokens": 148566691.0, + "step": 5741 + }, + { + "epoch": 0.6305732484076433, + "grad_norm": 1.8246722221374512, + "learning_rate": 5e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.6991671323776245, + "num_tokens": 148593572.0, + "step": 5742 + }, + { + "epoch": 0.6306830661102569, + "grad_norm": 2.104052782058716, + "learning_rate": 5e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7185577750205994, + "num_tokens": 148612845.0, + "step": 5743 + }, + { + "epoch": 0.6307928838128707, + "grad_norm": 1.8743456602096558, + "learning_rate": 5e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7319016456604004, + "num_tokens": 148638041.0, + "step": 5744 + }, + { + "epoch": 0.6309027015154843, + "grad_norm": 2.122124433517456, + "learning_rate": 5e-06, + "loss": 1.0795, + "mean_token_accuracy": 0.6833065152168274, + "num_tokens": 148661830.0, + "step": 5745 + }, + { + "epoch": 0.631012519218098, + "grad_norm": 1.6852481365203857, + "learning_rate": 5e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7034021019935608, + "num_tokens": 148690702.0, + "step": 5746 + }, + { + "epoch": 0.6311223369207116, + "grad_norm": 1.6819919347763062, + "learning_rate": 5e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7262859344482422, + "num_tokens": 148716491.0, + "step": 5747 + }, + { + "epoch": 0.6312321546233253, + "grad_norm": 1.9988635778427124, + "learning_rate": 5e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7043979167938232, + "num_tokens": 148738801.0, + "step": 5748 + }, + { + "epoch": 0.6313419723259389, + "grad_norm": 2.0123274326324463, + "learning_rate": 5e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6922727823257446, + "num_tokens": 148760844.0, + "step": 5749 + }, + { + "epoch": 0.6314517900285526, + "grad_norm": 1.7317259311676025, + "learning_rate": 5e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7175995707511902, + "num_tokens": 148788155.0, + "step": 5750 + }, + { + "epoch": 0.6315616077311663, + "grad_norm": 1.8025610446929932, + "learning_rate": 5e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.70147305727005, + "num_tokens": 148814429.0, + "step": 5751 + }, + { + "epoch": 0.63167142543378, + "grad_norm": 2.0771825313568115, + "learning_rate": 5e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.6990888118743896, + "num_tokens": 148835607.0, + "step": 5752 + }, + { + "epoch": 0.6317812431363936, + "grad_norm": 1.8321810960769653, + "learning_rate": 5e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7339136600494385, + "num_tokens": 148859691.0, + "step": 5753 + }, + { + "epoch": 0.6318910608390073, + "grad_norm": 1.7768306732177734, + "learning_rate": 5e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6965055465698242, + "num_tokens": 148885986.0, + "step": 5754 + }, + { + "epoch": 0.6320008785416209, + "grad_norm": 1.917144536972046, + "learning_rate": 5e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7219148874282837, + "num_tokens": 148908861.0, + "step": 5755 + }, + { + "epoch": 0.6321106962442345, + "grad_norm": 1.6666829586029053, + "learning_rate": 5e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7147285342216492, + "num_tokens": 148938283.0, + "step": 5756 + }, + { + "epoch": 0.6322205139468482, + "grad_norm": 1.8286998271942139, + "learning_rate": 5e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7201051712036133, + "num_tokens": 148960933.0, + "step": 5757 + }, + { + "epoch": 0.6323303316494618, + "grad_norm": 1.9025135040283203, + "learning_rate": 5e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7177188396453857, + "num_tokens": 148986862.0, + "step": 5758 + }, + { + "epoch": 0.6324401493520756, + "grad_norm": 1.9852049350738525, + "learning_rate": 5e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7175954580307007, + "num_tokens": 149008144.0, + "step": 5759 + }, + { + "epoch": 0.6325499670546892, + "grad_norm": 1.7948172092437744, + "learning_rate": 5e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7142056822776794, + "num_tokens": 149034209.0, + "step": 5760 + }, + { + "epoch": 0.6326597847573029, + "grad_norm": 1.6085563898086548, + "learning_rate": 5e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6950849890708923, + "num_tokens": 149065907.0, + "step": 5761 + }, + { + "epoch": 0.6327696024599165, + "grad_norm": 1.7491099834442139, + "learning_rate": 5e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.717204749584198, + "num_tokens": 149091717.0, + "step": 5762 + }, + { + "epoch": 0.6328794201625302, + "grad_norm": 1.6894601583480835, + "learning_rate": 5e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7060291171073914, + "num_tokens": 149121629.0, + "step": 5763 + }, + { + "epoch": 0.6329892378651438, + "grad_norm": 1.8995847702026367, + "learning_rate": 5e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6957353353500366, + "num_tokens": 149146870.0, + "step": 5764 + }, + { + "epoch": 0.6330990555677575, + "grad_norm": 1.7381895780563354, + "learning_rate": 5e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7139699459075928, + "num_tokens": 149173235.0, + "step": 5765 + }, + { + "epoch": 0.6332088732703712, + "grad_norm": 1.9715977907180786, + "learning_rate": 5e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7064663767814636, + "num_tokens": 149194871.0, + "step": 5766 + }, + { + "epoch": 0.6333186909729849, + "grad_norm": 1.7403075695037842, + "learning_rate": 5e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7044351100921631, + "num_tokens": 149223598.0, + "step": 5767 + }, + { + "epoch": 0.6334285086755985, + "grad_norm": 2.138873815536499, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7168657779693604, + "num_tokens": 149243112.0, + "step": 5768 + }, + { + "epoch": 0.6335383263782122, + "grad_norm": 1.6745507717132568, + "learning_rate": 5e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6988143920898438, + "num_tokens": 149274915.0, + "step": 5769 + }, + { + "epoch": 0.6336481440808258, + "grad_norm": 1.711226224899292, + "learning_rate": 5e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7001881003379822, + "num_tokens": 149306130.0, + "step": 5770 + }, + { + "epoch": 0.6337579617834395, + "grad_norm": 1.752082109451294, + "learning_rate": 5e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7212703227996826, + "num_tokens": 149332362.0, + "step": 5771 + }, + { + "epoch": 0.6338677794860531, + "grad_norm": 1.751759648323059, + "learning_rate": 5e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7022058963775635, + "num_tokens": 149360506.0, + "step": 5772 + }, + { + "epoch": 0.6339775971886669, + "grad_norm": 2.1265721321105957, + "learning_rate": 5e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7134822607040405, + "num_tokens": 149382402.0, + "step": 5773 + }, + { + "epoch": 0.6340874148912805, + "grad_norm": 1.6883156299591064, + "learning_rate": 5e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6879916191101074, + "num_tokens": 149413230.0, + "step": 5774 + }, + { + "epoch": 0.6341972325938942, + "grad_norm": 1.832430124282837, + "learning_rate": 5e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6991488337516785, + "num_tokens": 149437770.0, + "step": 5775 + }, + { + "epoch": 0.6343070502965078, + "grad_norm": 1.8784964084625244, + "learning_rate": 5e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7027318477630615, + "num_tokens": 149462197.0, + "step": 5776 + }, + { + "epoch": 0.6344168679991214, + "grad_norm": 1.9257088899612427, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7101233005523682, + "num_tokens": 149485966.0, + "step": 5777 + }, + { + "epoch": 0.6345266857017351, + "grad_norm": 2.1546788215637207, + "learning_rate": 5e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7094225883483887, + "num_tokens": 149506801.0, + "step": 5778 + }, + { + "epoch": 0.6346365034043487, + "grad_norm": 1.7364404201507568, + "learning_rate": 5e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6889011263847351, + "num_tokens": 149535580.0, + "step": 5779 + }, + { + "epoch": 0.6347463211069625, + "grad_norm": 1.8668299913406372, + "learning_rate": 5e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7246363759040833, + "num_tokens": 149559488.0, + "step": 5780 + }, + { + "epoch": 0.6348561388095761, + "grad_norm": 1.9535961151123047, + "learning_rate": 5e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7118525505065918, + "num_tokens": 149583287.0, + "step": 5781 + }, + { + "epoch": 0.6349659565121898, + "grad_norm": 1.8085346221923828, + "learning_rate": 5e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7320008873939514, + "num_tokens": 149607167.0, + "step": 5782 + }, + { + "epoch": 0.6350757742148034, + "grad_norm": 1.8867803812026978, + "learning_rate": 5e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7144550681114197, + "num_tokens": 149629596.0, + "step": 5783 + }, + { + "epoch": 0.6351855919174171, + "grad_norm": 1.9209442138671875, + "learning_rate": 5e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.71650630235672, + "num_tokens": 149650541.0, + "step": 5784 + }, + { + "epoch": 0.6352954096200307, + "grad_norm": 1.9326285123825073, + "learning_rate": 5e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6864811778068542, + "num_tokens": 149677580.0, + "step": 5785 + }, + { + "epoch": 0.6354052273226444, + "grad_norm": 1.8444905281066895, + "learning_rate": 5e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6971849203109741, + "num_tokens": 149704663.0, + "step": 5786 + }, + { + "epoch": 0.635515045025258, + "grad_norm": 1.8156681060791016, + "learning_rate": 5e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6879240870475769, + "num_tokens": 149733249.0, + "step": 5787 + }, + { + "epoch": 0.6356248627278718, + "grad_norm": 1.8124279975891113, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7014416456222534, + "num_tokens": 149759716.0, + "step": 5788 + }, + { + "epoch": 0.6357346804304854, + "grad_norm": 1.7118663787841797, + "learning_rate": 5e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6886025667190552, + "num_tokens": 149788787.0, + "step": 5789 + }, + { + "epoch": 0.6358444981330991, + "grad_norm": 1.7618061304092407, + "learning_rate": 5e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7053720355033875, + "num_tokens": 149814422.0, + "step": 5790 + }, + { + "epoch": 0.6359543158357127, + "grad_norm": 1.8157011270523071, + "learning_rate": 5e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6964631080627441, + "num_tokens": 149842917.0, + "step": 5791 + }, + { + "epoch": 0.6360641335383264, + "grad_norm": 1.7480651140213013, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7045741081237793, + "num_tokens": 149874309.0, + "step": 5792 + }, + { + "epoch": 0.63617395124094, + "grad_norm": 1.9066822528839111, + "learning_rate": 5e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7164326906204224, + "num_tokens": 149897810.0, + "step": 5793 + }, + { + "epoch": 0.6362837689435537, + "grad_norm": 1.8203409910202026, + "learning_rate": 5e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6961861848831177, + "num_tokens": 149925378.0, + "step": 5794 + }, + { + "epoch": 0.6363935866461674, + "grad_norm": 1.7733460664749146, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7028157711029053, + "num_tokens": 149953456.0, + "step": 5795 + }, + { + "epoch": 0.636503404348781, + "grad_norm": 1.851967453956604, + "learning_rate": 5e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7055619955062866, + "num_tokens": 149979831.0, + "step": 5796 + }, + { + "epoch": 0.6366132220513947, + "grad_norm": 1.9138988256454468, + "learning_rate": 5e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7010864019393921, + "num_tokens": 150003737.0, + "step": 5797 + }, + { + "epoch": 0.6367230397540083, + "grad_norm": 1.9037927389144897, + "learning_rate": 5e-06, + "loss": 0.968, + "mean_token_accuracy": 0.713265061378479, + "num_tokens": 150029839.0, + "step": 5798 + }, + { + "epoch": 0.636832857456622, + "grad_norm": 1.7983646392822266, + "learning_rate": 5e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6799328327178955, + "num_tokens": 150059966.0, + "step": 5799 + }, + { + "epoch": 0.6369426751592356, + "grad_norm": 1.7996574640274048, + "learning_rate": 5e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.712238609790802, + "num_tokens": 150087612.0, + "step": 5800 + }, + { + "epoch": 0.6370524928618493, + "grad_norm": 2.012742757797241, + "learning_rate": 5e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7114670276641846, + "num_tokens": 150108556.0, + "step": 5801 + }, + { + "epoch": 0.637162310564463, + "grad_norm": 1.8421107530593872, + "learning_rate": 5e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7062997817993164, + "num_tokens": 150133605.0, + "step": 5802 + }, + { + "epoch": 0.6372721282670767, + "grad_norm": 2.0847718715667725, + "learning_rate": 5e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7224704027175903, + "num_tokens": 150153698.0, + "step": 5803 + }, + { + "epoch": 0.6373819459696903, + "grad_norm": 1.900380253791809, + "learning_rate": 5e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7177796363830566, + "num_tokens": 150179807.0, + "step": 5804 + }, + { + "epoch": 0.637491763672304, + "grad_norm": 1.841120719909668, + "learning_rate": 5e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.718207836151123, + "num_tokens": 150204299.0, + "step": 5805 + }, + { + "epoch": 0.6376015813749176, + "grad_norm": 2.011093854904175, + "learning_rate": 5e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7198692560195923, + "num_tokens": 150227691.0, + "step": 5806 + }, + { + "epoch": 0.6377113990775313, + "grad_norm": 1.7022663354873657, + "learning_rate": 5e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7030233144760132, + "num_tokens": 150255777.0, + "step": 5807 + }, + { + "epoch": 0.6378212167801449, + "grad_norm": 2.0159695148468018, + "learning_rate": 5e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.689923882484436, + "num_tokens": 150278354.0, + "step": 5808 + }, + { + "epoch": 0.6379310344827587, + "grad_norm": 1.9243566989898682, + "learning_rate": 5e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.709404706954956, + "num_tokens": 150302877.0, + "step": 5809 + }, + { + "epoch": 0.6380408521853723, + "grad_norm": 1.7544053792953491, + "learning_rate": 5e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.6969032287597656, + "num_tokens": 150331284.0, + "step": 5810 + }, + { + "epoch": 0.638150669887986, + "grad_norm": 1.7733604907989502, + "learning_rate": 5e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7001174688339233, + "num_tokens": 150362622.0, + "step": 5811 + }, + { + "epoch": 0.6382604875905996, + "grad_norm": 1.8618192672729492, + "learning_rate": 5e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7033064365386963, + "num_tokens": 150387750.0, + "step": 5812 + }, + { + "epoch": 0.6383703052932133, + "grad_norm": 1.7672144174575806, + "learning_rate": 5e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6914325952529907, + "num_tokens": 150417717.0, + "step": 5813 + }, + { + "epoch": 0.6384801229958269, + "grad_norm": 1.866218090057373, + "learning_rate": 5e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7093938589096069, + "num_tokens": 150442496.0, + "step": 5814 + }, + { + "epoch": 0.6385899406984406, + "grad_norm": 2.1716909408569336, + "learning_rate": 5e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7089860439300537, + "num_tokens": 150464923.0, + "step": 5815 + }, + { + "epoch": 0.6386997584010542, + "grad_norm": 2.014299154281616, + "learning_rate": 5e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7142254710197449, + "num_tokens": 150487508.0, + "step": 5816 + }, + { + "epoch": 0.638809576103668, + "grad_norm": 1.7275726795196533, + "learning_rate": 5e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.678452730178833, + "num_tokens": 150517294.0, + "step": 5817 + }, + { + "epoch": 0.6389193938062816, + "grad_norm": 1.7958084344863892, + "learning_rate": 5e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7054167985916138, + "num_tokens": 150544447.0, + "step": 5818 + }, + { + "epoch": 0.6390292115088952, + "grad_norm": 1.9780774116516113, + "learning_rate": 5e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7047531008720398, + "num_tokens": 150566612.0, + "step": 5819 + }, + { + "epoch": 0.6391390292115089, + "grad_norm": 1.8410755395889282, + "learning_rate": 5e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7143793106079102, + "num_tokens": 150593458.0, + "step": 5820 + }, + { + "epoch": 0.6392488469141225, + "grad_norm": 1.6384083032608032, + "learning_rate": 5e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7176303863525391, + "num_tokens": 150621918.0, + "step": 5821 + }, + { + "epoch": 0.6393586646167362, + "grad_norm": 1.6617159843444824, + "learning_rate": 5e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6816794276237488, + "num_tokens": 150654017.0, + "step": 5822 + }, + { + "epoch": 0.6394684823193498, + "grad_norm": 2.069028854370117, + "learning_rate": 5e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7099368572235107, + "num_tokens": 150673530.0, + "step": 5823 + }, + { + "epoch": 0.6395783000219636, + "grad_norm": 1.879281997680664, + "learning_rate": 5e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.705606997013092, + "num_tokens": 150696393.0, + "step": 5824 + }, + { + "epoch": 0.6396881177245772, + "grad_norm": 1.8442317247390747, + "learning_rate": 5e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7005640864372253, + "num_tokens": 150722203.0, + "step": 5825 + }, + { + "epoch": 0.6397979354271909, + "grad_norm": 2.129460573196411, + "learning_rate": 5e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7118827104568481, + "num_tokens": 150741760.0, + "step": 5826 + }, + { + "epoch": 0.6399077531298045, + "grad_norm": 1.6676576137542725, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7268283367156982, + "num_tokens": 150771183.0, + "step": 5827 + }, + { + "epoch": 0.6400175708324182, + "grad_norm": 1.9613091945648193, + "learning_rate": 5e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6888135671615601, + "num_tokens": 150797179.0, + "step": 5828 + }, + { + "epoch": 0.6401273885350318, + "grad_norm": 1.9866883754730225, + "learning_rate": 5e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.6950474977493286, + "num_tokens": 150820582.0, + "step": 5829 + }, + { + "epoch": 0.6402372062376455, + "grad_norm": 1.924453616142273, + "learning_rate": 5e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7065685391426086, + "num_tokens": 150845606.0, + "step": 5830 + }, + { + "epoch": 0.6403470239402592, + "grad_norm": 1.8485532999038696, + "learning_rate": 5e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7133677005767822, + "num_tokens": 150871295.0, + "step": 5831 + }, + { + "epoch": 0.6404568416428729, + "grad_norm": 2.0439679622650146, + "learning_rate": 5e-06, + "loss": 1.0604, + "mean_token_accuracy": 0.6839247941970825, + "num_tokens": 150895609.0, + "step": 5832 + }, + { + "epoch": 0.6405666593454865, + "grad_norm": 1.6594245433807373, + "learning_rate": 5e-06, + "loss": 0.975, + "mean_token_accuracy": 0.6977300643920898, + "num_tokens": 150924794.0, + "step": 5833 + }, + { + "epoch": 0.6406764770481002, + "grad_norm": 2.019608497619629, + "learning_rate": 5e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7061268091201782, + "num_tokens": 150946286.0, + "step": 5834 + }, + { + "epoch": 0.6407862947507138, + "grad_norm": 1.9370061159133911, + "learning_rate": 5e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.696790337562561, + "num_tokens": 150970784.0, + "step": 5835 + }, + { + "epoch": 0.6408961124533274, + "grad_norm": 1.89640474319458, + "learning_rate": 5e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.704268217086792, + "num_tokens": 150995339.0, + "step": 5836 + }, + { + "epoch": 0.6410059301559411, + "grad_norm": 1.7061536312103271, + "learning_rate": 5e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.6975394487380981, + "num_tokens": 151025796.0, + "step": 5837 + }, + { + "epoch": 0.6411157478585549, + "grad_norm": 1.936804175376892, + "learning_rate": 5e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7138787508010864, + "num_tokens": 151047469.0, + "step": 5838 + }, + { + "epoch": 0.6412255655611685, + "grad_norm": 1.7671817541122437, + "learning_rate": 5e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7175049781799316, + "num_tokens": 151072708.0, + "step": 5839 + }, + { + "epoch": 0.6413353832637821, + "grad_norm": 1.9655228853225708, + "learning_rate": 5e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6927913427352905, + "num_tokens": 151097966.0, + "step": 5840 + }, + { + "epoch": 0.6414452009663958, + "grad_norm": 1.7350929975509644, + "learning_rate": 5e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7061368227005005, + "num_tokens": 151128575.0, + "step": 5841 + }, + { + "epoch": 0.6415550186690094, + "grad_norm": 2.122136354446411, + "learning_rate": 5e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7053545713424683, + "num_tokens": 151148326.0, + "step": 5842 + }, + { + "epoch": 0.6416648363716231, + "grad_norm": 1.990930438041687, + "learning_rate": 5e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6944553256034851, + "num_tokens": 151172267.0, + "step": 5843 + }, + { + "epoch": 0.6417746540742367, + "grad_norm": 1.8089032173156738, + "learning_rate": 5e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7115389108657837, + "num_tokens": 151198309.0, + "step": 5844 + }, + { + "epoch": 0.6418844717768504, + "grad_norm": 1.8095970153808594, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7225245237350464, + "num_tokens": 151223407.0, + "step": 5845 + }, + { + "epoch": 0.6419942894794641, + "grad_norm": 1.6886656284332275, + "learning_rate": 5e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6935927867889404, + "num_tokens": 151256140.0, + "step": 5846 + }, + { + "epoch": 0.6421041071820778, + "grad_norm": 1.933113932609558, + "learning_rate": 5e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6984059810638428, + "num_tokens": 151279625.0, + "step": 5847 + }, + { + "epoch": 0.6422139248846914, + "grad_norm": 2.0988409519195557, + "learning_rate": 5e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7593647241592407, + "num_tokens": 151298339.0, + "step": 5848 + }, + { + "epoch": 0.6423237425873051, + "grad_norm": 1.9927685260772705, + "learning_rate": 5e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7158052921295166, + "num_tokens": 151320963.0, + "step": 5849 + }, + { + "epoch": 0.6424335602899187, + "grad_norm": 1.747192621231079, + "learning_rate": 5e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7025550603866577, + "num_tokens": 151351799.0, + "step": 5850 + }, + { + "epoch": 0.6425433779925324, + "grad_norm": 1.8299773931503296, + "learning_rate": 5e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7172590494155884, + "num_tokens": 151374653.0, + "step": 5851 + }, + { + "epoch": 0.642653195695146, + "grad_norm": 1.8838005065917969, + "learning_rate": 5e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6930158734321594, + "num_tokens": 151400041.0, + "step": 5852 + }, + { + "epoch": 0.6427630133977598, + "grad_norm": 2.212911367416382, + "learning_rate": 5e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7133158445358276, + "num_tokens": 151419428.0, + "step": 5853 + }, + { + "epoch": 0.6428728311003734, + "grad_norm": 2.0048041343688965, + "learning_rate": 5e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7024862170219421, + "num_tokens": 151443080.0, + "step": 5854 + }, + { + "epoch": 0.6429826488029871, + "grad_norm": 2.020784616470337, + "learning_rate": 5e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7215120196342468, + "num_tokens": 151463711.0, + "step": 5855 + }, + { + "epoch": 0.6430924665056007, + "grad_norm": 1.9008499383926392, + "learning_rate": 5e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7204892635345459, + "num_tokens": 151487261.0, + "step": 5856 + }, + { + "epoch": 0.6432022842082143, + "grad_norm": 2.016601800918579, + "learning_rate": 5e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6986370086669922, + "num_tokens": 151509944.0, + "step": 5857 + }, + { + "epoch": 0.643312101910828, + "grad_norm": 1.870415449142456, + "learning_rate": 5e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.697365939617157, + "num_tokens": 151536345.0, + "step": 5858 + }, + { + "epoch": 0.6434219196134416, + "grad_norm": 1.9140303134918213, + "learning_rate": 5e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7020921111106873, + "num_tokens": 151561052.0, + "step": 5859 + }, + { + "epoch": 0.6435317373160554, + "grad_norm": 1.8330134153366089, + "learning_rate": 5e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7088630199432373, + "num_tokens": 151591386.0, + "step": 5860 + }, + { + "epoch": 0.643641555018669, + "grad_norm": 1.7196910381317139, + "learning_rate": 5e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6912266612052917, + "num_tokens": 151621106.0, + "step": 5861 + }, + { + "epoch": 0.6437513727212827, + "grad_norm": 1.8213651180267334, + "learning_rate": 5e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.708757221698761, + "num_tokens": 151650461.0, + "step": 5862 + }, + { + "epoch": 0.6438611904238963, + "grad_norm": 2.274709463119507, + "learning_rate": 5e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7124536633491516, + "num_tokens": 151669568.0, + "step": 5863 + }, + { + "epoch": 0.64397100812651, + "grad_norm": 2.1495096683502197, + "learning_rate": 5e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6954283118247986, + "num_tokens": 151690436.0, + "step": 5864 + }, + { + "epoch": 0.6440808258291236, + "grad_norm": 1.852781891822815, + "learning_rate": 5e-06, + "loss": 1.007, + "mean_token_accuracy": 0.697449803352356, + "num_tokens": 151716938.0, + "step": 5865 + }, + { + "epoch": 0.6441906435317373, + "grad_norm": 1.658186912536621, + "learning_rate": 5e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6790765523910522, + "num_tokens": 151748662.0, + "step": 5866 + }, + { + "epoch": 0.644300461234351, + "grad_norm": 1.6677522659301758, + "learning_rate": 5e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.691415548324585, + "num_tokens": 151779007.0, + "step": 5867 + }, + { + "epoch": 0.6444102789369647, + "grad_norm": 1.7983734607696533, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7349960803985596, + "num_tokens": 151802770.0, + "step": 5868 + }, + { + "epoch": 0.6445200966395783, + "grad_norm": 1.9552757740020752, + "learning_rate": 5e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6873195767402649, + "num_tokens": 151829726.0, + "step": 5869 + }, + { + "epoch": 0.644629914342192, + "grad_norm": 1.8962863683700562, + "learning_rate": 5e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6995418071746826, + "num_tokens": 151854256.0, + "step": 5870 + }, + { + "epoch": 0.6447397320448056, + "grad_norm": 1.5536794662475586, + "learning_rate": 5e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7071479558944702, + "num_tokens": 151889526.0, + "step": 5871 + }, + { + "epoch": 0.6448495497474193, + "grad_norm": 1.7671148777008057, + "learning_rate": 5e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6945508122444153, + "num_tokens": 151915716.0, + "step": 5872 + }, + { + "epoch": 0.6449593674500329, + "grad_norm": 1.8659460544586182, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7342345714569092, + "num_tokens": 151938644.0, + "step": 5873 + }, + { + "epoch": 0.6450691851526466, + "grad_norm": 2.0120937824249268, + "learning_rate": 5e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6850574016571045, + "num_tokens": 151967545.0, + "step": 5874 + }, + { + "epoch": 0.6451790028552603, + "grad_norm": 1.8549805879592896, + "learning_rate": 5e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7435915470123291, + "num_tokens": 151991651.0, + "step": 5875 + }, + { + "epoch": 0.645288820557874, + "grad_norm": 1.6348315477371216, + "learning_rate": 5e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7101104259490967, + "num_tokens": 152024497.0, + "step": 5876 + }, + { + "epoch": 0.6453986382604876, + "grad_norm": 2.004823684692383, + "learning_rate": 5e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7041606903076172, + "num_tokens": 152049197.0, + "step": 5877 + }, + { + "epoch": 0.6455084559631012, + "grad_norm": 1.938036322593689, + "learning_rate": 5e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7163383960723877, + "num_tokens": 152073198.0, + "step": 5878 + }, + { + "epoch": 0.6456182736657149, + "grad_norm": 1.9821693897247314, + "learning_rate": 5e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7063255906105042, + "num_tokens": 152097893.0, + "step": 5879 + }, + { + "epoch": 0.6457280913683285, + "grad_norm": 1.8937933444976807, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.713053822517395, + "num_tokens": 152124583.0, + "step": 5880 + }, + { + "epoch": 0.6458379090709422, + "grad_norm": 1.7070655822753906, + "learning_rate": 5e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6997352838516235, + "num_tokens": 152157095.0, + "step": 5881 + }, + { + "epoch": 0.6459477267735559, + "grad_norm": 1.887352466583252, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7299765348434448, + "num_tokens": 152179141.0, + "step": 5882 + }, + { + "epoch": 0.6460575444761696, + "grad_norm": 2.1132726669311523, + "learning_rate": 5e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7063107490539551, + "num_tokens": 152200778.0, + "step": 5883 + }, + { + "epoch": 0.6461673621787832, + "grad_norm": 1.953361988067627, + "learning_rate": 5e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6879228949546814, + "num_tokens": 152229301.0, + "step": 5884 + }, + { + "epoch": 0.6462771798813969, + "grad_norm": 1.965131402015686, + "learning_rate": 5e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6923824548721313, + "num_tokens": 152256543.0, + "step": 5885 + }, + { + "epoch": 0.6463869975840105, + "grad_norm": 1.9128795862197876, + "learning_rate": 5e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7125657796859741, + "num_tokens": 152279594.0, + "step": 5886 + }, + { + "epoch": 0.6464968152866242, + "grad_norm": 1.8361870050430298, + "learning_rate": 5e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7107183337211609, + "num_tokens": 152306848.0, + "step": 5887 + }, + { + "epoch": 0.6466066329892378, + "grad_norm": 1.7577632665634155, + "learning_rate": 5e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.702185869216919, + "num_tokens": 152333716.0, + "step": 5888 + }, + { + "epoch": 0.6467164506918516, + "grad_norm": 1.9967328310012817, + "learning_rate": 5e-06, + "loss": 1.0806, + "mean_token_accuracy": 0.6742100715637207, + "num_tokens": 152358255.0, + "step": 5889 + }, + { + "epoch": 0.6468262683944652, + "grad_norm": 1.8752939701080322, + "learning_rate": 5e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7073410153388977, + "num_tokens": 152382338.0, + "step": 5890 + }, + { + "epoch": 0.6469360860970789, + "grad_norm": 1.8415169715881348, + "learning_rate": 5e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6825844049453735, + "num_tokens": 152412364.0, + "step": 5891 + }, + { + "epoch": 0.6470459037996925, + "grad_norm": 2.147306442260742, + "learning_rate": 5e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7157754898071289, + "num_tokens": 152430925.0, + "step": 5892 + }, + { + "epoch": 0.6471557215023062, + "grad_norm": 1.683717131614685, + "learning_rate": 5e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6991305351257324, + "num_tokens": 152463476.0, + "step": 5893 + }, + { + "epoch": 0.6472655392049198, + "grad_norm": 1.6830486059188843, + "learning_rate": 5e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6909199953079224, + "num_tokens": 152493924.0, + "step": 5894 + }, + { + "epoch": 0.6473753569075335, + "grad_norm": 1.9843759536743164, + "learning_rate": 5e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7079561948776245, + "num_tokens": 152517340.0, + "step": 5895 + }, + { + "epoch": 0.6474851746101472, + "grad_norm": 1.5769122838974, + "learning_rate": 5e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7145716547966003, + "num_tokens": 152551321.0, + "step": 5896 + }, + { + "epoch": 0.6475949923127609, + "grad_norm": 2.006317377090454, + "learning_rate": 5e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.692764937877655, + "num_tokens": 152577328.0, + "step": 5897 + }, + { + "epoch": 0.6477048100153745, + "grad_norm": 1.9365317821502686, + "learning_rate": 5e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6894254088401794, + "num_tokens": 152605334.0, + "step": 5898 + }, + { + "epoch": 0.6478146277179881, + "grad_norm": 1.9081461429595947, + "learning_rate": 5e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6950145363807678, + "num_tokens": 152629857.0, + "step": 5899 + }, + { + "epoch": 0.6479244454206018, + "grad_norm": 1.6133356094360352, + "learning_rate": 5e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6924402117729187, + "num_tokens": 152660947.0, + "step": 5900 + }, + { + "epoch": 0.6480342631232154, + "grad_norm": 2.818366050720215, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7361345291137695, + "num_tokens": 152682403.0, + "step": 5901 + }, + { + "epoch": 0.6481440808258291, + "grad_norm": 1.9726759195327759, + "learning_rate": 5e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7046555280685425, + "num_tokens": 152708217.0, + "step": 5902 + }, + { + "epoch": 0.6482538985284428, + "grad_norm": 1.7455849647521973, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7311828136444092, + "num_tokens": 152735345.0, + "step": 5903 + }, + { + "epoch": 0.6483637162310565, + "grad_norm": 1.9010875225067139, + "learning_rate": 5e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.719009280204773, + "num_tokens": 152758004.0, + "step": 5904 + }, + { + "epoch": 0.6484735339336701, + "grad_norm": 1.9076579809188843, + "learning_rate": 5e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6991675496101379, + "num_tokens": 152781635.0, + "step": 5905 + }, + { + "epoch": 0.6485833516362838, + "grad_norm": 1.791040301322937, + "learning_rate": 5e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.704801082611084, + "num_tokens": 152807852.0, + "step": 5906 + }, + { + "epoch": 0.6486931693388974, + "grad_norm": 1.7400565147399902, + "learning_rate": 5e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6928550004959106, + "num_tokens": 152839037.0, + "step": 5907 + }, + { + "epoch": 0.6488029870415111, + "grad_norm": 1.94003427028656, + "learning_rate": 5e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7047839760780334, + "num_tokens": 152861205.0, + "step": 5908 + }, + { + "epoch": 0.6489128047441247, + "grad_norm": 1.7560808658599854, + "learning_rate": 5e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7206020951271057, + "num_tokens": 152891205.0, + "step": 5909 + }, + { + "epoch": 0.6490226224467384, + "grad_norm": 1.8821752071380615, + "learning_rate": 5e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6910477876663208, + "num_tokens": 152918738.0, + "step": 5910 + }, + { + "epoch": 0.6491324401493521, + "grad_norm": 2.0190510749816895, + "learning_rate": 5e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7135066986083984, + "num_tokens": 152941231.0, + "step": 5911 + }, + { + "epoch": 0.6492422578519658, + "grad_norm": 2.0357143878936768, + "learning_rate": 5e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6951650977134705, + "num_tokens": 152964887.0, + "step": 5912 + }, + { + "epoch": 0.6493520755545794, + "grad_norm": 1.807913064956665, + "learning_rate": 5e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6826627254486084, + "num_tokens": 152990925.0, + "step": 5913 + }, + { + "epoch": 0.6494618932571931, + "grad_norm": 1.7314355373382568, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7203978300094604, + "num_tokens": 153018474.0, + "step": 5914 + }, + { + "epoch": 0.6495717109598067, + "grad_norm": 1.7568295001983643, + "learning_rate": 5e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7078732252120972, + "num_tokens": 153047548.0, + "step": 5915 + }, + { + "epoch": 0.6496815286624203, + "grad_norm": 2.000883102416992, + "learning_rate": 5e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7138099670410156, + "num_tokens": 153069439.0, + "step": 5916 + }, + { + "epoch": 0.649791346365034, + "grad_norm": 1.734175443649292, + "learning_rate": 5e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7107346057891846, + "num_tokens": 153099700.0, + "step": 5917 + }, + { + "epoch": 0.6499011640676478, + "grad_norm": 2.0416510105133057, + "learning_rate": 5e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7198246717453003, + "num_tokens": 153119243.0, + "step": 5918 + }, + { + "epoch": 0.6500109817702614, + "grad_norm": 1.8108596801757812, + "learning_rate": 5e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.6997572183609009, + "num_tokens": 153146494.0, + "step": 5919 + }, + { + "epoch": 0.650120799472875, + "grad_norm": 1.8674650192260742, + "learning_rate": 5e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.6814396977424622, + "num_tokens": 153175726.0, + "step": 5920 + }, + { + "epoch": 0.6502306171754887, + "grad_norm": 1.8672078847885132, + "learning_rate": 5e-06, + "loss": 0.911, + "mean_token_accuracy": 0.729637086391449, + "num_tokens": 153200732.0, + "step": 5921 + }, + { + "epoch": 0.6503404348781023, + "grad_norm": 2.0106160640716553, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7384954690933228, + "num_tokens": 153221701.0, + "step": 5922 + }, + { + "epoch": 0.650450252580716, + "grad_norm": 1.9748399257659912, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7124653458595276, + "num_tokens": 153243836.0, + "step": 5923 + }, + { + "epoch": 0.6505600702833296, + "grad_norm": 2.0751397609710693, + "learning_rate": 5e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7226933240890503, + "num_tokens": 153265322.0, + "step": 5924 + }, + { + "epoch": 0.6506698879859434, + "grad_norm": 1.893264651298523, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7241278290748596, + "num_tokens": 153288859.0, + "step": 5925 + }, + { + "epoch": 0.650779705688557, + "grad_norm": 1.7872071266174316, + "learning_rate": 5e-06, + "loss": 0.899, + "mean_token_accuracy": 0.718846321105957, + "num_tokens": 153315965.0, + "step": 5926 + }, + { + "epoch": 0.6508895233911707, + "grad_norm": 1.8275072574615479, + "learning_rate": 5e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7092719078063965, + "num_tokens": 153341478.0, + "step": 5927 + }, + { + "epoch": 0.6509993410937843, + "grad_norm": 1.7499957084655762, + "learning_rate": 5e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7225522994995117, + "num_tokens": 153369105.0, + "step": 5928 + }, + { + "epoch": 0.651109158796398, + "grad_norm": 1.7087143659591675, + "learning_rate": 5e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7007675170898438, + "num_tokens": 153398836.0, + "step": 5929 + }, + { + "epoch": 0.6512189764990116, + "grad_norm": 2.0276284217834473, + "learning_rate": 5e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7042768597602844, + "num_tokens": 153421418.0, + "step": 5930 + }, + { + "epoch": 0.6513287942016253, + "grad_norm": 1.943215250968933, + "learning_rate": 5e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.68630051612854, + "num_tokens": 153445912.0, + "step": 5931 + }, + { + "epoch": 0.651438611904239, + "grad_norm": 1.716185212135315, + "learning_rate": 5e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6912967562675476, + "num_tokens": 153477003.0, + "step": 5932 + }, + { + "epoch": 0.6515484296068527, + "grad_norm": 1.9728448390960693, + "learning_rate": 5e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7038092613220215, + "num_tokens": 153500461.0, + "step": 5933 + }, + { + "epoch": 0.6516582473094663, + "grad_norm": 2.071671724319458, + "learning_rate": 5e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7222015261650085, + "num_tokens": 153521304.0, + "step": 5934 + }, + { + "epoch": 0.65176806501208, + "grad_norm": 1.6813195943832397, + "learning_rate": 5e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7023648619651794, + "num_tokens": 153553994.0, + "step": 5935 + }, + { + "epoch": 0.6518778827146936, + "grad_norm": 2.106175661087036, + "learning_rate": 5e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7055230140686035, + "num_tokens": 153573526.0, + "step": 5936 + }, + { + "epoch": 0.6519877004173072, + "grad_norm": 1.9236493110656738, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7188507914543152, + "num_tokens": 153597587.0, + "step": 5937 + }, + { + "epoch": 0.6520975181199209, + "grad_norm": 1.87161123752594, + "learning_rate": 5e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7071304321289062, + "num_tokens": 153623125.0, + "step": 5938 + }, + { + "epoch": 0.6522073358225345, + "grad_norm": 1.633131980895996, + "learning_rate": 5e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6861379742622375, + "num_tokens": 153655807.0, + "step": 5939 + }, + { + "epoch": 0.6523171535251483, + "grad_norm": 1.707646131515503, + "learning_rate": 5e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6749134063720703, + "num_tokens": 153689274.0, + "step": 5940 + }, + { + "epoch": 0.6524269712277619, + "grad_norm": 1.81630539894104, + "learning_rate": 5e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.693947970867157, + "num_tokens": 153716041.0, + "step": 5941 + }, + { + "epoch": 0.6525367889303756, + "grad_norm": 1.7944222688674927, + "learning_rate": 5e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7391195297241211, + "num_tokens": 153740103.0, + "step": 5942 + }, + { + "epoch": 0.6526466066329892, + "grad_norm": 1.8886582851409912, + "learning_rate": 5e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7076311111450195, + "num_tokens": 153766568.0, + "step": 5943 + }, + { + "epoch": 0.6527564243356029, + "grad_norm": 1.8858506679534912, + "learning_rate": 5e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7242676019668579, + "num_tokens": 153791013.0, + "step": 5944 + }, + { + "epoch": 0.6528662420382165, + "grad_norm": 1.8155758380889893, + "learning_rate": 5e-06, + "loss": 0.97, + "mean_token_accuracy": 0.706933856010437, + "num_tokens": 153818709.0, + "step": 5945 + }, + { + "epoch": 0.6529760597408302, + "grad_norm": 1.7840718030929565, + "learning_rate": 5e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6764984130859375, + "num_tokens": 153847912.0, + "step": 5946 + }, + { + "epoch": 0.6530858774434439, + "grad_norm": 1.7091997861862183, + "learning_rate": 5e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6816105246543884, + "num_tokens": 153878583.0, + "step": 5947 + }, + { + "epoch": 0.6531956951460576, + "grad_norm": 1.8329551219940186, + "learning_rate": 5e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7151843309402466, + "num_tokens": 153902902.0, + "step": 5948 + }, + { + "epoch": 0.6533055128486712, + "grad_norm": 1.8255393505096436, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.6975494027137756, + "num_tokens": 153928595.0, + "step": 5949 + }, + { + "epoch": 0.6534153305512849, + "grad_norm": 2.1123735904693604, + "learning_rate": 5e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.70600825548172, + "num_tokens": 153950723.0, + "step": 5950 + }, + { + "epoch": 0.6535251482538985, + "grad_norm": 1.7274590730667114, + "learning_rate": 5e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6941819190979004, + "num_tokens": 153980229.0, + "step": 5951 + }, + { + "epoch": 0.6536349659565122, + "grad_norm": 1.9952510595321655, + "learning_rate": 5e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7152449488639832, + "num_tokens": 154001303.0, + "step": 5952 + }, + { + "epoch": 0.6537447836591258, + "grad_norm": 1.7895045280456543, + "learning_rate": 5e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7125037908554077, + "num_tokens": 154027480.0, + "step": 5953 + }, + { + "epoch": 0.6538546013617396, + "grad_norm": 1.8222651481628418, + "learning_rate": 5e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6916487216949463, + "num_tokens": 154055431.0, + "step": 5954 + }, + { + "epoch": 0.6539644190643532, + "grad_norm": 1.6270684003829956, + "learning_rate": 5e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7008876800537109, + "num_tokens": 154088296.0, + "step": 5955 + }, + { + "epoch": 0.6540742367669669, + "grad_norm": 1.8705778121948242, + "learning_rate": 5e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7359154224395752, + "num_tokens": 154111416.0, + "step": 5956 + }, + { + "epoch": 0.6541840544695805, + "grad_norm": 1.7847172021865845, + "learning_rate": 5e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6889691948890686, + "num_tokens": 154139512.0, + "step": 5957 + }, + { + "epoch": 0.6542938721721941, + "grad_norm": 1.6946841478347778, + "learning_rate": 5e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7003101110458374, + "num_tokens": 154165583.0, + "step": 5958 + }, + { + "epoch": 0.6544036898748078, + "grad_norm": 1.7833551168441772, + "learning_rate": 5e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7130595445632935, + "num_tokens": 154191363.0, + "step": 5959 + }, + { + "epoch": 0.6545135075774214, + "grad_norm": 1.8141473531723022, + "learning_rate": 5e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.711296796798706, + "num_tokens": 154217237.0, + "step": 5960 + }, + { + "epoch": 0.6546233252800352, + "grad_norm": 2.101142406463623, + "learning_rate": 5e-06, + "loss": 0.883, + "mean_token_accuracy": 0.720045804977417, + "num_tokens": 154236266.0, + "step": 5961 + }, + { + "epoch": 0.6547331429826488, + "grad_norm": 1.718734860420227, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7075151205062866, + "num_tokens": 154264569.0, + "step": 5962 + }, + { + "epoch": 0.6548429606852625, + "grad_norm": 1.6625826358795166, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7255940437316895, + "num_tokens": 154292451.0, + "step": 5963 + }, + { + "epoch": 0.6549527783878761, + "grad_norm": 1.750718355178833, + "learning_rate": 5e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7025792598724365, + "num_tokens": 154318969.0, + "step": 5964 + }, + { + "epoch": 0.6550625960904898, + "grad_norm": 1.991086721420288, + "learning_rate": 5e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7166175842285156, + "num_tokens": 154342866.0, + "step": 5965 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 2.159454107284546, + "learning_rate": 5e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7034214735031128, + "num_tokens": 154365007.0, + "step": 5966 + }, + { + "epoch": 0.6552822314957171, + "grad_norm": 1.7694365978240967, + "learning_rate": 5e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7190796136856079, + "num_tokens": 154390392.0, + "step": 5967 + }, + { + "epoch": 0.6553920491983307, + "grad_norm": 2.037743330001831, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7168077230453491, + "num_tokens": 154409874.0, + "step": 5968 + }, + { + "epoch": 0.6555018669009445, + "grad_norm": 1.768949031829834, + "learning_rate": 5e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6907661557197571, + "num_tokens": 154437363.0, + "step": 5969 + }, + { + "epoch": 0.6556116846035581, + "grad_norm": 1.6522068977355957, + "learning_rate": 5e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7074075937271118, + "num_tokens": 154470339.0, + "step": 5970 + }, + { + "epoch": 0.6557215023061718, + "grad_norm": 1.7921327352523804, + "learning_rate": 5e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7191364169120789, + "num_tokens": 154496161.0, + "step": 5971 + }, + { + "epoch": 0.6558313200087854, + "grad_norm": 1.7564293146133423, + "learning_rate": 5e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7148539423942566, + "num_tokens": 154523183.0, + "step": 5972 + }, + { + "epoch": 0.6559411377113991, + "grad_norm": 1.8712716102600098, + "learning_rate": 5e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6971819996833801, + "num_tokens": 154548433.0, + "step": 5973 + }, + { + "epoch": 0.6560509554140127, + "grad_norm": 1.8223748207092285, + "learning_rate": 5e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7143757343292236, + "num_tokens": 154574282.0, + "step": 5974 + }, + { + "epoch": 0.6561607731166264, + "grad_norm": 2.054091215133667, + "learning_rate": 5e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6865540742874146, + "num_tokens": 154596240.0, + "step": 5975 + }, + { + "epoch": 0.6562705908192401, + "grad_norm": 2.006174325942993, + "learning_rate": 5e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7113820314407349, + "num_tokens": 154618406.0, + "step": 5976 + }, + { + "epoch": 0.6563804085218538, + "grad_norm": 1.8394172191619873, + "learning_rate": 5e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6969778537750244, + "num_tokens": 154644257.0, + "step": 5977 + }, + { + "epoch": 0.6564902262244674, + "grad_norm": 1.8969521522521973, + "learning_rate": 5e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6959322690963745, + "num_tokens": 154668316.0, + "step": 5978 + }, + { + "epoch": 0.656600043927081, + "grad_norm": 1.6448454856872559, + "learning_rate": 5e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6942247748374939, + "num_tokens": 154700980.0, + "step": 5979 + }, + { + "epoch": 0.6567098616296947, + "grad_norm": 1.7233455181121826, + "learning_rate": 5e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6995952129364014, + "num_tokens": 154730098.0, + "step": 5980 + }, + { + "epoch": 0.6568196793323083, + "grad_norm": 2.036867141723633, + "learning_rate": 5e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7336483597755432, + "num_tokens": 154749993.0, + "step": 5981 + }, + { + "epoch": 0.656929497034922, + "grad_norm": 1.971908450126648, + "learning_rate": 5e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7195598483085632, + "num_tokens": 154772334.0, + "step": 5982 + }, + { + "epoch": 0.6570393147375357, + "grad_norm": 1.8290220499038696, + "learning_rate": 5e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7114496231079102, + "num_tokens": 154796343.0, + "step": 5983 + }, + { + "epoch": 0.6571491324401494, + "grad_norm": 1.654837727546692, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6999287605285645, + "num_tokens": 154826554.0, + "step": 5984 + }, + { + "epoch": 0.657258950142763, + "grad_norm": 1.9686908721923828, + "learning_rate": 5e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.731708824634552, + "num_tokens": 154847277.0, + "step": 5985 + }, + { + "epoch": 0.6573687678453767, + "grad_norm": 1.894439935684204, + "learning_rate": 5e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7065443396568298, + "num_tokens": 154869671.0, + "step": 5986 + }, + { + "epoch": 0.6574785855479903, + "grad_norm": 1.7376128435134888, + "learning_rate": 5e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7032908797264099, + "num_tokens": 154897675.0, + "step": 5987 + }, + { + "epoch": 0.657588403250604, + "grad_norm": 1.7181297540664673, + "learning_rate": 5e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7096737623214722, + "num_tokens": 154926858.0, + "step": 5988 + }, + { + "epoch": 0.6576982209532176, + "grad_norm": 2.0310959815979004, + "learning_rate": 5e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7041966915130615, + "num_tokens": 154948179.0, + "step": 5989 + }, + { + "epoch": 0.6578080386558314, + "grad_norm": 1.751584529876709, + "learning_rate": 5e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6965335011482239, + "num_tokens": 154974516.0, + "step": 5990 + }, + { + "epoch": 0.657917856358445, + "grad_norm": 1.6840417385101318, + "learning_rate": 5e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7034475207328796, + "num_tokens": 155003443.0, + "step": 5991 + }, + { + "epoch": 0.6580276740610587, + "grad_norm": 1.7286888360977173, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7174646258354187, + "num_tokens": 155029186.0, + "step": 5992 + }, + { + "epoch": 0.6581374917636723, + "grad_norm": 1.789326786994934, + "learning_rate": 5e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6844520568847656, + "num_tokens": 155055861.0, + "step": 5993 + }, + { + "epoch": 0.658247309466286, + "grad_norm": 1.960154414176941, + "learning_rate": 5e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7079834938049316, + "num_tokens": 155078484.0, + "step": 5994 + }, + { + "epoch": 0.6583571271688996, + "grad_norm": 1.9552438259124756, + "learning_rate": 5e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6950615644454956, + "num_tokens": 155101540.0, + "step": 5995 + }, + { + "epoch": 0.6584669448715132, + "grad_norm": 1.7557933330535889, + "learning_rate": 5e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7087142467498779, + "num_tokens": 155127230.0, + "step": 5996 + }, + { + "epoch": 0.6585767625741269, + "grad_norm": 1.8640813827514648, + "learning_rate": 5e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.709352970123291, + "num_tokens": 155151093.0, + "step": 5997 + }, + { + "epoch": 0.6586865802767407, + "grad_norm": 1.5959808826446533, + "learning_rate": 5e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7110148668289185, + "num_tokens": 155180221.0, + "step": 5998 + }, + { + "epoch": 0.6587963979793543, + "grad_norm": 1.977187991142273, + "learning_rate": 5e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7086697816848755, + "num_tokens": 155200935.0, + "step": 5999 + }, + { + "epoch": 0.6589062156819679, + "grad_norm": 1.7327042818069458, + "learning_rate": 5e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6905207633972168, + "num_tokens": 155230585.0, + "step": 6000 + }, + { + "epoch": 0.6590160333845816, + "grad_norm": 1.8596174716949463, + "learning_rate": 5e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7223324179649353, + "num_tokens": 155254070.0, + "step": 6001 + }, + { + "epoch": 0.6591258510871952, + "grad_norm": 1.8249788284301758, + "learning_rate": 5e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7047092318534851, + "num_tokens": 155279598.0, + "step": 6002 + }, + { + "epoch": 0.6592356687898089, + "grad_norm": 1.9146829843521118, + "learning_rate": 5e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.6951251029968262, + "num_tokens": 155304650.0, + "step": 6003 + }, + { + "epoch": 0.6593454864924225, + "grad_norm": 1.9784650802612305, + "learning_rate": 5e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6869503259658813, + "num_tokens": 155328754.0, + "step": 6004 + }, + { + "epoch": 0.6594553041950363, + "grad_norm": 1.6139512062072754, + "learning_rate": 5e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7124471664428711, + "num_tokens": 155360629.0, + "step": 6005 + }, + { + "epoch": 0.6595651218976499, + "grad_norm": 1.6987868547439575, + "learning_rate": 5e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7121959924697876, + "num_tokens": 155393186.0, + "step": 6006 + }, + { + "epoch": 0.6596749396002636, + "grad_norm": 2.045257568359375, + "learning_rate": 5e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7203117609024048, + "num_tokens": 155415482.0, + "step": 6007 + }, + { + "epoch": 0.6597847573028772, + "grad_norm": 1.6617419719696045, + "learning_rate": 5e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7058454751968384, + "num_tokens": 155444292.0, + "step": 6008 + }, + { + "epoch": 0.6598945750054909, + "grad_norm": 2.038616180419922, + "learning_rate": 5e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7246947288513184, + "num_tokens": 155466785.0, + "step": 6009 + }, + { + "epoch": 0.6600043927081045, + "grad_norm": 1.8837275505065918, + "learning_rate": 5e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7065988779067993, + "num_tokens": 155489599.0, + "step": 6010 + }, + { + "epoch": 0.6601142104107182, + "grad_norm": 2.004033088684082, + "learning_rate": 5e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7047648429870605, + "num_tokens": 155511734.0, + "step": 6011 + }, + { + "epoch": 0.6602240281133319, + "grad_norm": 1.7931602001190186, + "learning_rate": 5e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.709465503692627, + "num_tokens": 155537145.0, + "step": 6012 + }, + { + "epoch": 0.6603338458159456, + "grad_norm": 2.0152297019958496, + "learning_rate": 5e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7122074365615845, + "num_tokens": 155559817.0, + "step": 6013 + }, + { + "epoch": 0.6604436635185592, + "grad_norm": 1.821691632270813, + "learning_rate": 5e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7024098634719849, + "num_tokens": 155589551.0, + "step": 6014 + }, + { + "epoch": 0.6605534812211729, + "grad_norm": 1.9438730478286743, + "learning_rate": 5e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7330828905105591, + "num_tokens": 155611960.0, + "step": 6015 + }, + { + "epoch": 0.6606632989237865, + "grad_norm": 1.8707810640335083, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7224735021591187, + "num_tokens": 155636974.0, + "step": 6016 + }, + { + "epoch": 0.6607731166264001, + "grad_norm": 1.8036317825317383, + "learning_rate": 5e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7171437740325928, + "num_tokens": 155664099.0, + "step": 6017 + }, + { + "epoch": 0.6608829343290138, + "grad_norm": 1.96672785282135, + "learning_rate": 5e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7150835394859314, + "num_tokens": 155684866.0, + "step": 6018 + }, + { + "epoch": 0.6609927520316276, + "grad_norm": 1.840073585510254, + "learning_rate": 5e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7401263117790222, + "num_tokens": 155710219.0, + "step": 6019 + }, + { + "epoch": 0.6611025697342412, + "grad_norm": 1.9459350109100342, + "learning_rate": 5e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7110840082168579, + "num_tokens": 155734284.0, + "step": 6020 + }, + { + "epoch": 0.6612123874368548, + "grad_norm": 1.782135009765625, + "learning_rate": 5e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7049717903137207, + "num_tokens": 155760560.0, + "step": 6021 + }, + { + "epoch": 0.6613222051394685, + "grad_norm": 1.7042347192764282, + "learning_rate": 5e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6963106393814087, + "num_tokens": 155790683.0, + "step": 6022 + }, + { + "epoch": 0.6614320228420821, + "grad_norm": 2.130476713180542, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7323821783065796, + "num_tokens": 155810029.0, + "step": 6023 + }, + { + "epoch": 0.6615418405446958, + "grad_norm": 1.768551230430603, + "learning_rate": 5e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6893270611763, + "num_tokens": 155840090.0, + "step": 6024 + }, + { + "epoch": 0.6616516582473094, + "grad_norm": 1.8288285732269287, + "learning_rate": 5e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7063165903091431, + "num_tokens": 155865723.0, + "step": 6025 + }, + { + "epoch": 0.6617614759499231, + "grad_norm": 1.8299379348754883, + "learning_rate": 5e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7042344808578491, + "num_tokens": 155894919.0, + "step": 6026 + }, + { + "epoch": 0.6618712936525368, + "grad_norm": 1.8740696907043457, + "learning_rate": 5e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7162512540817261, + "num_tokens": 155917052.0, + "step": 6027 + }, + { + "epoch": 0.6619811113551505, + "grad_norm": 1.5990837812423706, + "learning_rate": 5e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6852173209190369, + "num_tokens": 155949703.0, + "step": 6028 + }, + { + "epoch": 0.6620909290577641, + "grad_norm": 1.7309750318527222, + "learning_rate": 5e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7135072350502014, + "num_tokens": 155978064.0, + "step": 6029 + }, + { + "epoch": 0.6622007467603778, + "grad_norm": 1.7093886137008667, + "learning_rate": 5e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7020180225372314, + "num_tokens": 156008100.0, + "step": 6030 + }, + { + "epoch": 0.6623105644629914, + "grad_norm": 1.9119211435317993, + "learning_rate": 5e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6921659708023071, + "num_tokens": 156034579.0, + "step": 6031 + }, + { + "epoch": 0.6624203821656051, + "grad_norm": 1.7952289581298828, + "learning_rate": 5e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7035621404647827, + "num_tokens": 156063068.0, + "step": 6032 + }, + { + "epoch": 0.6625301998682187, + "grad_norm": 1.6686127185821533, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7235981822013855, + "num_tokens": 156092676.0, + "step": 6033 + }, + { + "epoch": 0.6626400175708325, + "grad_norm": 1.6003787517547607, + "learning_rate": 5e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6879838109016418, + "num_tokens": 156127987.0, + "step": 6034 + }, + { + "epoch": 0.6627498352734461, + "grad_norm": 1.7945853471755981, + "learning_rate": 5e-06, + "loss": 1.1033, + "mean_token_accuracy": 0.6665061712265015, + "num_tokens": 156157500.0, + "step": 6035 + }, + { + "epoch": 0.6628596529760598, + "grad_norm": 1.6750819683074951, + "learning_rate": 5e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.696948230266571, + "num_tokens": 156190549.0, + "step": 6036 + }, + { + "epoch": 0.6629694706786734, + "grad_norm": 1.9613516330718994, + "learning_rate": 5e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7110641002655029, + "num_tokens": 156216179.0, + "step": 6037 + }, + { + "epoch": 0.663079288381287, + "grad_norm": 1.826196551322937, + "learning_rate": 5e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6840991973876953, + "num_tokens": 156241510.0, + "step": 6038 + }, + { + "epoch": 0.6631891060839007, + "grad_norm": 1.828982949256897, + "learning_rate": 5e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7246284484863281, + "num_tokens": 156266684.0, + "step": 6039 + }, + { + "epoch": 0.6632989237865143, + "grad_norm": 2.093505382537842, + "learning_rate": 5e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.685867190361023, + "num_tokens": 156288762.0, + "step": 6040 + }, + { + "epoch": 0.6634087414891281, + "grad_norm": 1.7538338899612427, + "learning_rate": 5e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.690095067024231, + "num_tokens": 156318784.0, + "step": 6041 + }, + { + "epoch": 0.6635185591917417, + "grad_norm": 1.8444911241531372, + "learning_rate": 5e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6906753778457642, + "num_tokens": 156345930.0, + "step": 6042 + }, + { + "epoch": 0.6636283768943554, + "grad_norm": 1.7218295335769653, + "learning_rate": 5e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7028685212135315, + "num_tokens": 156373889.0, + "step": 6043 + }, + { + "epoch": 0.663738194596969, + "grad_norm": 1.6633327007293701, + "learning_rate": 5e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7166508436203003, + "num_tokens": 156402543.0, + "step": 6044 + }, + { + "epoch": 0.6638480122995827, + "grad_norm": 1.9562673568725586, + "learning_rate": 5e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6968010663986206, + "num_tokens": 156426457.0, + "step": 6045 + }, + { + "epoch": 0.6639578300021963, + "grad_norm": 1.8967386484146118, + "learning_rate": 5e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7265218496322632, + "num_tokens": 156450051.0, + "step": 6046 + }, + { + "epoch": 0.66406764770481, + "grad_norm": 1.7395567893981934, + "learning_rate": 5e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7132867574691772, + "num_tokens": 156478100.0, + "step": 6047 + }, + { + "epoch": 0.6641774654074237, + "grad_norm": 1.6100703477859497, + "learning_rate": 5e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.690101146697998, + "num_tokens": 156510643.0, + "step": 6048 + }, + { + "epoch": 0.6642872831100374, + "grad_norm": 1.8019160032272339, + "learning_rate": 5e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7230923771858215, + "num_tokens": 156534666.0, + "step": 6049 + }, + { + "epoch": 0.664397100812651, + "grad_norm": 1.718141794204712, + "learning_rate": 5e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6815089583396912, + "num_tokens": 156563956.0, + "step": 6050 + }, + { + "epoch": 0.6645069185152647, + "grad_norm": 1.721585750579834, + "learning_rate": 5e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6943431496620178, + "num_tokens": 156591116.0, + "step": 6051 + }, + { + "epoch": 0.6646167362178783, + "grad_norm": 1.8928042650222778, + "learning_rate": 5e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7401973009109497, + "num_tokens": 156614368.0, + "step": 6052 + }, + { + "epoch": 0.664726553920492, + "grad_norm": 1.7229737043380737, + "learning_rate": 5e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6851734519004822, + "num_tokens": 156643964.0, + "step": 6053 + }, + { + "epoch": 0.6648363716231056, + "grad_norm": 1.7416481971740723, + "learning_rate": 5e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7028060555458069, + "num_tokens": 156674001.0, + "step": 6054 + }, + { + "epoch": 0.6649461893257194, + "grad_norm": 1.8136149644851685, + "learning_rate": 5e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6846363544464111, + "num_tokens": 156702189.0, + "step": 6055 + }, + { + "epoch": 0.665056007028333, + "grad_norm": 1.6401325464248657, + "learning_rate": 5e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7074527740478516, + "num_tokens": 156730809.0, + "step": 6056 + }, + { + "epoch": 0.6651658247309467, + "grad_norm": 1.9605005979537964, + "learning_rate": 5e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7330335974693298, + "num_tokens": 156754506.0, + "step": 6057 + }, + { + "epoch": 0.6652756424335603, + "grad_norm": 2.0733699798583984, + "learning_rate": 5e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7000722289085388, + "num_tokens": 156778870.0, + "step": 6058 + }, + { + "epoch": 0.665385460136174, + "grad_norm": 1.659436821937561, + "learning_rate": 5e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.7029822468757629, + "num_tokens": 156809226.0, + "step": 6059 + }, + { + "epoch": 0.6654952778387876, + "grad_norm": 2.039189577102661, + "learning_rate": 5e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.709030270576477, + "num_tokens": 156829779.0, + "step": 6060 + }, + { + "epoch": 0.6656050955414012, + "grad_norm": 1.819075345993042, + "learning_rate": 5e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.720671534538269, + "num_tokens": 156855338.0, + "step": 6061 + }, + { + "epoch": 0.6657149132440149, + "grad_norm": 1.6112455129623413, + "learning_rate": 5e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7078653573989868, + "num_tokens": 156889857.0, + "step": 6062 + }, + { + "epoch": 0.6658247309466286, + "grad_norm": 1.8598408699035645, + "learning_rate": 5e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6927292943000793, + "num_tokens": 156915064.0, + "step": 6063 + }, + { + "epoch": 0.6659345486492423, + "grad_norm": 1.6885583400726318, + "learning_rate": 5e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6868370771408081, + "num_tokens": 156947423.0, + "step": 6064 + }, + { + "epoch": 0.6660443663518559, + "grad_norm": 1.713271975517273, + "learning_rate": 5e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7056032419204712, + "num_tokens": 156976505.0, + "step": 6065 + }, + { + "epoch": 0.6661541840544696, + "grad_norm": 1.7566691637039185, + "learning_rate": 5e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7173264026641846, + "num_tokens": 157001807.0, + "step": 6066 + }, + { + "epoch": 0.6662640017570832, + "grad_norm": 1.844545841217041, + "learning_rate": 5e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.6990605592727661, + "num_tokens": 157026605.0, + "step": 6067 + }, + { + "epoch": 0.6663738194596969, + "grad_norm": 1.811285376548767, + "learning_rate": 5e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7130305767059326, + "num_tokens": 157053288.0, + "step": 6068 + }, + { + "epoch": 0.6664836371623105, + "grad_norm": 1.9455293416976929, + "learning_rate": 5e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7190172076225281, + "num_tokens": 157076997.0, + "step": 6069 + }, + { + "epoch": 0.6665934548649243, + "grad_norm": 1.715363621711731, + "learning_rate": 5e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6871411800384521, + "num_tokens": 157108803.0, + "step": 6070 + }, + { + "epoch": 0.6667032725675379, + "grad_norm": 1.8334259986877441, + "learning_rate": 5e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7119953632354736, + "num_tokens": 157134871.0, + "step": 6071 + }, + { + "epoch": 0.6668130902701516, + "grad_norm": 1.8272019624710083, + "learning_rate": 5e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7227386236190796, + "num_tokens": 157159305.0, + "step": 6072 + }, + { + "epoch": 0.6669229079727652, + "grad_norm": 1.6427204608917236, + "learning_rate": 5e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6867409944534302, + "num_tokens": 157191098.0, + "step": 6073 + }, + { + "epoch": 0.6670327256753789, + "grad_norm": 1.688687801361084, + "learning_rate": 5e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6903257966041565, + "num_tokens": 157221136.0, + "step": 6074 + }, + { + "epoch": 0.6671425433779925, + "grad_norm": 1.8760491609573364, + "learning_rate": 5e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7010482549667358, + "num_tokens": 157246685.0, + "step": 6075 + }, + { + "epoch": 0.6672523610806061, + "grad_norm": 1.8725671768188477, + "learning_rate": 5e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7048248052597046, + "num_tokens": 157270179.0, + "step": 6076 + }, + { + "epoch": 0.6673621787832199, + "grad_norm": 1.8537038564682007, + "learning_rate": 5e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7107804417610168, + "num_tokens": 157294324.0, + "step": 6077 + }, + { + "epoch": 0.6674719964858336, + "grad_norm": 1.7439825534820557, + "learning_rate": 5e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6976778507232666, + "num_tokens": 157319578.0, + "step": 6078 + }, + { + "epoch": 0.6675818141884472, + "grad_norm": 1.8210608959197998, + "learning_rate": 5e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7149733901023865, + "num_tokens": 157345179.0, + "step": 6079 + }, + { + "epoch": 0.6676916318910608, + "grad_norm": 1.8267860412597656, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7224066257476807, + "num_tokens": 157370426.0, + "step": 6080 + }, + { + "epoch": 0.6678014495936745, + "grad_norm": 1.8944220542907715, + "learning_rate": 5e-06, + "loss": 1.118, + "mean_token_accuracy": 0.6704108715057373, + "num_tokens": 157397418.0, + "step": 6081 + }, + { + "epoch": 0.6679112672962881, + "grad_norm": 2.0531609058380127, + "learning_rate": 5e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6974042057991028, + "num_tokens": 157417951.0, + "step": 6082 + }, + { + "epoch": 0.6680210849989018, + "grad_norm": 1.7960184812545776, + "learning_rate": 5e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.6994501352310181, + "num_tokens": 157445455.0, + "step": 6083 + }, + { + "epoch": 0.6681309027015155, + "grad_norm": 1.6815862655639648, + "learning_rate": 5e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7080361843109131, + "num_tokens": 157475224.0, + "step": 6084 + }, + { + "epoch": 0.6682407204041292, + "grad_norm": 1.8786205053329468, + "learning_rate": 5e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6968594193458557, + "num_tokens": 157499629.0, + "step": 6085 + }, + { + "epoch": 0.6683505381067428, + "grad_norm": 1.6625028848648071, + "learning_rate": 5e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7074160575866699, + "num_tokens": 157529413.0, + "step": 6086 + }, + { + "epoch": 0.6684603558093565, + "grad_norm": 1.9479707479476929, + "learning_rate": 5e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7120779156684875, + "num_tokens": 157551499.0, + "step": 6087 + }, + { + "epoch": 0.6685701735119701, + "grad_norm": 1.9397306442260742, + "learning_rate": 5e-06, + "loss": 0.893, + "mean_token_accuracy": 0.722313642501831, + "num_tokens": 157574550.0, + "step": 6088 + }, + { + "epoch": 0.6686799912145838, + "grad_norm": 1.8993924856185913, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7273736000061035, + "num_tokens": 157597712.0, + "step": 6089 + }, + { + "epoch": 0.6687898089171974, + "grad_norm": 1.8709275722503662, + "learning_rate": 5e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7077316045761108, + "num_tokens": 157623069.0, + "step": 6090 + }, + { + "epoch": 0.6688996266198111, + "grad_norm": 1.9610806703567505, + "learning_rate": 5e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7134910821914673, + "num_tokens": 157648167.0, + "step": 6091 + }, + { + "epoch": 0.6690094443224248, + "grad_norm": 1.8140615224838257, + "learning_rate": 5e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7116507291793823, + "num_tokens": 157675562.0, + "step": 6092 + }, + { + "epoch": 0.6691192620250385, + "grad_norm": 1.765136480331421, + "learning_rate": 5e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7237039804458618, + "num_tokens": 157702348.0, + "step": 6093 + }, + { + "epoch": 0.6692290797276521, + "grad_norm": 1.7855337858200073, + "learning_rate": 5e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6867873668670654, + "num_tokens": 157731256.0, + "step": 6094 + }, + { + "epoch": 0.6693388974302658, + "grad_norm": 1.81251060962677, + "learning_rate": 5e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7120066285133362, + "num_tokens": 157757593.0, + "step": 6095 + }, + { + "epoch": 0.6694487151328794, + "grad_norm": 1.8107359409332275, + "learning_rate": 5e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7151074409484863, + "num_tokens": 157782963.0, + "step": 6096 + }, + { + "epoch": 0.669558532835493, + "grad_norm": 1.9692168235778809, + "learning_rate": 5e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7176309823989868, + "num_tokens": 157805096.0, + "step": 6097 + }, + { + "epoch": 0.6696683505381067, + "grad_norm": 1.7415730953216553, + "learning_rate": 5e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6957244873046875, + "num_tokens": 157833558.0, + "step": 6098 + }, + { + "epoch": 0.6697781682407205, + "grad_norm": 2.053239345550537, + "learning_rate": 5e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.722137451171875, + "num_tokens": 157855035.0, + "step": 6099 + }, + { + "epoch": 0.6698879859433341, + "grad_norm": 1.8092000484466553, + "learning_rate": 5e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7033213376998901, + "num_tokens": 157881424.0, + "step": 6100 + }, + { + "epoch": 0.6699978036459477, + "grad_norm": 1.7428321838378906, + "learning_rate": 5e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7072445154190063, + "num_tokens": 157907073.0, + "step": 6101 + }, + { + "epoch": 0.6701076213485614, + "grad_norm": 1.9011311531066895, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.725165605545044, + "num_tokens": 157930235.0, + "step": 6102 + }, + { + "epoch": 0.670217439051175, + "grad_norm": 1.834995985031128, + "learning_rate": 5e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6857246160507202, + "num_tokens": 157958208.0, + "step": 6103 + }, + { + "epoch": 0.6703272567537887, + "grad_norm": 1.8822342157363892, + "learning_rate": 5e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7132042050361633, + "num_tokens": 157981728.0, + "step": 6104 + }, + { + "epoch": 0.6704370744564023, + "grad_norm": 1.9186294078826904, + "learning_rate": 5e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6838653683662415, + "num_tokens": 158007148.0, + "step": 6105 + }, + { + "epoch": 0.6705468921590161, + "grad_norm": 2.0417397022247314, + "learning_rate": 5e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7029608488082886, + "num_tokens": 158029126.0, + "step": 6106 + }, + { + "epoch": 0.6706567098616297, + "grad_norm": 1.808373212814331, + "learning_rate": 5e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7114242911338806, + "num_tokens": 158058428.0, + "step": 6107 + }, + { + "epoch": 0.6707665275642434, + "grad_norm": 1.8643245697021484, + "learning_rate": 5e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.6998389959335327, + "num_tokens": 158081264.0, + "step": 6108 + }, + { + "epoch": 0.670876345266857, + "grad_norm": 1.8508176803588867, + "learning_rate": 5e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.717987060546875, + "num_tokens": 158106447.0, + "step": 6109 + }, + { + "epoch": 0.6709861629694707, + "grad_norm": 1.7588832378387451, + "learning_rate": 5e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6837888956069946, + "num_tokens": 158135215.0, + "step": 6110 + }, + { + "epoch": 0.6710959806720843, + "grad_norm": 1.9824484586715698, + "learning_rate": 5e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7177678346633911, + "num_tokens": 158156070.0, + "step": 6111 + }, + { + "epoch": 0.671205798374698, + "grad_norm": 1.9645423889160156, + "learning_rate": 5e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6833506226539612, + "num_tokens": 158180387.0, + "step": 6112 + }, + { + "epoch": 0.6713156160773117, + "grad_norm": 1.870734453201294, + "learning_rate": 5e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6972846984863281, + "num_tokens": 158207320.0, + "step": 6113 + }, + { + "epoch": 0.6714254337799254, + "grad_norm": 1.7402139902114868, + "learning_rate": 5e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6727961301803589, + "num_tokens": 158234962.0, + "step": 6114 + }, + { + "epoch": 0.671535251482539, + "grad_norm": 1.9469166994094849, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7144668102264404, + "num_tokens": 158257367.0, + "step": 6115 + }, + { + "epoch": 0.6716450691851527, + "grad_norm": 2.04032301902771, + "learning_rate": 5e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7082958221435547, + "num_tokens": 158278725.0, + "step": 6116 + }, + { + "epoch": 0.6717548868877663, + "grad_norm": 2.090562343597412, + "learning_rate": 5e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7041164636611938, + "num_tokens": 158299840.0, + "step": 6117 + }, + { + "epoch": 0.67186470459038, + "grad_norm": 1.5682587623596191, + "learning_rate": 5e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6887510418891907, + "num_tokens": 158332405.0, + "step": 6118 + }, + { + "epoch": 0.6719745222929936, + "grad_norm": 1.6586370468139648, + "learning_rate": 5e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7248141169548035, + "num_tokens": 158361798.0, + "step": 6119 + }, + { + "epoch": 0.6720843399956072, + "grad_norm": 1.7311421632766724, + "learning_rate": 5e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6842946410179138, + "num_tokens": 158390410.0, + "step": 6120 + }, + { + "epoch": 0.672194157698221, + "grad_norm": 1.8805763721466064, + "learning_rate": 5e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7007749080657959, + "num_tokens": 158415603.0, + "step": 6121 + }, + { + "epoch": 0.6723039754008346, + "grad_norm": 1.7912465333938599, + "learning_rate": 5e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6771938800811768, + "num_tokens": 158441986.0, + "step": 6122 + }, + { + "epoch": 0.6724137931034483, + "grad_norm": 2.019481658935547, + "learning_rate": 5e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7115874290466309, + "num_tokens": 158461514.0, + "step": 6123 + }, + { + "epoch": 0.6725236108060619, + "grad_norm": 1.738909363746643, + "learning_rate": 5e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7080789804458618, + "num_tokens": 158488505.0, + "step": 6124 + }, + { + "epoch": 0.6726334285086756, + "grad_norm": 1.8679369688034058, + "learning_rate": 5e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.715268611907959, + "num_tokens": 158512359.0, + "step": 6125 + }, + { + "epoch": 0.6727432462112892, + "grad_norm": 2.0673892498016357, + "learning_rate": 5e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.708020806312561, + "num_tokens": 158533208.0, + "step": 6126 + }, + { + "epoch": 0.6728530639139029, + "grad_norm": 1.8269582986831665, + "learning_rate": 5e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7019205689430237, + "num_tokens": 158559370.0, + "step": 6127 + }, + { + "epoch": 0.6729628816165166, + "grad_norm": 1.917500376701355, + "learning_rate": 5e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7097225189208984, + "num_tokens": 158582401.0, + "step": 6128 + }, + { + "epoch": 0.6730726993191303, + "grad_norm": 1.8514654636383057, + "learning_rate": 5e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7051610350608826, + "num_tokens": 158610150.0, + "step": 6129 + }, + { + "epoch": 0.6731825170217439, + "grad_norm": 1.8623147010803223, + "learning_rate": 5e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7214621305465698, + "num_tokens": 158638403.0, + "step": 6130 + }, + { + "epoch": 0.6732923347243576, + "grad_norm": 1.7587002515792847, + "learning_rate": 5e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7070579528808594, + "num_tokens": 158666266.0, + "step": 6131 + }, + { + "epoch": 0.6734021524269712, + "grad_norm": 1.8774464130401611, + "learning_rate": 5e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7152488231658936, + "num_tokens": 158690962.0, + "step": 6132 + }, + { + "epoch": 0.6735119701295849, + "grad_norm": 1.9834349155426025, + "learning_rate": 5e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7215489149093628, + "num_tokens": 158713357.0, + "step": 6133 + }, + { + "epoch": 0.6736217878321985, + "grad_norm": 1.9037939310073853, + "learning_rate": 5e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7096564173698425, + "num_tokens": 158737718.0, + "step": 6134 + }, + { + "epoch": 0.6737316055348123, + "grad_norm": 2.1109683513641357, + "learning_rate": 5e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7101789712905884, + "num_tokens": 158757757.0, + "step": 6135 + }, + { + "epoch": 0.6738414232374259, + "grad_norm": 1.9912114143371582, + "learning_rate": 5e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7111190557479858, + "num_tokens": 158780143.0, + "step": 6136 + }, + { + "epoch": 0.6739512409400396, + "grad_norm": 1.9178731441497803, + "learning_rate": 5e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7207543849945068, + "num_tokens": 158801539.0, + "step": 6137 + }, + { + "epoch": 0.6740610586426532, + "grad_norm": 1.9584482908248901, + "learning_rate": 5e-06, + "loss": 1.008, + "mean_token_accuracy": 0.700148344039917, + "num_tokens": 158824697.0, + "step": 6138 + }, + { + "epoch": 0.6741708763452668, + "grad_norm": 1.7161906957626343, + "learning_rate": 5e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7241157293319702, + "num_tokens": 158854273.0, + "step": 6139 + }, + { + "epoch": 0.6742806940478805, + "grad_norm": 1.7488173246383667, + "learning_rate": 5e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6934738159179688, + "num_tokens": 158883062.0, + "step": 6140 + }, + { + "epoch": 0.6743905117504941, + "grad_norm": 1.694675326347351, + "learning_rate": 5e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6831242442131042, + "num_tokens": 158914155.0, + "step": 6141 + }, + { + "epoch": 0.6745003294531079, + "grad_norm": 2.0301003456115723, + "learning_rate": 5e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7160170078277588, + "num_tokens": 158938359.0, + "step": 6142 + }, + { + "epoch": 0.6746101471557215, + "grad_norm": 1.6955676078796387, + "learning_rate": 5e-06, + "loss": 0.983, + "mean_token_accuracy": 0.6983418464660645, + "num_tokens": 158969518.0, + "step": 6143 + }, + { + "epoch": 0.6747199648583352, + "grad_norm": 2.047915458679199, + "learning_rate": 5e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.704742968082428, + "num_tokens": 158991919.0, + "step": 6144 + }, + { + "epoch": 0.6748297825609488, + "grad_norm": 1.9752824306488037, + "learning_rate": 5e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7200862169265747, + "num_tokens": 159013486.0, + "step": 6145 + }, + { + "epoch": 0.6749396002635625, + "grad_norm": 1.8970564603805542, + "learning_rate": 5e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.715851366519928, + "num_tokens": 159038247.0, + "step": 6146 + }, + { + "epoch": 0.6750494179661761, + "grad_norm": 1.797882318496704, + "learning_rate": 5e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6992153525352478, + "num_tokens": 159065981.0, + "step": 6147 + }, + { + "epoch": 0.6751592356687898, + "grad_norm": 1.7850217819213867, + "learning_rate": 5e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.707236111164093, + "num_tokens": 159092632.0, + "step": 6148 + }, + { + "epoch": 0.6752690533714034, + "grad_norm": 1.8830257654190063, + "learning_rate": 5e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.6985030174255371, + "num_tokens": 159116358.0, + "step": 6149 + }, + { + "epoch": 0.6753788710740172, + "grad_norm": 1.8394135236740112, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7181301116943359, + "num_tokens": 159141073.0, + "step": 6150 + }, + { + "epoch": 0.6754886887766308, + "grad_norm": 1.8309309482574463, + "learning_rate": 5e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6990904808044434, + "num_tokens": 159165055.0, + "step": 6151 + }, + { + "epoch": 0.6755985064792445, + "grad_norm": 1.7992593050003052, + "learning_rate": 5e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7112160921096802, + "num_tokens": 159189804.0, + "step": 6152 + }, + { + "epoch": 0.6757083241818581, + "grad_norm": 1.7087078094482422, + "learning_rate": 5e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7062540054321289, + "num_tokens": 159218042.0, + "step": 6153 + }, + { + "epoch": 0.6758181418844718, + "grad_norm": 2.0654664039611816, + "learning_rate": 5e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6862878799438477, + "num_tokens": 159241709.0, + "step": 6154 + }, + { + "epoch": 0.6759279595870854, + "grad_norm": 1.8442108631134033, + "learning_rate": 5e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7023320198059082, + "num_tokens": 159268270.0, + "step": 6155 + }, + { + "epoch": 0.676037777289699, + "grad_norm": 1.7505173683166504, + "learning_rate": 5e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7037979364395142, + "num_tokens": 159296143.0, + "step": 6156 + }, + { + "epoch": 0.6761475949923128, + "grad_norm": 1.7674974203109741, + "learning_rate": 5e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6926283240318298, + "num_tokens": 159322772.0, + "step": 6157 + }, + { + "epoch": 0.6762574126949265, + "grad_norm": 1.8531925678253174, + "learning_rate": 5e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7268980741500854, + "num_tokens": 159346199.0, + "step": 6158 + }, + { + "epoch": 0.6763672303975401, + "grad_norm": 1.7405357360839844, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6942087411880493, + "num_tokens": 159375164.0, + "step": 6159 + }, + { + "epoch": 0.6764770481001537, + "grad_norm": 2.225907325744629, + "learning_rate": 5e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7154238224029541, + "num_tokens": 159393180.0, + "step": 6160 + }, + { + "epoch": 0.6765868658027674, + "grad_norm": 1.6804654598236084, + "learning_rate": 5e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7131826877593994, + "num_tokens": 159423173.0, + "step": 6161 + }, + { + "epoch": 0.676696683505381, + "grad_norm": 2.055528402328491, + "learning_rate": 5e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.6836855411529541, + "num_tokens": 159443766.0, + "step": 6162 + }, + { + "epoch": 0.6768065012079947, + "grad_norm": 1.9691486358642578, + "learning_rate": 5e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7176669836044312, + "num_tokens": 159465563.0, + "step": 6163 + }, + { + "epoch": 0.6769163189106084, + "grad_norm": 1.8710005283355713, + "learning_rate": 5e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.6964948177337646, + "num_tokens": 159490006.0, + "step": 6164 + }, + { + "epoch": 0.6770261366132221, + "grad_norm": 1.8341090679168701, + "learning_rate": 5e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7045694589614868, + "num_tokens": 159514473.0, + "step": 6165 + }, + { + "epoch": 0.6771359543158357, + "grad_norm": 1.8361291885375977, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7273430824279785, + "num_tokens": 159540968.0, + "step": 6166 + }, + { + "epoch": 0.6772457720184494, + "grad_norm": 1.7633063793182373, + "learning_rate": 5e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7147789597511292, + "num_tokens": 159568132.0, + "step": 6167 + }, + { + "epoch": 0.677355589721063, + "grad_norm": 1.7577531337738037, + "learning_rate": 5e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7167800664901733, + "num_tokens": 159594637.0, + "step": 6168 + }, + { + "epoch": 0.6774654074236767, + "grad_norm": 1.8620781898498535, + "learning_rate": 5e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7112804651260376, + "num_tokens": 159618305.0, + "step": 6169 + }, + { + "epoch": 0.6775752251262903, + "grad_norm": 2.069516181945801, + "learning_rate": 5e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.7027645111083984, + "num_tokens": 159641051.0, + "step": 6170 + }, + { + "epoch": 0.6776850428289041, + "grad_norm": 1.6191498041152954, + "learning_rate": 5e-06, + "loss": 1.086, + "mean_token_accuracy": 0.6689153909683228, + "num_tokens": 159674659.0, + "step": 6171 + }, + { + "epoch": 0.6777948605315177, + "grad_norm": 2.0968830585479736, + "learning_rate": 5e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7218322157859802, + "num_tokens": 159695877.0, + "step": 6172 + }, + { + "epoch": 0.6779046782341314, + "grad_norm": 1.7595813274383545, + "learning_rate": 5e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7119306325912476, + "num_tokens": 159722667.0, + "step": 6173 + }, + { + "epoch": 0.678014495936745, + "grad_norm": 1.880570888519287, + "learning_rate": 5e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7115217447280884, + "num_tokens": 159747151.0, + "step": 6174 + }, + { + "epoch": 0.6781243136393587, + "grad_norm": 1.9343831539154053, + "learning_rate": 5e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7144654989242554, + "num_tokens": 159769879.0, + "step": 6175 + }, + { + "epoch": 0.6782341313419723, + "grad_norm": 1.6676706075668335, + "learning_rate": 5e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6921676993370056, + "num_tokens": 159800687.0, + "step": 6176 + }, + { + "epoch": 0.678343949044586, + "grad_norm": 1.9554567337036133, + "learning_rate": 5e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7166274785995483, + "num_tokens": 159822986.0, + "step": 6177 + }, + { + "epoch": 0.6784537667471996, + "grad_norm": 1.8582688570022583, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7189964056015015, + "num_tokens": 159845813.0, + "step": 6178 + }, + { + "epoch": 0.6785635844498134, + "grad_norm": 1.781335711479187, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7363331317901611, + "num_tokens": 159871438.0, + "step": 6179 + }, + { + "epoch": 0.678673402152427, + "grad_norm": 1.6904598474502563, + "learning_rate": 5e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.704272449016571, + "num_tokens": 159902126.0, + "step": 6180 + }, + { + "epoch": 0.6787832198550406, + "grad_norm": 1.8689119815826416, + "learning_rate": 5e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7241319417953491, + "num_tokens": 159927635.0, + "step": 6181 + }, + { + "epoch": 0.6788930375576543, + "grad_norm": 1.8760113716125488, + "learning_rate": 5e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7080661058425903, + "num_tokens": 159950433.0, + "step": 6182 + }, + { + "epoch": 0.6790028552602679, + "grad_norm": 1.7740544080734253, + "learning_rate": 5e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7005215883255005, + "num_tokens": 159977952.0, + "step": 6183 + }, + { + "epoch": 0.6791126729628816, + "grad_norm": 1.7675707340240479, + "learning_rate": 5e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7061008214950562, + "num_tokens": 160003505.0, + "step": 6184 + }, + { + "epoch": 0.6792224906654952, + "grad_norm": 1.8359112739562988, + "learning_rate": 5e-06, + "loss": 1.112, + "mean_token_accuracy": 0.671640157699585, + "num_tokens": 160031398.0, + "step": 6185 + }, + { + "epoch": 0.679332308368109, + "grad_norm": 1.871415138244629, + "learning_rate": 5e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6953941583633423, + "num_tokens": 160055594.0, + "step": 6186 + }, + { + "epoch": 0.6794421260707226, + "grad_norm": 1.9360973834991455, + "learning_rate": 5e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7232187390327454, + "num_tokens": 160078525.0, + "step": 6187 + }, + { + "epoch": 0.6795519437733363, + "grad_norm": 1.6377111673355103, + "learning_rate": 5e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7290529012680054, + "num_tokens": 160105826.0, + "step": 6188 + }, + { + "epoch": 0.6796617614759499, + "grad_norm": 1.888338565826416, + "learning_rate": 5e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7074170708656311, + "num_tokens": 160129144.0, + "step": 6189 + }, + { + "epoch": 0.6797715791785636, + "grad_norm": 1.7908613681793213, + "learning_rate": 5e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6925235390663147, + "num_tokens": 160155238.0, + "step": 6190 + }, + { + "epoch": 0.6798813968811772, + "grad_norm": 1.7036181688308716, + "learning_rate": 5e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7152007222175598, + "num_tokens": 160183913.0, + "step": 6191 + }, + { + "epoch": 0.6799912145837909, + "grad_norm": 1.918914794921875, + "learning_rate": 5e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7101430892944336, + "num_tokens": 160206677.0, + "step": 6192 + }, + { + "epoch": 0.6801010322864046, + "grad_norm": 1.7781627178192139, + "learning_rate": 5e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7087416648864746, + "num_tokens": 160233346.0, + "step": 6193 + }, + { + "epoch": 0.6802108499890183, + "grad_norm": 1.6405725479125977, + "learning_rate": 5e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6829236745834351, + "num_tokens": 160269660.0, + "step": 6194 + }, + { + "epoch": 0.6803206676916319, + "grad_norm": 1.9454210996627808, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7059817314147949, + "num_tokens": 160291986.0, + "step": 6195 + }, + { + "epoch": 0.6804304853942456, + "grad_norm": 1.935524821281433, + "learning_rate": 5e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7103652358055115, + "num_tokens": 160316646.0, + "step": 6196 + }, + { + "epoch": 0.6805403030968592, + "grad_norm": 1.828551173210144, + "learning_rate": 5e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7085131406784058, + "num_tokens": 160341302.0, + "step": 6197 + }, + { + "epoch": 0.6806501207994728, + "grad_norm": 1.8547194004058838, + "learning_rate": 5e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6974236965179443, + "num_tokens": 160367219.0, + "step": 6198 + }, + { + "epoch": 0.6807599385020865, + "grad_norm": 1.8677239418029785, + "learning_rate": 5e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7054864168167114, + "num_tokens": 160390979.0, + "step": 6199 + }, + { + "epoch": 0.6808697562047002, + "grad_norm": 2.000619411468506, + "learning_rate": 5e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7123903632164001, + "num_tokens": 160414484.0, + "step": 6200 + }, + { + "epoch": 0.6809795739073139, + "grad_norm": 1.6644659042358398, + "learning_rate": 5e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.7020354270935059, + "num_tokens": 160442958.0, + "step": 6201 + }, + { + "epoch": 0.6810893916099275, + "grad_norm": 1.850096344947815, + "learning_rate": 5e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6907442212104797, + "num_tokens": 160468517.0, + "step": 6202 + }, + { + "epoch": 0.6811992093125412, + "grad_norm": 1.8102266788482666, + "learning_rate": 5e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7020382881164551, + "num_tokens": 160498884.0, + "step": 6203 + }, + { + "epoch": 0.6813090270151548, + "grad_norm": 1.6932297945022583, + "learning_rate": 5e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7097894549369812, + "num_tokens": 160526653.0, + "step": 6204 + }, + { + "epoch": 0.6814188447177685, + "grad_norm": 1.8983523845672607, + "learning_rate": 5e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6828961968421936, + "num_tokens": 160552358.0, + "step": 6205 + }, + { + "epoch": 0.6815286624203821, + "grad_norm": 1.9192780256271362, + "learning_rate": 5e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7238558530807495, + "num_tokens": 160574165.0, + "step": 6206 + }, + { + "epoch": 0.6816384801229959, + "grad_norm": 1.9720561504364014, + "learning_rate": 5e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.703842282295227, + "num_tokens": 160595020.0, + "step": 6207 + }, + { + "epoch": 0.6817482978256095, + "grad_norm": 1.6659796237945557, + "learning_rate": 5e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7087322473526001, + "num_tokens": 160623973.0, + "step": 6208 + }, + { + "epoch": 0.6818581155282232, + "grad_norm": 1.8118679523468018, + "learning_rate": 5e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7046029567718506, + "num_tokens": 160651404.0, + "step": 6209 + }, + { + "epoch": 0.6819679332308368, + "grad_norm": 1.6980851888656616, + "learning_rate": 5e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6929950714111328, + "num_tokens": 160681987.0, + "step": 6210 + }, + { + "epoch": 0.6820777509334505, + "grad_norm": 1.8327277898788452, + "learning_rate": 5e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7043193578720093, + "num_tokens": 160708776.0, + "step": 6211 + }, + { + "epoch": 0.6821875686360641, + "grad_norm": 1.888672113418579, + "learning_rate": 5e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.704333484172821, + "num_tokens": 160732648.0, + "step": 6212 + }, + { + "epoch": 0.6822973863386778, + "grad_norm": 1.9302754402160645, + "learning_rate": 5e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7208589315414429, + "num_tokens": 160757157.0, + "step": 6213 + }, + { + "epoch": 0.6824072040412914, + "grad_norm": 1.9250319004058838, + "learning_rate": 5e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7095564603805542, + "num_tokens": 160781631.0, + "step": 6214 + }, + { + "epoch": 0.6825170217439052, + "grad_norm": 1.8264997005462646, + "learning_rate": 5e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6985336542129517, + "num_tokens": 160807938.0, + "step": 6215 + }, + { + "epoch": 0.6826268394465188, + "grad_norm": 1.816718578338623, + "learning_rate": 5e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7085146307945251, + "num_tokens": 160831841.0, + "step": 6216 + }, + { + "epoch": 0.6827366571491325, + "grad_norm": 1.6739290952682495, + "learning_rate": 5e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7020094394683838, + "num_tokens": 160859847.0, + "step": 6217 + }, + { + "epoch": 0.6828464748517461, + "grad_norm": 1.776987075805664, + "learning_rate": 5e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6869219541549683, + "num_tokens": 160886726.0, + "step": 6218 + }, + { + "epoch": 0.6829562925543597, + "grad_norm": 1.9353734254837036, + "learning_rate": 5e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7116584777832031, + "num_tokens": 160908175.0, + "step": 6219 + }, + { + "epoch": 0.6830661102569734, + "grad_norm": 2.046330213546753, + "learning_rate": 5e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7174738645553589, + "num_tokens": 160928591.0, + "step": 6220 + }, + { + "epoch": 0.683175927959587, + "grad_norm": 2.0332858562469482, + "learning_rate": 5e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.703606128692627, + "num_tokens": 160951641.0, + "step": 6221 + }, + { + "epoch": 0.6832857456622008, + "grad_norm": 1.7561684846878052, + "learning_rate": 5e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7012680768966675, + "num_tokens": 160979258.0, + "step": 6222 + }, + { + "epoch": 0.6833955633648144, + "grad_norm": 2.0235116481781006, + "learning_rate": 5e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.699805736541748, + "num_tokens": 161001609.0, + "step": 6223 + }, + { + "epoch": 0.6835053810674281, + "grad_norm": 1.6208022832870483, + "learning_rate": 5e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6886870861053467, + "num_tokens": 161033575.0, + "step": 6224 + }, + { + "epoch": 0.6836151987700417, + "grad_norm": 1.8041744232177734, + "learning_rate": 5e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7188366651535034, + "num_tokens": 161059718.0, + "step": 6225 + }, + { + "epoch": 0.6837250164726554, + "grad_norm": 1.6626263856887817, + "learning_rate": 5e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7063294053077698, + "num_tokens": 161090509.0, + "step": 6226 + }, + { + "epoch": 0.683834834175269, + "grad_norm": 1.9845843315124512, + "learning_rate": 5e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7261892557144165, + "num_tokens": 161110039.0, + "step": 6227 + }, + { + "epoch": 0.6839446518778827, + "grad_norm": 2.1995046138763428, + "learning_rate": 5e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7194755673408508, + "num_tokens": 161128310.0, + "step": 6228 + }, + { + "epoch": 0.6840544695804964, + "grad_norm": 1.8221673965454102, + "learning_rate": 5e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7126917243003845, + "num_tokens": 161154412.0, + "step": 6229 + }, + { + "epoch": 0.6841642872831101, + "grad_norm": 2.196791172027588, + "learning_rate": 5e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7068506479263306, + "num_tokens": 161176604.0, + "step": 6230 + }, + { + "epoch": 0.6842741049857237, + "grad_norm": 1.8090788125991821, + "learning_rate": 5e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7092914581298828, + "num_tokens": 161202094.0, + "step": 6231 + }, + { + "epoch": 0.6843839226883374, + "grad_norm": 1.7347089052200317, + "learning_rate": 5e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6921501159667969, + "num_tokens": 161229435.0, + "step": 6232 + }, + { + "epoch": 0.684493740390951, + "grad_norm": 1.939010739326477, + "learning_rate": 5e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.70906662940979, + "num_tokens": 161254642.0, + "step": 6233 + }, + { + "epoch": 0.6846035580935647, + "grad_norm": 1.775536060333252, + "learning_rate": 5e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6975816488265991, + "num_tokens": 161282306.0, + "step": 6234 + }, + { + "epoch": 0.6847133757961783, + "grad_norm": 1.9985692501068115, + "learning_rate": 5e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7274539470672607, + "num_tokens": 161302410.0, + "step": 6235 + }, + { + "epoch": 0.6848231934987921, + "grad_norm": 2.009199619293213, + "learning_rate": 5e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7072094678878784, + "num_tokens": 161325999.0, + "step": 6236 + }, + { + "epoch": 0.6849330112014057, + "grad_norm": 2.058333396911621, + "learning_rate": 5e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.715252697467804, + "num_tokens": 161346836.0, + "step": 6237 + }, + { + "epoch": 0.6850428289040194, + "grad_norm": 1.7336012125015259, + "learning_rate": 5e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7076059579849243, + "num_tokens": 161375262.0, + "step": 6238 + }, + { + "epoch": 0.685152646606633, + "grad_norm": 1.9442640542984009, + "learning_rate": 5e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7168093323707581, + "num_tokens": 161398765.0, + "step": 6239 + }, + { + "epoch": 0.6852624643092466, + "grad_norm": 1.9691730737686157, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.724888801574707, + "num_tokens": 161420334.0, + "step": 6240 + }, + { + "epoch": 0.6853722820118603, + "grad_norm": 2.067537784576416, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.715110182762146, + "num_tokens": 161443637.0, + "step": 6241 + }, + { + "epoch": 0.6854820997144739, + "grad_norm": 1.9529293775558472, + "learning_rate": 5e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.698481559753418, + "num_tokens": 161465867.0, + "step": 6242 + }, + { + "epoch": 0.6855919174170876, + "grad_norm": 1.726352572441101, + "learning_rate": 5e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.714429497718811, + "num_tokens": 161492568.0, + "step": 6243 + }, + { + "epoch": 0.6857017351197013, + "grad_norm": 1.7188246250152588, + "learning_rate": 5e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6888436079025269, + "num_tokens": 161520321.0, + "step": 6244 + }, + { + "epoch": 0.685811552822315, + "grad_norm": 1.898494839668274, + "learning_rate": 5e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7042633891105652, + "num_tokens": 161545781.0, + "step": 6245 + }, + { + "epoch": 0.6859213705249286, + "grad_norm": 1.8335367441177368, + "learning_rate": 5e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7156252861022949, + "num_tokens": 161569382.0, + "step": 6246 + }, + { + "epoch": 0.6860311882275423, + "grad_norm": 1.7669183015823364, + "learning_rate": 5e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6981999278068542, + "num_tokens": 161597937.0, + "step": 6247 + }, + { + "epoch": 0.6861410059301559, + "grad_norm": 1.7370525598526, + "learning_rate": 5e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6857402324676514, + "num_tokens": 161626950.0, + "step": 6248 + }, + { + "epoch": 0.6862508236327696, + "grad_norm": 1.8611394166946411, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7017149925231934, + "num_tokens": 161652075.0, + "step": 6249 + }, + { + "epoch": 0.6863606413353832, + "grad_norm": 1.6858628988265991, + "learning_rate": 5e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7038120627403259, + "num_tokens": 161680405.0, + "step": 6250 + }, + { + "epoch": 0.686470459037997, + "grad_norm": 1.8580108880996704, + "learning_rate": 5e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7213423252105713, + "num_tokens": 161704404.0, + "step": 6251 + }, + { + "epoch": 0.6865802767406106, + "grad_norm": 1.9410426616668701, + "learning_rate": 5e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7025808691978455, + "num_tokens": 161728411.0, + "step": 6252 + }, + { + "epoch": 0.6866900944432243, + "grad_norm": 1.8164925575256348, + "learning_rate": 5e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6941831111907959, + "num_tokens": 161754052.0, + "step": 6253 + }, + { + "epoch": 0.6867999121458379, + "grad_norm": 1.8860433101654053, + "learning_rate": 5e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7054344415664673, + "num_tokens": 161777780.0, + "step": 6254 + }, + { + "epoch": 0.6869097298484516, + "grad_norm": 1.9117796421051025, + "learning_rate": 5e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7334171533584595, + "num_tokens": 161800759.0, + "step": 6255 + }, + { + "epoch": 0.6870195475510652, + "grad_norm": 1.9408314228057861, + "learning_rate": 5e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7015936374664307, + "num_tokens": 161823214.0, + "step": 6256 + }, + { + "epoch": 0.6871293652536788, + "grad_norm": 1.903778314590454, + "learning_rate": 5e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6939140558242798, + "num_tokens": 161847975.0, + "step": 6257 + }, + { + "epoch": 0.6872391829562926, + "grad_norm": 1.9811393022537231, + "learning_rate": 5e-06, + "loss": 0.937, + "mean_token_accuracy": 0.720784068107605, + "num_tokens": 161871244.0, + "step": 6258 + }, + { + "epoch": 0.6873490006589063, + "grad_norm": 1.823838233947754, + "learning_rate": 5e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7432988286018372, + "num_tokens": 161894231.0, + "step": 6259 + }, + { + "epoch": 0.6874588183615199, + "grad_norm": 1.8426144123077393, + "learning_rate": 5e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7108270525932312, + "num_tokens": 161920884.0, + "step": 6260 + }, + { + "epoch": 0.6875686360641335, + "grad_norm": 1.7794828414916992, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7222069501876831, + "num_tokens": 161948395.0, + "step": 6261 + }, + { + "epoch": 0.6876784537667472, + "grad_norm": 1.8564043045043945, + "learning_rate": 5e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7149949669837952, + "num_tokens": 161973840.0, + "step": 6262 + }, + { + "epoch": 0.6877882714693608, + "grad_norm": 2.0127432346343994, + "learning_rate": 5e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7261961698532104, + "num_tokens": 161993652.0, + "step": 6263 + }, + { + "epoch": 0.6878980891719745, + "grad_norm": 1.6861886978149414, + "learning_rate": 5e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7136116623878479, + "num_tokens": 162023479.0, + "step": 6264 + }, + { + "epoch": 0.6880079068745882, + "grad_norm": 1.7220691442489624, + "learning_rate": 5e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.691851019859314, + "num_tokens": 162053040.0, + "step": 6265 + }, + { + "epoch": 0.6881177245772019, + "grad_norm": 1.5942226648330688, + "learning_rate": 5e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6805007457733154, + "num_tokens": 162086330.0, + "step": 6266 + }, + { + "epoch": 0.6882275422798155, + "grad_norm": 1.7195206880569458, + "learning_rate": 5e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6976903676986694, + "num_tokens": 162115143.0, + "step": 6267 + }, + { + "epoch": 0.6883373599824292, + "grad_norm": 1.9914737939834595, + "learning_rate": 5e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.694028913974762, + "num_tokens": 162138548.0, + "step": 6268 + }, + { + "epoch": 0.6884471776850428, + "grad_norm": 1.5985242128372192, + "learning_rate": 5e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6996233463287354, + "num_tokens": 162170093.0, + "step": 6269 + }, + { + "epoch": 0.6885569953876565, + "grad_norm": 1.8694895505905151, + "learning_rate": 5e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6763197183609009, + "num_tokens": 162199827.0, + "step": 6270 + }, + { + "epoch": 0.6886668130902701, + "grad_norm": 1.7027896642684937, + "learning_rate": 5e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7185367345809937, + "num_tokens": 162228694.0, + "step": 6271 + }, + { + "epoch": 0.6887766307928838, + "grad_norm": 1.8847726583480835, + "learning_rate": 5e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7033877372741699, + "num_tokens": 162253222.0, + "step": 6272 + }, + { + "epoch": 0.6888864484954975, + "grad_norm": 1.8891938924789429, + "learning_rate": 5e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6970794796943665, + "num_tokens": 162279671.0, + "step": 6273 + }, + { + "epoch": 0.6889962661981112, + "grad_norm": 1.6755727529525757, + "learning_rate": 5e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6935586929321289, + "num_tokens": 162310843.0, + "step": 6274 + }, + { + "epoch": 0.6891060839007248, + "grad_norm": 1.7220778465270996, + "learning_rate": 5e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7110167741775513, + "num_tokens": 162335963.0, + "step": 6275 + }, + { + "epoch": 0.6892159016033385, + "grad_norm": 1.9089763164520264, + "learning_rate": 5e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7085986137390137, + "num_tokens": 162359464.0, + "step": 6276 + }, + { + "epoch": 0.6893257193059521, + "grad_norm": 1.7536646127700806, + "learning_rate": 5e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6890268921852112, + "num_tokens": 162389980.0, + "step": 6277 + }, + { + "epoch": 0.6894355370085657, + "grad_norm": 1.907368779182434, + "learning_rate": 5e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6901791095733643, + "num_tokens": 162415144.0, + "step": 6278 + }, + { + "epoch": 0.6895453547111794, + "grad_norm": 1.7012219429016113, + "learning_rate": 5e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.6974551677703857, + "num_tokens": 162444801.0, + "step": 6279 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 1.8731296062469482, + "learning_rate": 5e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7101595401763916, + "num_tokens": 162468239.0, + "step": 6280 + }, + { + "epoch": 0.6897649901164068, + "grad_norm": 1.7744402885437012, + "learning_rate": 5e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7026972770690918, + "num_tokens": 162494232.0, + "step": 6281 + }, + { + "epoch": 0.6898748078190204, + "grad_norm": 1.755787968635559, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7065115571022034, + "num_tokens": 162520748.0, + "step": 6282 + }, + { + "epoch": 0.6899846255216341, + "grad_norm": 1.691299319267273, + "learning_rate": 5e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.718330442905426, + "num_tokens": 162548615.0, + "step": 6283 + }, + { + "epoch": 0.6900944432242477, + "grad_norm": 1.7151981592178345, + "learning_rate": 5e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6846837997436523, + "num_tokens": 162576653.0, + "step": 6284 + }, + { + "epoch": 0.6902042609268614, + "grad_norm": 1.8844630718231201, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7161710858345032, + "num_tokens": 162599551.0, + "step": 6285 + }, + { + "epoch": 0.690314078629475, + "grad_norm": 2.093505382537842, + "learning_rate": 5e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7255080938339233, + "num_tokens": 162619766.0, + "step": 6286 + }, + { + "epoch": 0.6904238963320888, + "grad_norm": 1.6983798742294312, + "learning_rate": 5e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7044395208358765, + "num_tokens": 162648637.0, + "step": 6287 + }, + { + "epoch": 0.6905337140347024, + "grad_norm": 1.750317931175232, + "learning_rate": 5e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7133802771568298, + "num_tokens": 162673396.0, + "step": 6288 + }, + { + "epoch": 0.6906435317373161, + "grad_norm": 1.9172314405441284, + "learning_rate": 5e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7166180610656738, + "num_tokens": 162696354.0, + "step": 6289 + }, + { + "epoch": 0.6907533494399297, + "grad_norm": 1.7142748832702637, + "learning_rate": 5e-06, + "loss": 0.972, + "mean_token_accuracy": 0.6985033750534058, + "num_tokens": 162724141.0, + "step": 6290 + }, + { + "epoch": 0.6908631671425434, + "grad_norm": 1.7375904321670532, + "learning_rate": 5e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7143130898475647, + "num_tokens": 162751342.0, + "step": 6291 + }, + { + "epoch": 0.690972984845157, + "grad_norm": 1.7608002424240112, + "learning_rate": 5e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.698478102684021, + "num_tokens": 162779901.0, + "step": 6292 + }, + { + "epoch": 0.6910828025477707, + "grad_norm": 1.552618145942688, + "learning_rate": 5e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7150030136108398, + "num_tokens": 162811113.0, + "step": 6293 + }, + { + "epoch": 0.6911926202503844, + "grad_norm": 2.063364267349243, + "learning_rate": 5e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7288118004798889, + "num_tokens": 162830526.0, + "step": 6294 + }, + { + "epoch": 0.6913024379529981, + "grad_norm": 1.7139155864715576, + "learning_rate": 5e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6924736499786377, + "num_tokens": 162860166.0, + "step": 6295 + }, + { + "epoch": 0.6914122556556117, + "grad_norm": 1.8396961688995361, + "learning_rate": 5e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7096282243728638, + "num_tokens": 162884731.0, + "step": 6296 + }, + { + "epoch": 0.6915220733582254, + "grad_norm": 1.90700364112854, + "learning_rate": 5e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7187677621841431, + "num_tokens": 162906062.0, + "step": 6297 + }, + { + "epoch": 0.691631891060839, + "grad_norm": 1.8548862934112549, + "learning_rate": 5e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7030134201049805, + "num_tokens": 162932918.0, + "step": 6298 + }, + { + "epoch": 0.6917417087634526, + "grad_norm": 1.792658805847168, + "learning_rate": 5e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7135691046714783, + "num_tokens": 162960488.0, + "step": 6299 + }, + { + "epoch": 0.6918515264660663, + "grad_norm": 1.7794122695922852, + "learning_rate": 5e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7055566310882568, + "num_tokens": 162987022.0, + "step": 6300 + }, + { + "epoch": 0.6919613441686799, + "grad_norm": 1.730459451675415, + "learning_rate": 5e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7206655144691467, + "num_tokens": 163013699.0, + "step": 6301 + }, + { + "epoch": 0.6920711618712937, + "grad_norm": 1.9556727409362793, + "learning_rate": 5e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6988271474838257, + "num_tokens": 163038253.0, + "step": 6302 + }, + { + "epoch": 0.6921809795739073, + "grad_norm": 1.7328578233718872, + "learning_rate": 5e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6937757730484009, + "num_tokens": 163066517.0, + "step": 6303 + }, + { + "epoch": 0.692290797276521, + "grad_norm": 1.6101897954940796, + "learning_rate": 5e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7310523390769958, + "num_tokens": 163095701.0, + "step": 6304 + }, + { + "epoch": 0.6924006149791346, + "grad_norm": 1.7778501510620117, + "learning_rate": 5e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7101340293884277, + "num_tokens": 163122406.0, + "step": 6305 + }, + { + "epoch": 0.6925104326817483, + "grad_norm": 1.8310033082962036, + "learning_rate": 5e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6815536022186279, + "num_tokens": 163148740.0, + "step": 6306 + }, + { + "epoch": 0.6926202503843619, + "grad_norm": 1.675197720527649, + "learning_rate": 5e-06, + "loss": 0.967, + "mean_token_accuracy": 0.708316445350647, + "num_tokens": 163179622.0, + "step": 6307 + }, + { + "epoch": 0.6927300680869756, + "grad_norm": 1.8473728895187378, + "learning_rate": 5e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7305946350097656, + "num_tokens": 163203922.0, + "step": 6308 + }, + { + "epoch": 0.6928398857895893, + "grad_norm": 1.9074128866195679, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7464970350265503, + "num_tokens": 163225043.0, + "step": 6309 + }, + { + "epoch": 0.692949703492203, + "grad_norm": 1.9118958711624146, + "learning_rate": 5e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6786596775054932, + "num_tokens": 163252018.0, + "step": 6310 + }, + { + "epoch": 0.6930595211948166, + "grad_norm": 1.7501168251037598, + "learning_rate": 5e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7086289525032043, + "num_tokens": 163279468.0, + "step": 6311 + }, + { + "epoch": 0.6931693388974303, + "grad_norm": 2.092029333114624, + "learning_rate": 5e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.6977330446243286, + "num_tokens": 163300193.0, + "step": 6312 + }, + { + "epoch": 0.6932791566000439, + "grad_norm": 1.6604825258255005, + "learning_rate": 5e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6944984793663025, + "num_tokens": 163331999.0, + "step": 6313 + }, + { + "epoch": 0.6933889743026576, + "grad_norm": 1.8630810976028442, + "learning_rate": 5e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6879118084907532, + "num_tokens": 163357563.0, + "step": 6314 + }, + { + "epoch": 0.6934987920052712, + "grad_norm": 1.8463568687438965, + "learning_rate": 5e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7313799262046814, + "num_tokens": 163379990.0, + "step": 6315 + }, + { + "epoch": 0.693608609707885, + "grad_norm": 1.850774884223938, + "learning_rate": 5e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.7005301713943481, + "num_tokens": 163407949.0, + "step": 6316 + }, + { + "epoch": 0.6937184274104986, + "grad_norm": 2.0633702278137207, + "learning_rate": 5e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7101001739501953, + "num_tokens": 163430266.0, + "step": 6317 + }, + { + "epoch": 0.6938282451131123, + "grad_norm": 1.873160719871521, + "learning_rate": 5e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6825383901596069, + "num_tokens": 163455874.0, + "step": 6318 + }, + { + "epoch": 0.6939380628157259, + "grad_norm": 1.9551680088043213, + "learning_rate": 5e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7011752128601074, + "num_tokens": 163478259.0, + "step": 6319 + }, + { + "epoch": 0.6940478805183395, + "grad_norm": 1.7978678941726685, + "learning_rate": 5e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7096389532089233, + "num_tokens": 163503487.0, + "step": 6320 + }, + { + "epoch": 0.6941576982209532, + "grad_norm": 1.8640615940093994, + "learning_rate": 5e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7095149159431458, + "num_tokens": 163530690.0, + "step": 6321 + }, + { + "epoch": 0.6942675159235668, + "grad_norm": 1.9416269063949585, + "learning_rate": 5e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7050483226776123, + "num_tokens": 163557072.0, + "step": 6322 + }, + { + "epoch": 0.6943773336261806, + "grad_norm": 1.663356900215149, + "learning_rate": 5e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7300944328308105, + "num_tokens": 163586868.0, + "step": 6323 + }, + { + "epoch": 0.6944871513287942, + "grad_norm": 1.9334588050842285, + "learning_rate": 5e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7061874866485596, + "num_tokens": 163612762.0, + "step": 6324 + }, + { + "epoch": 0.6945969690314079, + "grad_norm": 2.171142101287842, + "learning_rate": 5e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7238660454750061, + "num_tokens": 163631060.0, + "step": 6325 + }, + { + "epoch": 0.6947067867340215, + "grad_norm": 2.0004775524139404, + "learning_rate": 5e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.711128830909729, + "num_tokens": 163652314.0, + "step": 6326 + }, + { + "epoch": 0.6948166044366352, + "grad_norm": 1.7960636615753174, + "learning_rate": 5e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7139784097671509, + "num_tokens": 163676680.0, + "step": 6327 + }, + { + "epoch": 0.6949264221392488, + "grad_norm": 1.9741777181625366, + "learning_rate": 5e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7482693195343018, + "num_tokens": 163695939.0, + "step": 6328 + }, + { + "epoch": 0.6950362398418625, + "grad_norm": 1.9060097932815552, + "learning_rate": 5e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7057746052742004, + "num_tokens": 163719157.0, + "step": 6329 + }, + { + "epoch": 0.6951460575444761, + "grad_norm": 1.812424898147583, + "learning_rate": 5e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.691457986831665, + "num_tokens": 163749403.0, + "step": 6330 + }, + { + "epoch": 0.6952558752470899, + "grad_norm": 1.7979801893234253, + "learning_rate": 5e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7087572813034058, + "num_tokens": 163776286.0, + "step": 6331 + }, + { + "epoch": 0.6953656929497035, + "grad_norm": 1.6982245445251465, + "learning_rate": 5e-06, + "loss": 1.1254, + "mean_token_accuracy": 0.6672433614730835, + "num_tokens": 163808166.0, + "step": 6332 + }, + { + "epoch": 0.6954755106523172, + "grad_norm": 1.8438711166381836, + "learning_rate": 5e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7317660450935364, + "num_tokens": 163833363.0, + "step": 6333 + }, + { + "epoch": 0.6955853283549308, + "grad_norm": 1.9827815294265747, + "learning_rate": 5e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7390662431716919, + "num_tokens": 163854034.0, + "step": 6334 + }, + { + "epoch": 0.6956951460575445, + "grad_norm": 1.678187370300293, + "learning_rate": 5e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7110980749130249, + "num_tokens": 163882698.0, + "step": 6335 + }, + { + "epoch": 0.6958049637601581, + "grad_norm": 1.9724713563919067, + "learning_rate": 5e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6858155727386475, + "num_tokens": 163908638.0, + "step": 6336 + }, + { + "epoch": 0.6959147814627717, + "grad_norm": 1.744452714920044, + "learning_rate": 5e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7047299146652222, + "num_tokens": 163934930.0, + "step": 6337 + }, + { + "epoch": 0.6960245991653855, + "grad_norm": 1.835159182548523, + "learning_rate": 5e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6780532598495483, + "num_tokens": 163960879.0, + "step": 6338 + }, + { + "epoch": 0.6961344168679992, + "grad_norm": 1.766258955001831, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7294086813926697, + "num_tokens": 163987466.0, + "step": 6339 + }, + { + "epoch": 0.6962442345706128, + "grad_norm": 1.8841928243637085, + "learning_rate": 5e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7266674637794495, + "num_tokens": 164007974.0, + "step": 6340 + }, + { + "epoch": 0.6963540522732264, + "grad_norm": 1.8920519351959229, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7272742986679077, + "num_tokens": 164031293.0, + "step": 6341 + }, + { + "epoch": 0.6964638699758401, + "grad_norm": 1.7433409690856934, + "learning_rate": 5e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7074204683303833, + "num_tokens": 164058271.0, + "step": 6342 + }, + { + "epoch": 0.6965736876784537, + "grad_norm": 1.7153511047363281, + "learning_rate": 5e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6894437670707703, + "num_tokens": 164086797.0, + "step": 6343 + }, + { + "epoch": 0.6966835053810674, + "grad_norm": 1.9907522201538086, + "learning_rate": 5e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7062174677848816, + "num_tokens": 164111111.0, + "step": 6344 + }, + { + "epoch": 0.6967933230836811, + "grad_norm": 2.047008514404297, + "learning_rate": 5e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7002409100532532, + "num_tokens": 164135549.0, + "step": 6345 + }, + { + "epoch": 0.6969031407862948, + "grad_norm": 1.8857368230819702, + "learning_rate": 5e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6957303285598755, + "num_tokens": 164167054.0, + "step": 6346 + }, + { + "epoch": 0.6970129584889084, + "grad_norm": 1.8721644878387451, + "learning_rate": 5e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6992846727371216, + "num_tokens": 164193706.0, + "step": 6347 + }, + { + "epoch": 0.6971227761915221, + "grad_norm": 1.751512885093689, + "learning_rate": 5e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7148835062980652, + "num_tokens": 164218925.0, + "step": 6348 + }, + { + "epoch": 0.6972325938941357, + "grad_norm": 1.5812145471572876, + "learning_rate": 5e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7160274982452393, + "num_tokens": 164253269.0, + "step": 6349 + }, + { + "epoch": 0.6973424115967494, + "grad_norm": 1.9152140617370605, + "learning_rate": 5e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.691645622253418, + "num_tokens": 164277287.0, + "step": 6350 + }, + { + "epoch": 0.697452229299363, + "grad_norm": 1.969713568687439, + "learning_rate": 5e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7330003976821899, + "num_tokens": 164298346.0, + "step": 6351 + }, + { + "epoch": 0.6975620470019768, + "grad_norm": 1.8930015563964844, + "learning_rate": 5e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7133448123931885, + "num_tokens": 164321664.0, + "step": 6352 + }, + { + "epoch": 0.6976718647045904, + "grad_norm": 1.736156940460205, + "learning_rate": 5e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7167283296585083, + "num_tokens": 164351628.0, + "step": 6353 + }, + { + "epoch": 0.6977816824072041, + "grad_norm": 1.9013620615005493, + "learning_rate": 5e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7198596596717834, + "num_tokens": 164374636.0, + "step": 6354 + }, + { + "epoch": 0.6978915001098177, + "grad_norm": 1.6255197525024414, + "learning_rate": 5e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7273966670036316, + "num_tokens": 164404146.0, + "step": 6355 + }, + { + "epoch": 0.6980013178124314, + "grad_norm": 1.9266124963760376, + "learning_rate": 5e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6883727312088013, + "num_tokens": 164429904.0, + "step": 6356 + }, + { + "epoch": 0.698111135515045, + "grad_norm": 1.9447661638259888, + "learning_rate": 5e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6921291351318359, + "num_tokens": 164453202.0, + "step": 6357 + }, + { + "epoch": 0.6982209532176586, + "grad_norm": 1.9330471754074097, + "learning_rate": 5e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7206817865371704, + "num_tokens": 164477351.0, + "step": 6358 + }, + { + "epoch": 0.6983307709202724, + "grad_norm": 1.7713916301727295, + "learning_rate": 5e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6852454543113708, + "num_tokens": 164506306.0, + "step": 6359 + }, + { + "epoch": 0.698440588622886, + "grad_norm": 2.0179693698883057, + "learning_rate": 5e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7146060466766357, + "num_tokens": 164527271.0, + "step": 6360 + }, + { + "epoch": 0.6985504063254997, + "grad_norm": 1.7838643789291382, + "learning_rate": 5e-06, + "loss": 0.808, + "mean_token_accuracy": 0.742274284362793, + "num_tokens": 164550472.0, + "step": 6361 + }, + { + "epoch": 0.6986602240281133, + "grad_norm": 2.057347536087036, + "learning_rate": 5e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.721222460269928, + "num_tokens": 164572217.0, + "step": 6362 + }, + { + "epoch": 0.698770041730727, + "grad_norm": 1.8030171394348145, + "learning_rate": 5e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7171248197555542, + "num_tokens": 164599574.0, + "step": 6363 + }, + { + "epoch": 0.6988798594333406, + "grad_norm": 1.6926302909851074, + "learning_rate": 5e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6966433525085449, + "num_tokens": 164631278.0, + "step": 6364 + }, + { + "epoch": 0.6989896771359543, + "grad_norm": 1.7142525911331177, + "learning_rate": 5e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7048312425613403, + "num_tokens": 164659946.0, + "step": 6365 + }, + { + "epoch": 0.6990994948385679, + "grad_norm": 1.7650774717330933, + "learning_rate": 5e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7013484835624695, + "num_tokens": 164687769.0, + "step": 6366 + }, + { + "epoch": 0.6992093125411817, + "grad_norm": 1.8913512229919434, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6978994011878967, + "num_tokens": 164713225.0, + "step": 6367 + }, + { + "epoch": 0.6993191302437953, + "grad_norm": 1.844678521156311, + "learning_rate": 5e-06, + "loss": 1.1008, + "mean_token_accuracy": 0.6669102907180786, + "num_tokens": 164739198.0, + "step": 6368 + }, + { + "epoch": 0.699428947946409, + "grad_norm": 1.763559341430664, + "learning_rate": 5e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6957547664642334, + "num_tokens": 164766198.0, + "step": 6369 + }, + { + "epoch": 0.6995387656490226, + "grad_norm": 1.727571964263916, + "learning_rate": 5e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7072260975837708, + "num_tokens": 164792085.0, + "step": 6370 + }, + { + "epoch": 0.6996485833516363, + "grad_norm": 1.7410800457000732, + "learning_rate": 5e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.70088130235672, + "num_tokens": 164818025.0, + "step": 6371 + }, + { + "epoch": 0.6997584010542499, + "grad_norm": 1.8742103576660156, + "learning_rate": 5e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7101117968559265, + "num_tokens": 164840149.0, + "step": 6372 + }, + { + "epoch": 0.6998682187568636, + "grad_norm": 1.9628609418869019, + "learning_rate": 5e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7315154075622559, + "num_tokens": 164861984.0, + "step": 6373 + }, + { + "epoch": 0.6999780364594773, + "grad_norm": 2.006136655807495, + "learning_rate": 5e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7114828824996948, + "num_tokens": 164883504.0, + "step": 6374 + }, + { + "epoch": 0.700087854162091, + "grad_norm": 1.8720651865005493, + "learning_rate": 5e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.717176079750061, + "num_tokens": 164907950.0, + "step": 6375 + }, + { + "epoch": 0.7001976718647046, + "grad_norm": 2.1030080318450928, + "learning_rate": 5e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.700323224067688, + "num_tokens": 164927716.0, + "step": 6376 + }, + { + "epoch": 0.7003074895673183, + "grad_norm": 1.744093656539917, + "learning_rate": 5e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7055476307868958, + "num_tokens": 164957898.0, + "step": 6377 + }, + { + "epoch": 0.7004173072699319, + "grad_norm": 1.8312909603118896, + "learning_rate": 5e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7303310036659241, + "num_tokens": 164981511.0, + "step": 6378 + }, + { + "epoch": 0.7005271249725455, + "grad_norm": 1.8887615203857422, + "learning_rate": 5e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7104126811027527, + "num_tokens": 165007751.0, + "step": 6379 + }, + { + "epoch": 0.7006369426751592, + "grad_norm": 1.922957181930542, + "learning_rate": 5e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7156756520271301, + "num_tokens": 165030798.0, + "step": 6380 + }, + { + "epoch": 0.700746760377773, + "grad_norm": 1.856642723083496, + "learning_rate": 5e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7063962817192078, + "num_tokens": 165055195.0, + "step": 6381 + }, + { + "epoch": 0.7008565780803866, + "grad_norm": 1.8691939115524292, + "learning_rate": 5e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6947689652442932, + "num_tokens": 165083439.0, + "step": 6382 + }, + { + "epoch": 0.7009663957830002, + "grad_norm": 1.8442713022232056, + "learning_rate": 5e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7122443914413452, + "num_tokens": 165108154.0, + "step": 6383 + }, + { + "epoch": 0.7010762134856139, + "grad_norm": 1.9656107425689697, + "learning_rate": 5e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7141176462173462, + "num_tokens": 165130707.0, + "step": 6384 + }, + { + "epoch": 0.7011860311882275, + "grad_norm": 1.766222596168518, + "learning_rate": 5e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6921864748001099, + "num_tokens": 165159871.0, + "step": 6385 + }, + { + "epoch": 0.7012958488908412, + "grad_norm": 2.0141828060150146, + "learning_rate": 5e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7048807144165039, + "num_tokens": 165180728.0, + "step": 6386 + }, + { + "epoch": 0.7014056665934548, + "grad_norm": 1.9693279266357422, + "learning_rate": 5e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7103884220123291, + "num_tokens": 165203133.0, + "step": 6387 + }, + { + "epoch": 0.7015154842960686, + "grad_norm": 1.7408220767974854, + "learning_rate": 5e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.715148389339447, + "num_tokens": 165231067.0, + "step": 6388 + }, + { + "epoch": 0.7016253019986822, + "grad_norm": 1.7384092807769775, + "learning_rate": 5e-06, + "loss": 1.031, + "mean_token_accuracy": 0.685642659664154, + "num_tokens": 165260047.0, + "step": 6389 + }, + { + "epoch": 0.7017351197012959, + "grad_norm": 1.6860250234603882, + "learning_rate": 5e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.711103081703186, + "num_tokens": 165290052.0, + "step": 6390 + }, + { + "epoch": 0.7018449374039095, + "grad_norm": 1.8464548587799072, + "learning_rate": 5e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7046096920967102, + "num_tokens": 165314004.0, + "step": 6391 + }, + { + "epoch": 0.7019547551065232, + "grad_norm": 1.9992958307266235, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7336370944976807, + "num_tokens": 165334397.0, + "step": 6392 + }, + { + "epoch": 0.7020645728091368, + "grad_norm": 1.6412371397018433, + "learning_rate": 5e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7294734120368958, + "num_tokens": 165365192.0, + "step": 6393 + }, + { + "epoch": 0.7021743905117505, + "grad_norm": 2.107175350189209, + "learning_rate": 5e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7188149690628052, + "num_tokens": 165385462.0, + "step": 6394 + }, + { + "epoch": 0.7022842082143641, + "grad_norm": 1.766894817352295, + "learning_rate": 5e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.715534508228302, + "num_tokens": 165413834.0, + "step": 6395 + }, + { + "epoch": 0.7023940259169779, + "grad_norm": 1.7690565586090088, + "learning_rate": 5e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6781976222991943, + "num_tokens": 165441802.0, + "step": 6396 + }, + { + "epoch": 0.7025038436195915, + "grad_norm": 1.8431508541107178, + "learning_rate": 5e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6870772838592529, + "num_tokens": 165467541.0, + "step": 6397 + }, + { + "epoch": 0.7026136613222052, + "grad_norm": 1.9410617351531982, + "learning_rate": 5e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7179539799690247, + "num_tokens": 165490591.0, + "step": 6398 + }, + { + "epoch": 0.7027234790248188, + "grad_norm": 1.7502152919769287, + "learning_rate": 5e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6985360383987427, + "num_tokens": 165520610.0, + "step": 6399 + }, + { + "epoch": 0.7028332967274324, + "grad_norm": 1.8730998039245605, + "learning_rate": 5e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7124199867248535, + "num_tokens": 165544679.0, + "step": 6400 + }, + { + "epoch": 0.7029431144300461, + "grad_norm": 1.8074307441711426, + "learning_rate": 5e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7000420689582825, + "num_tokens": 165570078.0, + "step": 6401 + }, + { + "epoch": 0.7030529321326597, + "grad_norm": 1.6428722143173218, + "learning_rate": 5e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7086251378059387, + "num_tokens": 165600942.0, + "step": 6402 + }, + { + "epoch": 0.7031627498352735, + "grad_norm": 1.8389091491699219, + "learning_rate": 5e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7000170350074768, + "num_tokens": 165626520.0, + "step": 6403 + }, + { + "epoch": 0.7032725675378871, + "grad_norm": 1.7870721817016602, + "learning_rate": 5e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.6992437839508057, + "num_tokens": 165654671.0, + "step": 6404 + }, + { + "epoch": 0.7033823852405008, + "grad_norm": 1.7225648164749146, + "learning_rate": 5e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7015769481658936, + "num_tokens": 165683860.0, + "step": 6405 + }, + { + "epoch": 0.7034922029431144, + "grad_norm": 2.0173373222351074, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7201404571533203, + "num_tokens": 165704951.0, + "step": 6406 + }, + { + "epoch": 0.7036020206457281, + "grad_norm": 1.6842584609985352, + "learning_rate": 5e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7147716283798218, + "num_tokens": 165735663.0, + "step": 6407 + }, + { + "epoch": 0.7037118383483417, + "grad_norm": 1.7972443103790283, + "learning_rate": 5e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7249630093574524, + "num_tokens": 165760907.0, + "step": 6408 + }, + { + "epoch": 0.7038216560509554, + "grad_norm": 2.1219260692596436, + "learning_rate": 5e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7285733819007874, + "num_tokens": 165786721.0, + "step": 6409 + }, + { + "epoch": 0.7039314737535691, + "grad_norm": 1.722084403038025, + "learning_rate": 5e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6925057172775269, + "num_tokens": 165815431.0, + "step": 6410 + }, + { + "epoch": 0.7040412914561828, + "grad_norm": 1.7561043500900269, + "learning_rate": 5e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7079939842224121, + "num_tokens": 165844299.0, + "step": 6411 + }, + { + "epoch": 0.7041511091587964, + "grad_norm": 1.8895601034164429, + "learning_rate": 5e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7009146213531494, + "num_tokens": 165869023.0, + "step": 6412 + }, + { + "epoch": 0.7042609268614101, + "grad_norm": 1.8010984659194946, + "learning_rate": 5e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7050697207450867, + "num_tokens": 165892694.0, + "step": 6413 + }, + { + "epoch": 0.7043707445640237, + "grad_norm": 2.336779832839966, + "learning_rate": 5e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.753055214881897, + "num_tokens": 165915605.0, + "step": 6414 + }, + { + "epoch": 0.7044805622666374, + "grad_norm": 1.8651303052902222, + "learning_rate": 5e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7231264710426331, + "num_tokens": 165938877.0, + "step": 6415 + }, + { + "epoch": 0.704590379969251, + "grad_norm": 1.7967320680618286, + "learning_rate": 5e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6869016885757446, + "num_tokens": 165963969.0, + "step": 6416 + }, + { + "epoch": 0.7047001976718648, + "grad_norm": 2.0323052406311035, + "learning_rate": 5e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.696526050567627, + "num_tokens": 165985234.0, + "step": 6417 + }, + { + "epoch": 0.7048100153744784, + "grad_norm": 1.9142519235610962, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7328725457191467, + "num_tokens": 166006916.0, + "step": 6418 + }, + { + "epoch": 0.704919833077092, + "grad_norm": 1.724734902381897, + "learning_rate": 5e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7124496698379517, + "num_tokens": 166032811.0, + "step": 6419 + }, + { + "epoch": 0.7050296507797057, + "grad_norm": 1.8659567832946777, + "learning_rate": 5e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7118362188339233, + "num_tokens": 166059046.0, + "step": 6420 + }, + { + "epoch": 0.7051394684823193, + "grad_norm": 2.039926052093506, + "learning_rate": 5e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7120215892791748, + "num_tokens": 166079015.0, + "step": 6421 + }, + { + "epoch": 0.705249286184933, + "grad_norm": 2.0708160400390625, + "learning_rate": 5e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7159043550491333, + "num_tokens": 166099843.0, + "step": 6422 + }, + { + "epoch": 0.7053591038875466, + "grad_norm": 1.7607383728027344, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7204707860946655, + "num_tokens": 166126415.0, + "step": 6423 + }, + { + "epoch": 0.7054689215901603, + "grad_norm": 2.026437520980835, + "learning_rate": 5e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7026253342628479, + "num_tokens": 166148418.0, + "step": 6424 + }, + { + "epoch": 0.705578739292774, + "grad_norm": 1.8792357444763184, + "learning_rate": 5e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7112313508987427, + "num_tokens": 166172331.0, + "step": 6425 + }, + { + "epoch": 0.7056885569953877, + "grad_norm": 1.833757996559143, + "learning_rate": 5e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7056552171707153, + "num_tokens": 166199002.0, + "step": 6426 + }, + { + "epoch": 0.7057983746980013, + "grad_norm": 1.8527207374572754, + "learning_rate": 5e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6898502111434937, + "num_tokens": 166225437.0, + "step": 6427 + }, + { + "epoch": 0.705908192400615, + "grad_norm": 2.1075079441070557, + "learning_rate": 5e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7300243377685547, + "num_tokens": 166244721.0, + "step": 6428 + }, + { + "epoch": 0.7060180101032286, + "grad_norm": 1.839094638824463, + "learning_rate": 5e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7196528315544128, + "num_tokens": 166269400.0, + "step": 6429 + }, + { + "epoch": 0.7061278278058423, + "grad_norm": 1.7173328399658203, + "learning_rate": 5e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.6959056258201599, + "num_tokens": 166299106.0, + "step": 6430 + }, + { + "epoch": 0.7062376455084559, + "grad_norm": 1.9669339656829834, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7250618934631348, + "num_tokens": 166319502.0, + "step": 6431 + }, + { + "epoch": 0.7063474632110697, + "grad_norm": 1.950644612312317, + "learning_rate": 5e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6867853999137878, + "num_tokens": 166342479.0, + "step": 6432 + }, + { + "epoch": 0.7064572809136833, + "grad_norm": 1.7636479139328003, + "learning_rate": 5e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7069193720817566, + "num_tokens": 166370718.0, + "step": 6433 + }, + { + "epoch": 0.706567098616297, + "grad_norm": 1.8951014280319214, + "learning_rate": 5e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7113856673240662, + "num_tokens": 166394601.0, + "step": 6434 + }, + { + "epoch": 0.7066769163189106, + "grad_norm": 1.8747471570968628, + "learning_rate": 5e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7227010726928711, + "num_tokens": 166416727.0, + "step": 6435 + }, + { + "epoch": 0.7067867340215243, + "grad_norm": 1.8563321828842163, + "learning_rate": 5e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7002633213996887, + "num_tokens": 166439597.0, + "step": 6436 + }, + { + "epoch": 0.7068965517241379, + "grad_norm": 1.9429439306259155, + "learning_rate": 5e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7049099206924438, + "num_tokens": 166460864.0, + "step": 6437 + }, + { + "epoch": 0.7070063694267515, + "grad_norm": 1.850438117980957, + "learning_rate": 5e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7176671028137207, + "num_tokens": 166486595.0, + "step": 6438 + }, + { + "epoch": 0.7071161871293653, + "grad_norm": 1.8892217874526978, + "learning_rate": 5e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7051824331283569, + "num_tokens": 166510757.0, + "step": 6439 + }, + { + "epoch": 0.707226004831979, + "grad_norm": 1.8170512914657593, + "learning_rate": 5e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7091071009635925, + "num_tokens": 166535162.0, + "step": 6440 + }, + { + "epoch": 0.7073358225345926, + "grad_norm": 1.5782817602157593, + "learning_rate": 5e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6977657675743103, + "num_tokens": 166568270.0, + "step": 6441 + }, + { + "epoch": 0.7074456402372062, + "grad_norm": 1.8162763118743896, + "learning_rate": 5e-06, + "loss": 0.977, + "mean_token_accuracy": 0.6953729391098022, + "num_tokens": 166592070.0, + "step": 6442 + }, + { + "epoch": 0.7075554579398199, + "grad_norm": 1.7687525749206543, + "learning_rate": 5e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7009094953536987, + "num_tokens": 166619635.0, + "step": 6443 + }, + { + "epoch": 0.7076652756424335, + "grad_norm": 1.9742026329040527, + "learning_rate": 5e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.701836347579956, + "num_tokens": 166642097.0, + "step": 6444 + }, + { + "epoch": 0.7077750933450472, + "grad_norm": 1.8812438249588013, + "learning_rate": 5e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7209854125976562, + "num_tokens": 166666899.0, + "step": 6445 + }, + { + "epoch": 0.7078849110476609, + "grad_norm": 1.639748215675354, + "learning_rate": 5e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.721095860004425, + "num_tokens": 166694065.0, + "step": 6446 + }, + { + "epoch": 0.7079947287502746, + "grad_norm": 1.8045238256454468, + "learning_rate": 5e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6944550275802612, + "num_tokens": 166720859.0, + "step": 6447 + }, + { + "epoch": 0.7081045464528882, + "grad_norm": 1.928205132484436, + "learning_rate": 5e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6837639212608337, + "num_tokens": 166746524.0, + "step": 6448 + }, + { + "epoch": 0.7082143641555019, + "grad_norm": 1.877119779586792, + "learning_rate": 5e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6839981079101562, + "num_tokens": 166772203.0, + "step": 6449 + }, + { + "epoch": 0.7083241818581155, + "grad_norm": 1.8672285079956055, + "learning_rate": 5e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7012085914611816, + "num_tokens": 166795637.0, + "step": 6450 + }, + { + "epoch": 0.7084339995607292, + "grad_norm": 1.7736518383026123, + "learning_rate": 5e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7140758037567139, + "num_tokens": 166822660.0, + "step": 6451 + }, + { + "epoch": 0.7085438172633428, + "grad_norm": 1.8754199743270874, + "learning_rate": 5e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.6962403655052185, + "num_tokens": 166847678.0, + "step": 6452 + }, + { + "epoch": 0.7086536349659565, + "grad_norm": 1.7748210430145264, + "learning_rate": 5e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6900250911712646, + "num_tokens": 166876612.0, + "step": 6453 + }, + { + "epoch": 0.7087634526685702, + "grad_norm": 1.9116824865341187, + "learning_rate": 5e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7239356637001038, + "num_tokens": 166899890.0, + "step": 6454 + }, + { + "epoch": 0.7088732703711839, + "grad_norm": 1.6466063261032104, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6978417634963989, + "num_tokens": 166932136.0, + "step": 6455 + }, + { + "epoch": 0.7089830880737975, + "grad_norm": 1.668720006942749, + "learning_rate": 5e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7066948413848877, + "num_tokens": 166964185.0, + "step": 6456 + }, + { + "epoch": 0.7090929057764112, + "grad_norm": 1.8857460021972656, + "learning_rate": 5e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6969902515411377, + "num_tokens": 166988593.0, + "step": 6457 + }, + { + "epoch": 0.7092027234790248, + "grad_norm": 1.6852154731750488, + "learning_rate": 5e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.69805908203125, + "num_tokens": 167017286.0, + "step": 6458 + }, + { + "epoch": 0.7093125411816384, + "grad_norm": 2.0110831260681152, + "learning_rate": 5e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7055579423904419, + "num_tokens": 167039430.0, + "step": 6459 + }, + { + "epoch": 0.7094223588842521, + "grad_norm": 1.90166175365448, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7141005396842957, + "num_tokens": 167064246.0, + "step": 6460 + }, + { + "epoch": 0.7095321765868658, + "grad_norm": 1.6685220003128052, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.709913969039917, + "num_tokens": 167092318.0, + "step": 6461 + }, + { + "epoch": 0.7096419942894795, + "grad_norm": 1.8101658821105957, + "learning_rate": 5e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7024892568588257, + "num_tokens": 167117708.0, + "step": 6462 + }, + { + "epoch": 0.7097518119920931, + "grad_norm": 2.1876602172851562, + "learning_rate": 5e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.699617326259613, + "num_tokens": 167135853.0, + "step": 6463 + }, + { + "epoch": 0.7098616296947068, + "grad_norm": 1.6254708766937256, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7033244967460632, + "num_tokens": 167166144.0, + "step": 6464 + }, + { + "epoch": 0.7099714473973204, + "grad_norm": 1.8128870725631714, + "learning_rate": 5e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7177537083625793, + "num_tokens": 167191386.0, + "step": 6465 + }, + { + "epoch": 0.7100812650999341, + "grad_norm": 1.9347681999206543, + "learning_rate": 5e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7067594528198242, + "num_tokens": 167215232.0, + "step": 6466 + }, + { + "epoch": 0.7101910828025477, + "grad_norm": 2.0217525959014893, + "learning_rate": 5e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7086693644523621, + "num_tokens": 167239787.0, + "step": 6467 + }, + { + "epoch": 0.7103009005051615, + "grad_norm": 1.7796870470046997, + "learning_rate": 5e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7147753834724426, + "num_tokens": 167265922.0, + "step": 6468 + }, + { + "epoch": 0.7104107182077751, + "grad_norm": 1.7574238777160645, + "learning_rate": 5e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.697344183921814, + "num_tokens": 167293056.0, + "step": 6469 + }, + { + "epoch": 0.7105205359103888, + "grad_norm": 1.684446096420288, + "learning_rate": 5e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.697719931602478, + "num_tokens": 167321522.0, + "step": 6470 + }, + { + "epoch": 0.7106303536130024, + "grad_norm": 1.8795288801193237, + "learning_rate": 5e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.697837233543396, + "num_tokens": 167347303.0, + "step": 6471 + }, + { + "epoch": 0.7107401713156161, + "grad_norm": 2.1026418209075928, + "learning_rate": 5e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7064682841300964, + "num_tokens": 167368158.0, + "step": 6472 + }, + { + "epoch": 0.7108499890182297, + "grad_norm": 2.0495386123657227, + "learning_rate": 5e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7075477838516235, + "num_tokens": 167388568.0, + "step": 6473 + }, + { + "epoch": 0.7109598067208434, + "grad_norm": 1.7310327291488647, + "learning_rate": 5e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.711560845375061, + "num_tokens": 167414576.0, + "step": 6474 + }, + { + "epoch": 0.7110696244234571, + "grad_norm": 1.757492184638977, + "learning_rate": 5e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7223712205886841, + "num_tokens": 167441743.0, + "step": 6475 + }, + { + "epoch": 0.7111794421260708, + "grad_norm": 1.9424442052841187, + "learning_rate": 5e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7091755270957947, + "num_tokens": 167462710.0, + "step": 6476 + }, + { + "epoch": 0.7112892598286844, + "grad_norm": 2.1777734756469727, + "learning_rate": 5e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.721833348274231, + "num_tokens": 167482038.0, + "step": 6477 + }, + { + "epoch": 0.711399077531298, + "grad_norm": 1.7375351190567017, + "learning_rate": 5e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6959542036056519, + "num_tokens": 167511665.0, + "step": 6478 + }, + { + "epoch": 0.7115088952339117, + "grad_norm": 1.6081428527832031, + "learning_rate": 5e-06, + "loss": 1.0652, + "mean_token_accuracy": 0.6771429777145386, + "num_tokens": 167544965.0, + "step": 6479 + }, + { + "epoch": 0.7116187129365253, + "grad_norm": 1.6442930698394775, + "learning_rate": 5e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7082705497741699, + "num_tokens": 167576053.0, + "step": 6480 + }, + { + "epoch": 0.711728530639139, + "grad_norm": 1.7246617078781128, + "learning_rate": 5e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7205850481987, + "num_tokens": 167601982.0, + "step": 6481 + }, + { + "epoch": 0.7118383483417526, + "grad_norm": 1.8333408832550049, + "learning_rate": 5e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7062296867370605, + "num_tokens": 167628044.0, + "step": 6482 + }, + { + "epoch": 0.7119481660443664, + "grad_norm": 1.6843903064727783, + "learning_rate": 5e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7274607419967651, + "num_tokens": 167655536.0, + "step": 6483 + }, + { + "epoch": 0.71205798374698, + "grad_norm": 1.7398995161056519, + "learning_rate": 5e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7297952771186829, + "num_tokens": 167681704.0, + "step": 6484 + }, + { + "epoch": 0.7121678014495937, + "grad_norm": 1.7656943798065186, + "learning_rate": 5e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6992767453193665, + "num_tokens": 167708842.0, + "step": 6485 + }, + { + "epoch": 0.7122776191522073, + "grad_norm": 1.7202705144882202, + "learning_rate": 5e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7187728881835938, + "num_tokens": 167734718.0, + "step": 6486 + }, + { + "epoch": 0.712387436854821, + "grad_norm": 1.9221389293670654, + "learning_rate": 5e-06, + "loss": 1.1133, + "mean_token_accuracy": 0.6678314805030823, + "num_tokens": 167760626.0, + "step": 6487 + }, + { + "epoch": 0.7124972545574346, + "grad_norm": 1.9219794273376465, + "learning_rate": 5e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7179521322250366, + "num_tokens": 167785090.0, + "step": 6488 + }, + { + "epoch": 0.7126070722600483, + "grad_norm": 1.6928678750991821, + "learning_rate": 5e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7086571455001831, + "num_tokens": 167814365.0, + "step": 6489 + }, + { + "epoch": 0.712716889962662, + "grad_norm": 1.7048499584197998, + "learning_rate": 5e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6926038265228271, + "num_tokens": 167844816.0, + "step": 6490 + }, + { + "epoch": 0.7128267076652757, + "grad_norm": 1.5619251728057861, + "learning_rate": 5e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7037521600723267, + "num_tokens": 167877491.0, + "step": 6491 + }, + { + "epoch": 0.7129365253678893, + "grad_norm": 1.9628574848175049, + "learning_rate": 5e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7313039302825928, + "num_tokens": 167898155.0, + "step": 6492 + }, + { + "epoch": 0.713046343070503, + "grad_norm": 1.9015969038009644, + "learning_rate": 5e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6979767680168152, + "num_tokens": 167921446.0, + "step": 6493 + }, + { + "epoch": 0.7131561607731166, + "grad_norm": 1.7666270732879639, + "learning_rate": 5e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7198971509933472, + "num_tokens": 167948614.0, + "step": 6494 + }, + { + "epoch": 0.7132659784757303, + "grad_norm": 1.7695400714874268, + "learning_rate": 5e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7031692862510681, + "num_tokens": 167974799.0, + "step": 6495 + }, + { + "epoch": 0.7133757961783439, + "grad_norm": 1.6116247177124023, + "learning_rate": 5e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6889951825141907, + "num_tokens": 168005716.0, + "step": 6496 + }, + { + "epoch": 0.7134856138809577, + "grad_norm": 1.762771725654602, + "learning_rate": 5e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7226771116256714, + "num_tokens": 168030725.0, + "step": 6497 + }, + { + "epoch": 0.7135954315835713, + "grad_norm": 1.801546335220337, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7279518246650696, + "num_tokens": 168054311.0, + "step": 6498 + }, + { + "epoch": 0.713705249286185, + "grad_norm": 1.9370863437652588, + "learning_rate": 5e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7487055063247681, + "num_tokens": 168073499.0, + "step": 6499 + }, + { + "epoch": 0.7138150669887986, + "grad_norm": 1.7980941534042358, + "learning_rate": 5e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7020034790039062, + "num_tokens": 168098880.0, + "step": 6500 + }, + { + "epoch": 0.7139248846914122, + "grad_norm": 1.683789849281311, + "learning_rate": 5e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6884293556213379, + "num_tokens": 168127722.0, + "step": 6501 + }, + { + "epoch": 0.7140347023940259, + "grad_norm": 1.7034218311309814, + "learning_rate": 5e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.70681232213974, + "num_tokens": 168158276.0, + "step": 6502 + }, + { + "epoch": 0.7141445200966395, + "grad_norm": 1.7111183404922485, + "learning_rate": 5e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7295461297035217, + "num_tokens": 168187260.0, + "step": 6503 + }, + { + "epoch": 0.7142543377992533, + "grad_norm": 1.7101272344589233, + "learning_rate": 5e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6876600980758667, + "num_tokens": 168214993.0, + "step": 6504 + }, + { + "epoch": 0.7143641555018669, + "grad_norm": 1.8707610368728638, + "learning_rate": 5e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6957594156265259, + "num_tokens": 168241728.0, + "step": 6505 + }, + { + "epoch": 0.7144739732044806, + "grad_norm": 1.5299209356307983, + "learning_rate": 5e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7277712821960449, + "num_tokens": 168274580.0, + "step": 6506 + }, + { + "epoch": 0.7145837909070942, + "grad_norm": 1.800378441810608, + "learning_rate": 5e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7046028971672058, + "num_tokens": 168300789.0, + "step": 6507 + }, + { + "epoch": 0.7146936086097079, + "grad_norm": 1.8498177528381348, + "learning_rate": 5e-06, + "loss": 0.892, + "mean_token_accuracy": 0.720323920249939, + "num_tokens": 168326192.0, + "step": 6508 + }, + { + "epoch": 0.7148034263123215, + "grad_norm": 1.8526828289031982, + "learning_rate": 5e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7076233625411987, + "num_tokens": 168349990.0, + "step": 6509 + }, + { + "epoch": 0.7149132440149352, + "grad_norm": 1.784481406211853, + "learning_rate": 5e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7398431301116943, + "num_tokens": 168373573.0, + "step": 6510 + }, + { + "epoch": 0.7150230617175488, + "grad_norm": 2.088655710220337, + "learning_rate": 5e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7168723344802856, + "num_tokens": 168394671.0, + "step": 6511 + }, + { + "epoch": 0.7151328794201626, + "grad_norm": 1.592041015625, + "learning_rate": 5e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7017229795455933, + "num_tokens": 168425061.0, + "step": 6512 + }, + { + "epoch": 0.7152426971227762, + "grad_norm": 2.0776190757751465, + "learning_rate": 5e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7205654382705688, + "num_tokens": 168444278.0, + "step": 6513 + }, + { + "epoch": 0.7153525148253899, + "grad_norm": 1.7914836406707764, + "learning_rate": 5e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7048466801643372, + "num_tokens": 168470889.0, + "step": 6514 + }, + { + "epoch": 0.7154623325280035, + "grad_norm": 1.6377314329147339, + "learning_rate": 5e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7015461921691895, + "num_tokens": 168501751.0, + "step": 6515 + }, + { + "epoch": 0.7155721502306172, + "grad_norm": 2.0129880905151367, + "learning_rate": 5e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7171088457107544, + "num_tokens": 168523061.0, + "step": 6516 + }, + { + "epoch": 0.7156819679332308, + "grad_norm": 1.9028866291046143, + "learning_rate": 5e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6825651526451111, + "num_tokens": 168549408.0, + "step": 6517 + }, + { + "epoch": 0.7157917856358444, + "grad_norm": 1.8537534475326538, + "learning_rate": 5e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.6997758150100708, + "num_tokens": 168573343.0, + "step": 6518 + }, + { + "epoch": 0.7159016033384582, + "grad_norm": 1.9037303924560547, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7128478288650513, + "num_tokens": 168595598.0, + "step": 6519 + }, + { + "epoch": 0.7160114210410718, + "grad_norm": 2.062602996826172, + "learning_rate": 5e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7114062905311584, + "num_tokens": 168616149.0, + "step": 6520 + }, + { + "epoch": 0.7161212387436855, + "grad_norm": 1.8465410470962524, + "learning_rate": 5e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7026987671852112, + "num_tokens": 168637958.0, + "step": 6521 + }, + { + "epoch": 0.7162310564462991, + "grad_norm": 1.7325106859207153, + "learning_rate": 5e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.706467866897583, + "num_tokens": 168666745.0, + "step": 6522 + }, + { + "epoch": 0.7163408741489128, + "grad_norm": 1.8908840417861938, + "learning_rate": 5e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7078403234481812, + "num_tokens": 168691517.0, + "step": 6523 + }, + { + "epoch": 0.7164506918515264, + "grad_norm": 1.6621432304382324, + "learning_rate": 5e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6940509080886841, + "num_tokens": 168723866.0, + "step": 6524 + }, + { + "epoch": 0.7165605095541401, + "grad_norm": 1.7292753458023071, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7190517783164978, + "num_tokens": 168750375.0, + "step": 6525 + }, + { + "epoch": 0.7166703272567538, + "grad_norm": 1.803714394569397, + "learning_rate": 5e-06, + "loss": 0.974, + "mean_token_accuracy": 0.6947190761566162, + "num_tokens": 168778317.0, + "step": 6526 + }, + { + "epoch": 0.7167801449593675, + "grad_norm": 1.8997493982315063, + "learning_rate": 5e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.6965041160583496, + "num_tokens": 168803324.0, + "step": 6527 + }, + { + "epoch": 0.7168899626619811, + "grad_norm": 1.7853111028671265, + "learning_rate": 5e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6951197385787964, + "num_tokens": 168832540.0, + "step": 6528 + }, + { + "epoch": 0.7169997803645948, + "grad_norm": 2.188720941543579, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.728369951248169, + "num_tokens": 168850511.0, + "step": 6529 + }, + { + "epoch": 0.7171095980672084, + "grad_norm": 1.9501769542694092, + "learning_rate": 5e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6976883411407471, + "num_tokens": 168873802.0, + "step": 6530 + }, + { + "epoch": 0.7172194157698221, + "grad_norm": 2.0821611881256104, + "learning_rate": 5e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7101476192474365, + "num_tokens": 168893788.0, + "step": 6531 + }, + { + "epoch": 0.7173292334724357, + "grad_norm": 1.954992413520813, + "learning_rate": 5e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7257506847381592, + "num_tokens": 168919106.0, + "step": 6532 + }, + { + "epoch": 0.7174390511750495, + "grad_norm": 1.740493893623352, + "learning_rate": 5e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7118096351623535, + "num_tokens": 168945294.0, + "step": 6533 + }, + { + "epoch": 0.7175488688776631, + "grad_norm": 1.7342315912246704, + "learning_rate": 5e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7133259773254395, + "num_tokens": 168970926.0, + "step": 6534 + }, + { + "epoch": 0.7176586865802768, + "grad_norm": 1.887686848640442, + "learning_rate": 5e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6981452703475952, + "num_tokens": 168995765.0, + "step": 6535 + }, + { + "epoch": 0.7177685042828904, + "grad_norm": 1.8613145351409912, + "learning_rate": 5e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7144415378570557, + "num_tokens": 169023315.0, + "step": 6536 + }, + { + "epoch": 0.717878321985504, + "grad_norm": 1.8088589906692505, + "learning_rate": 5e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7093169093132019, + "num_tokens": 169048849.0, + "step": 6537 + }, + { + "epoch": 0.7179881396881177, + "grad_norm": 2.0442397594451904, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7004497051239014, + "num_tokens": 169073277.0, + "step": 6538 + }, + { + "epoch": 0.7180979573907313, + "grad_norm": 1.6964777708053589, + "learning_rate": 5e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6909939646720886, + "num_tokens": 169102205.0, + "step": 6539 + }, + { + "epoch": 0.7182077750933451, + "grad_norm": 1.8164093494415283, + "learning_rate": 5e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7175618410110474, + "num_tokens": 169130820.0, + "step": 6540 + }, + { + "epoch": 0.7183175927959587, + "grad_norm": 1.7191473245620728, + "learning_rate": 5e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7243582606315613, + "num_tokens": 169159340.0, + "step": 6541 + }, + { + "epoch": 0.7184274104985724, + "grad_norm": 1.8471647500991821, + "learning_rate": 5e-06, + "loss": 1.08, + "mean_token_accuracy": 0.673329770565033, + "num_tokens": 169185385.0, + "step": 6542 + }, + { + "epoch": 0.718537228201186, + "grad_norm": 1.762943983078003, + "learning_rate": 5e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6939451694488525, + "num_tokens": 169213552.0, + "step": 6543 + }, + { + "epoch": 0.7186470459037997, + "grad_norm": 1.5940293073654175, + "learning_rate": 5e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7398542165756226, + "num_tokens": 169243355.0, + "step": 6544 + }, + { + "epoch": 0.7187568636064133, + "grad_norm": 1.8477072715759277, + "learning_rate": 5e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7077548503875732, + "num_tokens": 169266986.0, + "step": 6545 + }, + { + "epoch": 0.718866681309027, + "grad_norm": 1.5148873329162598, + "learning_rate": 5e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6869096159934998, + "num_tokens": 169303130.0, + "step": 6546 + }, + { + "epoch": 0.7189764990116406, + "grad_norm": 1.8175044059753418, + "learning_rate": 5e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7191486358642578, + "num_tokens": 169326807.0, + "step": 6547 + }, + { + "epoch": 0.7190863167142544, + "grad_norm": 1.8861243724822998, + "learning_rate": 5e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6906704902648926, + "num_tokens": 169353563.0, + "step": 6548 + }, + { + "epoch": 0.719196134416868, + "grad_norm": 1.5892043113708496, + "learning_rate": 5e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6797034740447998, + "num_tokens": 169387973.0, + "step": 6549 + }, + { + "epoch": 0.7193059521194817, + "grad_norm": 1.4458032846450806, + "learning_rate": 5e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6808040738105774, + "num_tokens": 169426862.0, + "step": 6550 + }, + { + "epoch": 0.7194157698220953, + "grad_norm": 1.6940487623214722, + "learning_rate": 5e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6916107535362244, + "num_tokens": 169454728.0, + "step": 6551 + }, + { + "epoch": 0.719525587524709, + "grad_norm": 1.7129837274551392, + "learning_rate": 5e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7084262371063232, + "num_tokens": 169480740.0, + "step": 6552 + }, + { + "epoch": 0.7196354052273226, + "grad_norm": 2.020799398422241, + "learning_rate": 5e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7149430513381958, + "num_tokens": 169501393.0, + "step": 6553 + }, + { + "epoch": 0.7197452229299363, + "grad_norm": 1.929805040359497, + "learning_rate": 5e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7218890190124512, + "num_tokens": 169524563.0, + "step": 6554 + }, + { + "epoch": 0.71985504063255, + "grad_norm": 1.9190789461135864, + "learning_rate": 5e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7116186618804932, + "num_tokens": 169547476.0, + "step": 6555 + }, + { + "epoch": 0.7199648583351637, + "grad_norm": 2.0830495357513428, + "learning_rate": 5e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7143083810806274, + "num_tokens": 169567264.0, + "step": 6556 + }, + { + "epoch": 0.7200746760377773, + "grad_norm": 1.9489346742630005, + "learning_rate": 5e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7112125754356384, + "num_tokens": 169589507.0, + "step": 6557 + }, + { + "epoch": 0.720184493740391, + "grad_norm": 1.6746457815170288, + "learning_rate": 5e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7093106508255005, + "num_tokens": 169620316.0, + "step": 6558 + }, + { + "epoch": 0.7202943114430046, + "grad_norm": 1.6864893436431885, + "learning_rate": 5e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6993602514266968, + "num_tokens": 169652425.0, + "step": 6559 + }, + { + "epoch": 0.7204041291456182, + "grad_norm": 1.7867122888565063, + "learning_rate": 5e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6946663856506348, + "num_tokens": 169678758.0, + "step": 6560 + }, + { + "epoch": 0.7205139468482319, + "grad_norm": 1.5620975494384766, + "learning_rate": 5e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7202504873275757, + "num_tokens": 169710922.0, + "step": 6561 + }, + { + "epoch": 0.7206237645508456, + "grad_norm": 1.7143689393997192, + "learning_rate": 5e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.709425151348114, + "num_tokens": 169737996.0, + "step": 6562 + }, + { + "epoch": 0.7207335822534593, + "grad_norm": 1.8359007835388184, + "learning_rate": 5e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.713469386100769, + "num_tokens": 169762455.0, + "step": 6563 + }, + { + "epoch": 0.7208433999560729, + "grad_norm": 2.0025594234466553, + "learning_rate": 5e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7044848799705505, + "num_tokens": 169784845.0, + "step": 6564 + }, + { + "epoch": 0.7209532176586866, + "grad_norm": 1.7104065418243408, + "learning_rate": 5e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7117698192596436, + "num_tokens": 169812459.0, + "step": 6565 + }, + { + "epoch": 0.7210630353613002, + "grad_norm": 1.7163264751434326, + "learning_rate": 5e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7238645553588867, + "num_tokens": 169836867.0, + "step": 6566 + }, + { + "epoch": 0.7211728530639139, + "grad_norm": 1.7157090902328491, + "learning_rate": 5e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.677639365196228, + "num_tokens": 169869249.0, + "step": 6567 + }, + { + "epoch": 0.7212826707665275, + "grad_norm": 1.6856077909469604, + "learning_rate": 5e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7184411287307739, + "num_tokens": 169896864.0, + "step": 6568 + }, + { + "epoch": 0.7213924884691413, + "grad_norm": 1.7539684772491455, + "learning_rate": 5e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.709952712059021, + "num_tokens": 169924939.0, + "step": 6569 + }, + { + "epoch": 0.7215023061717549, + "grad_norm": 1.7561964988708496, + "learning_rate": 5e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7160252332687378, + "num_tokens": 169949683.0, + "step": 6570 + }, + { + "epoch": 0.7216121238743686, + "grad_norm": 1.9549615383148193, + "learning_rate": 5e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.68461012840271, + "num_tokens": 169973764.0, + "step": 6571 + }, + { + "epoch": 0.7217219415769822, + "grad_norm": 1.7706451416015625, + "learning_rate": 5e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7060244083404541, + "num_tokens": 170002787.0, + "step": 6572 + }, + { + "epoch": 0.7218317592795959, + "grad_norm": 1.820465326309204, + "learning_rate": 5e-06, + "loss": 1.066, + "mean_token_accuracy": 0.6805222034454346, + "num_tokens": 170028777.0, + "step": 6573 + }, + { + "epoch": 0.7219415769822095, + "grad_norm": 1.8431137800216675, + "learning_rate": 5e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7173879146575928, + "num_tokens": 170052163.0, + "step": 6574 + }, + { + "epoch": 0.7220513946848232, + "grad_norm": 1.8388277292251587, + "learning_rate": 5e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6917900443077087, + "num_tokens": 170077299.0, + "step": 6575 + }, + { + "epoch": 0.7221612123874368, + "grad_norm": 1.723304271697998, + "learning_rate": 5e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6902949213981628, + "num_tokens": 170107843.0, + "step": 6576 + }, + { + "epoch": 0.7222710300900506, + "grad_norm": 1.7455663681030273, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7007772922515869, + "num_tokens": 170133604.0, + "step": 6577 + }, + { + "epoch": 0.7223808477926642, + "grad_norm": 1.6138429641723633, + "learning_rate": 5e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7015875577926636, + "num_tokens": 170168074.0, + "step": 6578 + }, + { + "epoch": 0.7224906654952779, + "grad_norm": 1.727893352508545, + "learning_rate": 5e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6967616081237793, + "num_tokens": 170197319.0, + "step": 6579 + }, + { + "epoch": 0.7226004831978915, + "grad_norm": 1.8547720909118652, + "learning_rate": 5e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7136192917823792, + "num_tokens": 170221666.0, + "step": 6580 + }, + { + "epoch": 0.7227103009005051, + "grad_norm": 1.865860939025879, + "learning_rate": 5e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6937518119812012, + "num_tokens": 170248025.0, + "step": 6581 + }, + { + "epoch": 0.7228201186031188, + "grad_norm": 2.0615177154541016, + "learning_rate": 5e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.699033260345459, + "num_tokens": 170268399.0, + "step": 6582 + }, + { + "epoch": 0.7229299363057324, + "grad_norm": 2.026979923248291, + "learning_rate": 5e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7187445163726807, + "num_tokens": 170289521.0, + "step": 6583 + }, + { + "epoch": 0.7230397540083462, + "grad_norm": 2.03794002532959, + "learning_rate": 5e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7409385442733765, + "num_tokens": 170308477.0, + "step": 6584 + }, + { + "epoch": 0.7231495717109598, + "grad_norm": 1.732559323310852, + "learning_rate": 5e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.703360915184021, + "num_tokens": 170335780.0, + "step": 6585 + }, + { + "epoch": 0.7232593894135735, + "grad_norm": 1.9374481439590454, + "learning_rate": 5e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6930879950523376, + "num_tokens": 170359001.0, + "step": 6586 + }, + { + "epoch": 0.7233692071161871, + "grad_norm": 1.797104835510254, + "learning_rate": 5e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7116179466247559, + "num_tokens": 170386115.0, + "step": 6587 + }, + { + "epoch": 0.7234790248188008, + "grad_norm": 1.8881210088729858, + "learning_rate": 5e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7149877548217773, + "num_tokens": 170411386.0, + "step": 6588 + }, + { + "epoch": 0.7235888425214144, + "grad_norm": 1.8975416421890259, + "learning_rate": 5e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7121471166610718, + "num_tokens": 170436978.0, + "step": 6589 + }, + { + "epoch": 0.7236986602240281, + "grad_norm": 1.786537766456604, + "learning_rate": 5e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7075406312942505, + "num_tokens": 170463239.0, + "step": 6590 + }, + { + "epoch": 0.7238084779266418, + "grad_norm": 1.8928905725479126, + "learning_rate": 5e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.6951795816421509, + "num_tokens": 170485669.0, + "step": 6591 + }, + { + "epoch": 0.7239182956292555, + "grad_norm": 1.7282122373580933, + "learning_rate": 5e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6907305717468262, + "num_tokens": 170512414.0, + "step": 6592 + }, + { + "epoch": 0.7240281133318691, + "grad_norm": 1.7281899452209473, + "learning_rate": 5e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7220483422279358, + "num_tokens": 170540009.0, + "step": 6593 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 1.6492793560028076, + "learning_rate": 5e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6870744228363037, + "num_tokens": 170569938.0, + "step": 6594 + }, + { + "epoch": 0.7242477487370964, + "grad_norm": 1.7192277908325195, + "learning_rate": 5e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6768786907196045, + "num_tokens": 170599585.0, + "step": 6595 + }, + { + "epoch": 0.72435756643971, + "grad_norm": 1.803297996520996, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6995149254798889, + "num_tokens": 170625048.0, + "step": 6596 + }, + { + "epoch": 0.7244673841423237, + "grad_norm": 2.0084152221679688, + "learning_rate": 5e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.712296724319458, + "num_tokens": 170649456.0, + "step": 6597 + }, + { + "epoch": 0.7245772018449375, + "grad_norm": 1.8011136054992676, + "learning_rate": 5e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7154495716094971, + "num_tokens": 170672107.0, + "step": 6598 + }, + { + "epoch": 0.7246870195475511, + "grad_norm": 1.7622935771942139, + "learning_rate": 5e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.715675413608551, + "num_tokens": 170698544.0, + "step": 6599 + }, + { + "epoch": 0.7247968372501647, + "grad_norm": 1.8679457902908325, + "learning_rate": 5e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6866848468780518, + "num_tokens": 170725367.0, + "step": 6600 + }, + { + "epoch": 0.7249066549527784, + "grad_norm": 1.936423659324646, + "learning_rate": 5e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.69399094581604, + "num_tokens": 170751114.0, + "step": 6601 + }, + { + "epoch": 0.725016472655392, + "grad_norm": 1.8310420513153076, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7215143442153931, + "num_tokens": 170775748.0, + "step": 6602 + }, + { + "epoch": 0.7251262903580057, + "grad_norm": 1.7457820177078247, + "learning_rate": 5e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7087515592575073, + "num_tokens": 170803931.0, + "step": 6603 + }, + { + "epoch": 0.7252361080606193, + "grad_norm": 1.8593732118606567, + "learning_rate": 5e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7038849592208862, + "num_tokens": 170827221.0, + "step": 6604 + }, + { + "epoch": 0.725345925763233, + "grad_norm": 1.8938744068145752, + "learning_rate": 5e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6994205713272095, + "num_tokens": 170853311.0, + "step": 6605 + }, + { + "epoch": 0.7254557434658467, + "grad_norm": 1.8521606922149658, + "learning_rate": 5e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7022267580032349, + "num_tokens": 170878503.0, + "step": 6606 + }, + { + "epoch": 0.7255655611684604, + "grad_norm": 1.832329273223877, + "learning_rate": 5e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7204990386962891, + "num_tokens": 170902462.0, + "step": 6607 + }, + { + "epoch": 0.725675378871074, + "grad_norm": 1.86726713180542, + "learning_rate": 5e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7016636729240417, + "num_tokens": 170927891.0, + "step": 6608 + }, + { + "epoch": 0.7257851965736877, + "grad_norm": 1.9364771842956543, + "learning_rate": 5e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7202330827713013, + "num_tokens": 170948922.0, + "step": 6609 + }, + { + "epoch": 0.7258950142763013, + "grad_norm": 1.7221089601516724, + "learning_rate": 5e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7003374695777893, + "num_tokens": 170975555.0, + "step": 6610 + }, + { + "epoch": 0.726004831978915, + "grad_norm": 1.5075980424880981, + "learning_rate": 5e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7145365476608276, + "num_tokens": 171009176.0, + "step": 6611 + }, + { + "epoch": 0.7261146496815286, + "grad_norm": 1.7517889738082886, + "learning_rate": 5e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6840922832489014, + "num_tokens": 171039070.0, + "step": 6612 + }, + { + "epoch": 0.7262244673841424, + "grad_norm": 1.8897987604141235, + "learning_rate": 5e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7065877318382263, + "num_tokens": 171063104.0, + "step": 6613 + }, + { + "epoch": 0.726334285086756, + "grad_norm": 1.8534126281738281, + "learning_rate": 5e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7040594816207886, + "num_tokens": 171086791.0, + "step": 6614 + }, + { + "epoch": 0.7264441027893697, + "grad_norm": 1.9744493961334229, + "learning_rate": 5e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7078003883361816, + "num_tokens": 171108925.0, + "step": 6615 + }, + { + "epoch": 0.7265539204919833, + "grad_norm": 1.8171229362487793, + "learning_rate": 5e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6939083933830261, + "num_tokens": 171135915.0, + "step": 6616 + }, + { + "epoch": 0.726663738194597, + "grad_norm": 1.737046480178833, + "learning_rate": 5e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6897718906402588, + "num_tokens": 171165149.0, + "step": 6617 + }, + { + "epoch": 0.7267735558972106, + "grad_norm": 1.7319931983947754, + "learning_rate": 5e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6988186836242676, + "num_tokens": 171193221.0, + "step": 6618 + }, + { + "epoch": 0.7268833735998242, + "grad_norm": 1.9145474433898926, + "learning_rate": 5e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6893267035484314, + "num_tokens": 171217364.0, + "step": 6619 + }, + { + "epoch": 0.726993191302438, + "grad_norm": 1.7965840101242065, + "learning_rate": 5e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7090349197387695, + "num_tokens": 171241915.0, + "step": 6620 + }, + { + "epoch": 0.7271030090050516, + "grad_norm": 1.8983428478240967, + "learning_rate": 5e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7062357664108276, + "num_tokens": 171267856.0, + "step": 6621 + }, + { + "epoch": 0.7272128267076653, + "grad_norm": 1.735719084739685, + "learning_rate": 5e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6951491832733154, + "num_tokens": 171298203.0, + "step": 6622 + }, + { + "epoch": 0.7273226444102789, + "grad_norm": 1.9196974039077759, + "learning_rate": 5e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.677321195602417, + "num_tokens": 171324391.0, + "step": 6623 + }, + { + "epoch": 0.7274324621128926, + "grad_norm": 1.9496818780899048, + "learning_rate": 5e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6971435546875, + "num_tokens": 171347602.0, + "step": 6624 + }, + { + "epoch": 0.7275422798155062, + "grad_norm": 1.6246507167816162, + "learning_rate": 5e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7204930186271667, + "num_tokens": 171376769.0, + "step": 6625 + }, + { + "epoch": 0.7276520975181199, + "grad_norm": 1.8113118410110474, + "learning_rate": 5e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6965301036834717, + "num_tokens": 171401217.0, + "step": 6626 + }, + { + "epoch": 0.7277619152207336, + "grad_norm": 1.7787449359893799, + "learning_rate": 5e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.687407374382019, + "num_tokens": 171430140.0, + "step": 6627 + }, + { + "epoch": 0.7278717329233473, + "grad_norm": 1.4299458265304565, + "learning_rate": 5e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.702434778213501, + "num_tokens": 171465010.0, + "step": 6628 + }, + { + "epoch": 0.7279815506259609, + "grad_norm": 1.7705507278442383, + "learning_rate": 5e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.709141731262207, + "num_tokens": 171491520.0, + "step": 6629 + }, + { + "epoch": 0.7280913683285746, + "grad_norm": 1.8958115577697754, + "learning_rate": 5e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7074258923530579, + "num_tokens": 171515567.0, + "step": 6630 + }, + { + "epoch": 0.7282011860311882, + "grad_norm": 1.7422807216644287, + "learning_rate": 5e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.740570068359375, + "num_tokens": 171540746.0, + "step": 6631 + }, + { + "epoch": 0.7283110037338019, + "grad_norm": 1.9477595090866089, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7240616083145142, + "num_tokens": 171560867.0, + "step": 6632 + }, + { + "epoch": 0.7284208214364155, + "grad_norm": 1.9426307678222656, + "learning_rate": 5e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7107322216033936, + "num_tokens": 171582394.0, + "step": 6633 + }, + { + "epoch": 0.7285306391390292, + "grad_norm": 1.7704763412475586, + "learning_rate": 5e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7302809357643127, + "num_tokens": 171608961.0, + "step": 6634 + }, + { + "epoch": 0.7286404568416429, + "grad_norm": 1.7601920366287231, + "learning_rate": 5e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7230581045150757, + "num_tokens": 171633270.0, + "step": 6635 + }, + { + "epoch": 0.7287502745442566, + "grad_norm": 1.7858388423919678, + "learning_rate": 5e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7188113927841187, + "num_tokens": 171656868.0, + "step": 6636 + }, + { + "epoch": 0.7288600922468702, + "grad_norm": 1.6780983209609985, + "learning_rate": 5e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7025872468948364, + "num_tokens": 171684206.0, + "step": 6637 + }, + { + "epoch": 0.7289699099494839, + "grad_norm": 1.8500559329986572, + "learning_rate": 5e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7114666700363159, + "num_tokens": 171708596.0, + "step": 6638 + }, + { + "epoch": 0.7290797276520975, + "grad_norm": 1.7979248762130737, + "learning_rate": 5e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7006580829620361, + "num_tokens": 171736395.0, + "step": 6639 + }, + { + "epoch": 0.7291895453547111, + "grad_norm": 1.7264760732650757, + "learning_rate": 5e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7014119625091553, + "num_tokens": 171764325.0, + "step": 6640 + }, + { + "epoch": 0.7292993630573248, + "grad_norm": 1.6700513362884521, + "learning_rate": 5e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6938992142677307, + "num_tokens": 171792734.0, + "step": 6641 + }, + { + "epoch": 0.7294091807599385, + "grad_norm": 1.8869057893753052, + "learning_rate": 5e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6902589797973633, + "num_tokens": 171818116.0, + "step": 6642 + }, + { + "epoch": 0.7295189984625522, + "grad_norm": 1.7197644710540771, + "learning_rate": 5e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7084663510322571, + "num_tokens": 171843098.0, + "step": 6643 + }, + { + "epoch": 0.7296288161651658, + "grad_norm": 1.8506884574890137, + "learning_rate": 5e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7078243494033813, + "num_tokens": 171866639.0, + "step": 6644 + }, + { + "epoch": 0.7297386338677795, + "grad_norm": 1.942895770072937, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7331827878952026, + "num_tokens": 171887786.0, + "step": 6645 + }, + { + "epoch": 0.7298484515703931, + "grad_norm": 2.0136353969573975, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7163670659065247, + "num_tokens": 171908232.0, + "step": 6646 + }, + { + "epoch": 0.7299582692730068, + "grad_norm": 2.064401388168335, + "learning_rate": 5e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.73805171251297, + "num_tokens": 171927017.0, + "step": 6647 + }, + { + "epoch": 0.7300680869756204, + "grad_norm": 1.6920146942138672, + "learning_rate": 5e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6877934336662292, + "num_tokens": 171956218.0, + "step": 6648 + }, + { + "epoch": 0.7301779046782342, + "grad_norm": 1.924994707107544, + "learning_rate": 5e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7013732194900513, + "num_tokens": 171978380.0, + "step": 6649 + }, + { + "epoch": 0.7302877223808478, + "grad_norm": 1.7751818895339966, + "learning_rate": 5e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7058188915252686, + "num_tokens": 172004741.0, + "step": 6650 + }, + { + "epoch": 0.7303975400834615, + "grad_norm": 1.8056185245513916, + "learning_rate": 5e-06, + "loss": 1.029, + "mean_token_accuracy": 0.696792721748352, + "num_tokens": 172032261.0, + "step": 6651 + }, + { + "epoch": 0.7305073577860751, + "grad_norm": 1.9391350746154785, + "learning_rate": 5e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7159353494644165, + "num_tokens": 172055034.0, + "step": 6652 + }, + { + "epoch": 0.7306171754886888, + "grad_norm": 1.7850579023361206, + "learning_rate": 5e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6866067051887512, + "num_tokens": 172081444.0, + "step": 6653 + }, + { + "epoch": 0.7307269931913024, + "grad_norm": 1.7706705331802368, + "learning_rate": 5e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7021949291229248, + "num_tokens": 172108223.0, + "step": 6654 + }, + { + "epoch": 0.7308368108939161, + "grad_norm": 1.8183234930038452, + "learning_rate": 5e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6889908909797668, + "num_tokens": 172133930.0, + "step": 6655 + }, + { + "epoch": 0.7309466285965298, + "grad_norm": 1.8070229291915894, + "learning_rate": 5e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7044944167137146, + "num_tokens": 172161316.0, + "step": 6656 + }, + { + "epoch": 0.7310564462991435, + "grad_norm": 1.5049376487731934, + "learning_rate": 5e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.6987367868423462, + "num_tokens": 172195122.0, + "step": 6657 + }, + { + "epoch": 0.7311662640017571, + "grad_norm": 1.7846319675445557, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7186153531074524, + "num_tokens": 172220842.0, + "step": 6658 + }, + { + "epoch": 0.7312760817043708, + "grad_norm": 1.8125803470611572, + "learning_rate": 5e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7128437757492065, + "num_tokens": 172244569.0, + "step": 6659 + }, + { + "epoch": 0.7313858994069844, + "grad_norm": 1.871595859527588, + "learning_rate": 5e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6993366479873657, + "num_tokens": 172271692.0, + "step": 6660 + }, + { + "epoch": 0.731495717109598, + "grad_norm": 1.7472409009933472, + "learning_rate": 5e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7156981229782104, + "num_tokens": 172296386.0, + "step": 6661 + }, + { + "epoch": 0.7316055348122117, + "grad_norm": 1.7624309062957764, + "learning_rate": 5e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7049554586410522, + "num_tokens": 172321874.0, + "step": 6662 + }, + { + "epoch": 0.7317153525148253, + "grad_norm": 1.9259651899337769, + "learning_rate": 5e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7024611830711365, + "num_tokens": 172343708.0, + "step": 6663 + }, + { + "epoch": 0.7318251702174391, + "grad_norm": 1.9403467178344727, + "learning_rate": 5e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7032058238983154, + "num_tokens": 172367079.0, + "step": 6664 + }, + { + "epoch": 0.7319349879200527, + "grad_norm": 1.9004724025726318, + "learning_rate": 5e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6899023652076721, + "num_tokens": 172394064.0, + "step": 6665 + }, + { + "epoch": 0.7320448056226664, + "grad_norm": 2.021329164505005, + "learning_rate": 5e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7334584593772888, + "num_tokens": 172413334.0, + "step": 6666 + }, + { + "epoch": 0.73215462332528, + "grad_norm": 1.7088249921798706, + "learning_rate": 5e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7069616317749023, + "num_tokens": 172440424.0, + "step": 6667 + }, + { + "epoch": 0.7322644410278937, + "grad_norm": 1.7534558773040771, + "learning_rate": 5e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7241942882537842, + "num_tokens": 172464821.0, + "step": 6668 + }, + { + "epoch": 0.7323742587305073, + "grad_norm": 1.676693081855774, + "learning_rate": 5e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7187904119491577, + "num_tokens": 172493455.0, + "step": 6669 + }, + { + "epoch": 0.732484076433121, + "grad_norm": 1.985400676727295, + "learning_rate": 5e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7196518778800964, + "num_tokens": 172516181.0, + "step": 6670 + }, + { + "epoch": 0.7325938941357347, + "grad_norm": 2.0135178565979004, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.703035831451416, + "num_tokens": 172540027.0, + "step": 6671 + }, + { + "epoch": 0.7327037118383484, + "grad_norm": 2.0612382888793945, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7315866947174072, + "num_tokens": 172557979.0, + "step": 6672 + }, + { + "epoch": 0.732813529540962, + "grad_norm": 1.760545253753662, + "learning_rate": 5e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6948957443237305, + "num_tokens": 172587644.0, + "step": 6673 + }, + { + "epoch": 0.7329233472435757, + "grad_norm": 1.6948575973510742, + "learning_rate": 5e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6936307549476624, + "num_tokens": 172617113.0, + "step": 6674 + }, + { + "epoch": 0.7330331649461893, + "grad_norm": 1.8992784023284912, + "learning_rate": 5e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7013579607009888, + "num_tokens": 172642104.0, + "step": 6675 + }, + { + "epoch": 0.733142982648803, + "grad_norm": 1.5732026100158691, + "learning_rate": 5e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7054716348648071, + "num_tokens": 172674904.0, + "step": 6676 + }, + { + "epoch": 0.7332528003514166, + "grad_norm": 2.0143215656280518, + "learning_rate": 5e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7143221497535706, + "num_tokens": 172696082.0, + "step": 6677 + }, + { + "epoch": 0.7333626180540304, + "grad_norm": 1.8009475469589233, + "learning_rate": 5e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6974329948425293, + "num_tokens": 172721239.0, + "step": 6678 + }, + { + "epoch": 0.733472435756644, + "grad_norm": 1.7371591329574585, + "learning_rate": 5e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7009972333908081, + "num_tokens": 172747184.0, + "step": 6679 + }, + { + "epoch": 0.7335822534592576, + "grad_norm": 1.9863275289535522, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7217575907707214, + "num_tokens": 172767450.0, + "step": 6680 + }, + { + "epoch": 0.7336920711618713, + "grad_norm": 1.8197890520095825, + "learning_rate": 5e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6958345770835876, + "num_tokens": 172792745.0, + "step": 6681 + }, + { + "epoch": 0.7338018888644849, + "grad_norm": 1.7627415657043457, + "learning_rate": 5e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6977941989898682, + "num_tokens": 172823538.0, + "step": 6682 + }, + { + "epoch": 0.7339117065670986, + "grad_norm": 1.9047119617462158, + "learning_rate": 5e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7125963568687439, + "num_tokens": 172845204.0, + "step": 6683 + }, + { + "epoch": 0.7340215242697122, + "grad_norm": 1.834356665611267, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7453795671463013, + "num_tokens": 172867951.0, + "step": 6684 + }, + { + "epoch": 0.734131341972326, + "grad_norm": 1.7849681377410889, + "learning_rate": 5e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7173751592636108, + "num_tokens": 172893724.0, + "step": 6685 + }, + { + "epoch": 0.7342411596749396, + "grad_norm": 1.9574432373046875, + "learning_rate": 5e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7075814008712769, + "num_tokens": 172917765.0, + "step": 6686 + }, + { + "epoch": 0.7343509773775533, + "grad_norm": 1.778942584991455, + "learning_rate": 5e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7003225088119507, + "num_tokens": 172945586.0, + "step": 6687 + }, + { + "epoch": 0.7344607950801669, + "grad_norm": 1.7251523733139038, + "learning_rate": 5e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7053053379058838, + "num_tokens": 172977293.0, + "step": 6688 + }, + { + "epoch": 0.7345706127827806, + "grad_norm": 1.7549364566802979, + "learning_rate": 5e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7082525491714478, + "num_tokens": 173003004.0, + "step": 6689 + }, + { + "epoch": 0.7346804304853942, + "grad_norm": 1.6878734827041626, + "learning_rate": 5e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7083302736282349, + "num_tokens": 173030675.0, + "step": 6690 + }, + { + "epoch": 0.7347902481880079, + "grad_norm": 1.7570167779922485, + "learning_rate": 5e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6988067626953125, + "num_tokens": 173055993.0, + "step": 6691 + }, + { + "epoch": 0.7349000658906216, + "grad_norm": 1.669018268585205, + "learning_rate": 5e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7173439264297485, + "num_tokens": 173083611.0, + "step": 6692 + }, + { + "epoch": 0.7350098835932353, + "grad_norm": 1.736513376235962, + "learning_rate": 5e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7223935127258301, + "num_tokens": 173111968.0, + "step": 6693 + }, + { + "epoch": 0.7351197012958489, + "grad_norm": 1.7598512172698975, + "learning_rate": 5e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6866772174835205, + "num_tokens": 173138032.0, + "step": 6694 + }, + { + "epoch": 0.7352295189984626, + "grad_norm": 1.6375564336776733, + "learning_rate": 5e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7256646156311035, + "num_tokens": 173166948.0, + "step": 6695 + }, + { + "epoch": 0.7353393367010762, + "grad_norm": 1.7954694032669067, + "learning_rate": 5e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.6994973421096802, + "num_tokens": 173192004.0, + "step": 6696 + }, + { + "epoch": 0.7354491544036899, + "grad_norm": 1.8929692506790161, + "learning_rate": 5e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.6903205513954163, + "num_tokens": 173216097.0, + "step": 6697 + }, + { + "epoch": 0.7355589721063035, + "grad_norm": 1.7947782278060913, + "learning_rate": 5e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7118486166000366, + "num_tokens": 173239608.0, + "step": 6698 + }, + { + "epoch": 0.7356687898089171, + "grad_norm": 1.7925339937210083, + "learning_rate": 5e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6956188678741455, + "num_tokens": 173265041.0, + "step": 6699 + }, + { + "epoch": 0.7357786075115309, + "grad_norm": 1.7955073118209839, + "learning_rate": 5e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7056070566177368, + "num_tokens": 173288293.0, + "step": 6700 + }, + { + "epoch": 0.7358884252141445, + "grad_norm": 1.6312562227249146, + "learning_rate": 5e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6941760778427124, + "num_tokens": 173316740.0, + "step": 6701 + }, + { + "epoch": 0.7359982429167582, + "grad_norm": 1.9343745708465576, + "learning_rate": 5e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.71471107006073, + "num_tokens": 173339060.0, + "step": 6702 + }, + { + "epoch": 0.7361080606193718, + "grad_norm": 1.9206963777542114, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.745186448097229, + "num_tokens": 173360065.0, + "step": 6703 + }, + { + "epoch": 0.7362178783219855, + "grad_norm": 1.7677747011184692, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7012482285499573, + "num_tokens": 173386905.0, + "step": 6704 + }, + { + "epoch": 0.7363276960245991, + "grad_norm": 1.77741539478302, + "learning_rate": 5e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.713273823261261, + "num_tokens": 173413559.0, + "step": 6705 + }, + { + "epoch": 0.7364375137272128, + "grad_norm": 1.6992241144180298, + "learning_rate": 5e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7037638425827026, + "num_tokens": 173440863.0, + "step": 6706 + }, + { + "epoch": 0.7365473314298265, + "grad_norm": 1.7107962369918823, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7293751239776611, + "num_tokens": 173466482.0, + "step": 6707 + }, + { + "epoch": 0.7366571491324402, + "grad_norm": 1.8239668607711792, + "learning_rate": 5e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.6968814730644226, + "num_tokens": 173494441.0, + "step": 6708 + }, + { + "epoch": 0.7367669668350538, + "grad_norm": 1.9685947895050049, + "learning_rate": 5e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7044060826301575, + "num_tokens": 173518171.0, + "step": 6709 + }, + { + "epoch": 0.7368767845376675, + "grad_norm": 1.7687909603118896, + "learning_rate": 5e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7113962173461914, + "num_tokens": 173544132.0, + "step": 6710 + }, + { + "epoch": 0.7369866022402811, + "grad_norm": 1.885250210762024, + "learning_rate": 5e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6815333962440491, + "num_tokens": 173568989.0, + "step": 6711 + }, + { + "epoch": 0.7370964199428948, + "grad_norm": 1.9779176712036133, + "learning_rate": 5e-06, + "loss": 0.974, + "mean_token_accuracy": 0.6982364654541016, + "num_tokens": 173592452.0, + "step": 6712 + }, + { + "epoch": 0.7372062376455084, + "grad_norm": 1.9684351682662964, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7247815132141113, + "num_tokens": 173613997.0, + "step": 6713 + }, + { + "epoch": 0.7373160553481222, + "grad_norm": 1.7022461891174316, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7391324043273926, + "num_tokens": 173641147.0, + "step": 6714 + }, + { + "epoch": 0.7374258730507358, + "grad_norm": 1.703315258026123, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7359235286712646, + "num_tokens": 173668183.0, + "step": 6715 + }, + { + "epoch": 0.7375356907533495, + "grad_norm": 1.8953639268875122, + "learning_rate": 5e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7225956916809082, + "num_tokens": 173691096.0, + "step": 6716 + }, + { + "epoch": 0.7376455084559631, + "grad_norm": 1.7504726648330688, + "learning_rate": 5e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.711595892906189, + "num_tokens": 173718578.0, + "step": 6717 + }, + { + "epoch": 0.7377553261585768, + "grad_norm": 1.863497257232666, + "learning_rate": 5e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6936012506484985, + "num_tokens": 173743067.0, + "step": 6718 + }, + { + "epoch": 0.7378651438611904, + "grad_norm": 1.9901363849639893, + "learning_rate": 5e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7069224119186401, + "num_tokens": 173763470.0, + "step": 6719 + }, + { + "epoch": 0.737974961563804, + "grad_norm": 1.7910743951797485, + "learning_rate": 5e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6910253763198853, + "num_tokens": 173790039.0, + "step": 6720 + }, + { + "epoch": 0.7380847792664178, + "grad_norm": 1.7056736946105957, + "learning_rate": 5e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6864261627197266, + "num_tokens": 173819767.0, + "step": 6721 + }, + { + "epoch": 0.7381945969690314, + "grad_norm": 2.0748276710510254, + "learning_rate": 5e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.713123083114624, + "num_tokens": 173840555.0, + "step": 6722 + }, + { + "epoch": 0.7383044146716451, + "grad_norm": 2.081367015838623, + "learning_rate": 5e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7198563814163208, + "num_tokens": 173860789.0, + "step": 6723 + }, + { + "epoch": 0.7384142323742587, + "grad_norm": 1.8497564792633057, + "learning_rate": 5e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.7064254879951477, + "num_tokens": 173885983.0, + "step": 6724 + }, + { + "epoch": 0.7385240500768724, + "grad_norm": 2.058267116546631, + "learning_rate": 5e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.705578625202179, + "num_tokens": 173906327.0, + "step": 6725 + }, + { + "epoch": 0.738633867779486, + "grad_norm": 1.7569875717163086, + "learning_rate": 5e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7116173505783081, + "num_tokens": 173933659.0, + "step": 6726 + }, + { + "epoch": 0.7387436854820997, + "grad_norm": 1.8359683752059937, + "learning_rate": 5e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6985320448875427, + "num_tokens": 173960853.0, + "step": 6727 + }, + { + "epoch": 0.7388535031847133, + "grad_norm": 1.6858365535736084, + "learning_rate": 5e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7431491613388062, + "num_tokens": 173988819.0, + "step": 6728 + }, + { + "epoch": 0.7389633208873271, + "grad_norm": 1.790127158164978, + "learning_rate": 5e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7299590110778809, + "num_tokens": 174012169.0, + "step": 6729 + }, + { + "epoch": 0.7390731385899407, + "grad_norm": 1.6976134777069092, + "learning_rate": 5e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.713388204574585, + "num_tokens": 174040488.0, + "step": 6730 + }, + { + "epoch": 0.7391829562925544, + "grad_norm": 2.155566692352295, + "learning_rate": 5e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7148175239562988, + "num_tokens": 174060150.0, + "step": 6731 + }, + { + "epoch": 0.739292773995168, + "grad_norm": 1.7278499603271484, + "learning_rate": 5e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7185615301132202, + "num_tokens": 174088712.0, + "step": 6732 + }, + { + "epoch": 0.7394025916977817, + "grad_norm": 1.7296030521392822, + "learning_rate": 5e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7215147614479065, + "num_tokens": 174115411.0, + "step": 6733 + }, + { + "epoch": 0.7395124094003953, + "grad_norm": 1.7271015644073486, + "learning_rate": 5e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7015025615692139, + "num_tokens": 174146401.0, + "step": 6734 + }, + { + "epoch": 0.739622227103009, + "grad_norm": 1.604109764099121, + "learning_rate": 5e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7285152077674866, + "num_tokens": 174175629.0, + "step": 6735 + }, + { + "epoch": 0.7397320448056227, + "grad_norm": 1.8439961671829224, + "learning_rate": 5e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7193421721458435, + "num_tokens": 174200214.0, + "step": 6736 + }, + { + "epoch": 0.7398418625082364, + "grad_norm": 1.8072385787963867, + "learning_rate": 5e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7099791765213013, + "num_tokens": 174225098.0, + "step": 6737 + }, + { + "epoch": 0.73995168021085, + "grad_norm": 2.123671531677246, + "learning_rate": 5e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7141269445419312, + "num_tokens": 174245750.0, + "step": 6738 + }, + { + "epoch": 0.7400614979134637, + "grad_norm": 1.9125914573669434, + "learning_rate": 5e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6993370652198792, + "num_tokens": 174270665.0, + "step": 6739 + }, + { + "epoch": 0.7401713156160773, + "grad_norm": 1.8101555109024048, + "learning_rate": 5e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7046511173248291, + "num_tokens": 174295382.0, + "step": 6740 + }, + { + "epoch": 0.7402811333186909, + "grad_norm": 2.161746025085449, + "learning_rate": 5e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7229834794998169, + "num_tokens": 174314376.0, + "step": 6741 + }, + { + "epoch": 0.7403909510213046, + "grad_norm": 1.9705514907836914, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.717613160610199, + "num_tokens": 174336282.0, + "step": 6742 + }, + { + "epoch": 0.7405007687239183, + "grad_norm": 1.8630597591400146, + "learning_rate": 5e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6966291666030884, + "num_tokens": 174363220.0, + "step": 6743 + }, + { + "epoch": 0.740610586426532, + "grad_norm": 1.7634367942810059, + "learning_rate": 5e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7108347415924072, + "num_tokens": 174390998.0, + "step": 6744 + }, + { + "epoch": 0.7407204041291456, + "grad_norm": 1.929999828338623, + "learning_rate": 5e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7104321718215942, + "num_tokens": 174414357.0, + "step": 6745 + }, + { + "epoch": 0.7408302218317593, + "grad_norm": 1.6935824155807495, + "learning_rate": 5e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7013370990753174, + "num_tokens": 174441619.0, + "step": 6746 + }, + { + "epoch": 0.7409400395343729, + "grad_norm": 1.6105504035949707, + "learning_rate": 5e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7249944806098938, + "num_tokens": 174471241.0, + "step": 6747 + }, + { + "epoch": 0.7410498572369866, + "grad_norm": 2.1203222274780273, + "learning_rate": 5e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7254942655563354, + "num_tokens": 174491024.0, + "step": 6748 + }, + { + "epoch": 0.7411596749396002, + "grad_norm": 1.7181459665298462, + "learning_rate": 5e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6914045214653015, + "num_tokens": 174518870.0, + "step": 6749 + }, + { + "epoch": 0.741269492642214, + "grad_norm": 1.8916229009628296, + "learning_rate": 5e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6951240301132202, + "num_tokens": 174543470.0, + "step": 6750 + }, + { + "epoch": 0.7413793103448276, + "grad_norm": 1.901505947113037, + "learning_rate": 5e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7193583250045776, + "num_tokens": 174567875.0, + "step": 6751 + }, + { + "epoch": 0.7414891280474413, + "grad_norm": 1.89972722530365, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7037674188613892, + "num_tokens": 174594155.0, + "step": 6752 + }, + { + "epoch": 0.7415989457500549, + "grad_norm": 1.6328529119491577, + "learning_rate": 5e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7281431555747986, + "num_tokens": 174625239.0, + "step": 6753 + }, + { + "epoch": 0.7417087634526686, + "grad_norm": 1.7530919313430786, + "learning_rate": 5e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7101633548736572, + "num_tokens": 174651635.0, + "step": 6754 + }, + { + "epoch": 0.7418185811552822, + "grad_norm": 1.8073891401290894, + "learning_rate": 5e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6853866577148438, + "num_tokens": 174680320.0, + "step": 6755 + }, + { + "epoch": 0.7419283988578959, + "grad_norm": 2.085664749145508, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7335327863693237, + "num_tokens": 174698587.0, + "step": 6756 + }, + { + "epoch": 0.7420382165605095, + "grad_norm": 1.8254650831222534, + "learning_rate": 5e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7035390138626099, + "num_tokens": 174724973.0, + "step": 6757 + }, + { + "epoch": 0.7421480342631233, + "grad_norm": 1.6251044273376465, + "learning_rate": 5e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7159032821655273, + "num_tokens": 174753712.0, + "step": 6758 + }, + { + "epoch": 0.7422578519657369, + "grad_norm": 1.689699649810791, + "learning_rate": 5e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7110631465911865, + "num_tokens": 174782540.0, + "step": 6759 + }, + { + "epoch": 0.7423676696683505, + "grad_norm": 1.813529372215271, + "learning_rate": 5e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6911722421646118, + "num_tokens": 174808838.0, + "step": 6760 + }, + { + "epoch": 0.7424774873709642, + "grad_norm": 1.668872594833374, + "learning_rate": 5e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7078752517700195, + "num_tokens": 174838153.0, + "step": 6761 + }, + { + "epoch": 0.7425873050735778, + "grad_norm": 1.717344045639038, + "learning_rate": 5e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7142813205718994, + "num_tokens": 174866925.0, + "step": 6762 + }, + { + "epoch": 0.7426971227761915, + "grad_norm": 1.8645943403244019, + "learning_rate": 5e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7264977097511292, + "num_tokens": 174889095.0, + "step": 6763 + }, + { + "epoch": 0.7428069404788051, + "grad_norm": 1.6537985801696777, + "learning_rate": 5e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7032557725906372, + "num_tokens": 174919794.0, + "step": 6764 + }, + { + "epoch": 0.7429167581814189, + "grad_norm": 2.1591532230377197, + "learning_rate": 5e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7197086215019226, + "num_tokens": 174937459.0, + "step": 6765 + }, + { + "epoch": 0.7430265758840325, + "grad_norm": 1.6562840938568115, + "learning_rate": 5e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6863632202148438, + "num_tokens": 174969165.0, + "step": 6766 + }, + { + "epoch": 0.7431363935866462, + "grad_norm": 1.7132023572921753, + "learning_rate": 5e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7076785564422607, + "num_tokens": 174998690.0, + "step": 6767 + }, + { + "epoch": 0.7432462112892598, + "grad_norm": 1.8381603956222534, + "learning_rate": 5e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7015801668167114, + "num_tokens": 175025672.0, + "step": 6768 + }, + { + "epoch": 0.7433560289918735, + "grad_norm": 1.7930539846420288, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7347432374954224, + "num_tokens": 175050341.0, + "step": 6769 + }, + { + "epoch": 0.7434658466944871, + "grad_norm": 1.7718437910079956, + "learning_rate": 5e-06, + "loss": 0.985, + "mean_token_accuracy": 0.6997154951095581, + "num_tokens": 175077935.0, + "step": 6770 + }, + { + "epoch": 0.7435756643971008, + "grad_norm": 1.725441813468933, + "learning_rate": 5e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7169710397720337, + "num_tokens": 175104296.0, + "step": 6771 + }, + { + "epoch": 0.7436854820997145, + "grad_norm": 1.7288844585418701, + "learning_rate": 5e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6795740723609924, + "num_tokens": 175132212.0, + "step": 6772 + }, + { + "epoch": 0.7437952998023282, + "grad_norm": 1.8287572860717773, + "learning_rate": 5e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6875357031822205, + "num_tokens": 175160457.0, + "step": 6773 + }, + { + "epoch": 0.7439051175049418, + "grad_norm": 1.8229535818099976, + "learning_rate": 5e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.712354302406311, + "num_tokens": 175184935.0, + "step": 6774 + }, + { + "epoch": 0.7440149352075555, + "grad_norm": 2.038828134536743, + "learning_rate": 5e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.6942613124847412, + "num_tokens": 175208248.0, + "step": 6775 + }, + { + "epoch": 0.7441247529101691, + "grad_norm": 1.7030504941940308, + "learning_rate": 5e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6979702711105347, + "num_tokens": 175237179.0, + "step": 6776 + }, + { + "epoch": 0.7442345706127828, + "grad_norm": 1.5362344980239868, + "learning_rate": 5e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7103463411331177, + "num_tokens": 175268717.0, + "step": 6777 + }, + { + "epoch": 0.7443443883153964, + "grad_norm": 1.9686435461044312, + "learning_rate": 5e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6978862881660461, + "num_tokens": 175291804.0, + "step": 6778 + }, + { + "epoch": 0.7444542060180102, + "grad_norm": 1.6299614906311035, + "learning_rate": 5e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7105990648269653, + "num_tokens": 175318812.0, + "step": 6779 + }, + { + "epoch": 0.7445640237206238, + "grad_norm": 1.6634081602096558, + "learning_rate": 5e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6833015084266663, + "num_tokens": 175353780.0, + "step": 6780 + }, + { + "epoch": 0.7446738414232374, + "grad_norm": 1.9610183238983154, + "learning_rate": 5e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6918151378631592, + "num_tokens": 175376696.0, + "step": 6781 + }, + { + "epoch": 0.7447836591258511, + "grad_norm": 2.108767032623291, + "learning_rate": 5e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7065213918685913, + "num_tokens": 175395555.0, + "step": 6782 + }, + { + "epoch": 0.7448934768284647, + "grad_norm": 1.8191964626312256, + "learning_rate": 5e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6923116445541382, + "num_tokens": 175423443.0, + "step": 6783 + }, + { + "epoch": 0.7450032945310784, + "grad_norm": 1.718137264251709, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7275991439819336, + "num_tokens": 175452636.0, + "step": 6784 + }, + { + "epoch": 0.745113112233692, + "grad_norm": 1.770534634590149, + "learning_rate": 5e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.713840901851654, + "num_tokens": 175480732.0, + "step": 6785 + }, + { + "epoch": 0.7452229299363057, + "grad_norm": 1.780426263809204, + "learning_rate": 5e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6908321380615234, + "num_tokens": 175508072.0, + "step": 6786 + }, + { + "epoch": 0.7453327476389194, + "grad_norm": 1.8533974885940552, + "learning_rate": 5e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7139599323272705, + "num_tokens": 175531460.0, + "step": 6787 + }, + { + "epoch": 0.7454425653415331, + "grad_norm": 1.8186630010604858, + "learning_rate": 5e-06, + "loss": 1.1051, + "mean_token_accuracy": 0.6827374696731567, + "num_tokens": 175559447.0, + "step": 6788 + }, + { + "epoch": 0.7455523830441467, + "grad_norm": 2.061112880706787, + "learning_rate": 5e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7054105997085571, + "num_tokens": 175581289.0, + "step": 6789 + }, + { + "epoch": 0.7456622007467604, + "grad_norm": 1.884989619255066, + "learning_rate": 5e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7151485085487366, + "num_tokens": 175605651.0, + "step": 6790 + }, + { + "epoch": 0.745772018449374, + "grad_norm": 1.7893208265304565, + "learning_rate": 5e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7042351961135864, + "num_tokens": 175634314.0, + "step": 6791 + }, + { + "epoch": 0.7458818361519877, + "grad_norm": 1.599381446838379, + "learning_rate": 5e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6828660368919373, + "num_tokens": 175667735.0, + "step": 6792 + }, + { + "epoch": 0.7459916538546013, + "grad_norm": 2.0102243423461914, + "learning_rate": 5e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7166789174079895, + "num_tokens": 175688158.0, + "step": 6793 + }, + { + "epoch": 0.7461014715572151, + "grad_norm": 1.6612906455993652, + "learning_rate": 5e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.702934205532074, + "num_tokens": 175716535.0, + "step": 6794 + }, + { + "epoch": 0.7462112892598287, + "grad_norm": 1.720527172088623, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.6996374130249023, + "num_tokens": 175742977.0, + "step": 6795 + }, + { + "epoch": 0.7463211069624424, + "grad_norm": 1.7423970699310303, + "learning_rate": 5e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7065408825874329, + "num_tokens": 175770474.0, + "step": 6796 + }, + { + "epoch": 0.746430924665056, + "grad_norm": 1.9411860704421997, + "learning_rate": 5e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7164310812950134, + "num_tokens": 175794386.0, + "step": 6797 + }, + { + "epoch": 0.7465407423676697, + "grad_norm": 1.554671287536621, + "learning_rate": 5e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7147728204727173, + "num_tokens": 175825528.0, + "step": 6798 + }, + { + "epoch": 0.7466505600702833, + "grad_norm": 1.743683099746704, + "learning_rate": 5e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7164865732192993, + "num_tokens": 175851502.0, + "step": 6799 + }, + { + "epoch": 0.7467603777728969, + "grad_norm": 1.625912070274353, + "learning_rate": 5e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7064616084098816, + "num_tokens": 175879906.0, + "step": 6800 + }, + { + "epoch": 0.7468701954755107, + "grad_norm": 1.947420597076416, + "learning_rate": 5e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.694575846195221, + "num_tokens": 175902697.0, + "step": 6801 + }, + { + "epoch": 0.7469800131781243, + "grad_norm": 1.9182612895965576, + "learning_rate": 5e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7160319685935974, + "num_tokens": 175926690.0, + "step": 6802 + }, + { + "epoch": 0.747089830880738, + "grad_norm": 1.8890055418014526, + "learning_rate": 5e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.693060040473938, + "num_tokens": 175950149.0, + "step": 6803 + }, + { + "epoch": 0.7471996485833516, + "grad_norm": 2.2549173831939697, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7300225496292114, + "num_tokens": 175966886.0, + "step": 6804 + }, + { + "epoch": 0.7473094662859653, + "grad_norm": 1.7153234481811523, + "learning_rate": 5e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7065214514732361, + "num_tokens": 175993377.0, + "step": 6805 + }, + { + "epoch": 0.7474192839885789, + "grad_norm": 1.605275273323059, + "learning_rate": 5e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.706892728805542, + "num_tokens": 176024592.0, + "step": 6806 + }, + { + "epoch": 0.7475291016911926, + "grad_norm": 1.7600542306900024, + "learning_rate": 5e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6985998153686523, + "num_tokens": 176050507.0, + "step": 6807 + }, + { + "epoch": 0.7476389193938063, + "grad_norm": 1.8512927293777466, + "learning_rate": 5e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6963268518447876, + "num_tokens": 176081181.0, + "step": 6808 + }, + { + "epoch": 0.74774873709642, + "grad_norm": 1.8278872966766357, + "learning_rate": 5e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7095985412597656, + "num_tokens": 176108111.0, + "step": 6809 + }, + { + "epoch": 0.7478585547990336, + "grad_norm": 1.85031259059906, + "learning_rate": 5e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7050400972366333, + "num_tokens": 176136611.0, + "step": 6810 + }, + { + "epoch": 0.7479683725016473, + "grad_norm": 1.9352165460586548, + "learning_rate": 5e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7057136297225952, + "num_tokens": 176159353.0, + "step": 6811 + }, + { + "epoch": 0.7480781902042609, + "grad_norm": 1.7731659412384033, + "learning_rate": 5e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.6988075971603394, + "num_tokens": 176188374.0, + "step": 6812 + }, + { + "epoch": 0.7481880079068746, + "grad_norm": 1.7573133707046509, + "learning_rate": 5e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6902883648872375, + "num_tokens": 176217045.0, + "step": 6813 + }, + { + "epoch": 0.7482978256094882, + "grad_norm": 1.6965214014053345, + "learning_rate": 5e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7039077281951904, + "num_tokens": 176245922.0, + "step": 6814 + }, + { + "epoch": 0.7484076433121019, + "grad_norm": 1.8081804513931274, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7274076342582703, + "num_tokens": 176268728.0, + "step": 6815 + }, + { + "epoch": 0.7485174610147156, + "grad_norm": 1.9017399549484253, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7185922861099243, + "num_tokens": 176292162.0, + "step": 6816 + }, + { + "epoch": 0.7486272787173293, + "grad_norm": 1.9116238355636597, + "learning_rate": 5e-06, + "loss": 0.98, + "mean_token_accuracy": 0.701910138130188, + "num_tokens": 176316240.0, + "step": 6817 + }, + { + "epoch": 0.7487370964199429, + "grad_norm": 1.8301725387573242, + "learning_rate": 5e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.702867865562439, + "num_tokens": 176344755.0, + "step": 6818 + }, + { + "epoch": 0.7488469141225566, + "grad_norm": 1.5491578578948975, + "learning_rate": 5e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7177749276161194, + "num_tokens": 176375588.0, + "step": 6819 + }, + { + "epoch": 0.7489567318251702, + "grad_norm": 1.6632163524627686, + "learning_rate": 5e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7024040222167969, + "num_tokens": 176407648.0, + "step": 6820 + }, + { + "epoch": 0.7490665495277838, + "grad_norm": 1.7980601787567139, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6946895122528076, + "num_tokens": 176433295.0, + "step": 6821 + }, + { + "epoch": 0.7491763672303975, + "grad_norm": 1.877301573753357, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7400646209716797, + "num_tokens": 176455053.0, + "step": 6822 + }, + { + "epoch": 0.7492861849330112, + "grad_norm": 1.6442559957504272, + "learning_rate": 5e-06, + "loss": 1.1004, + "mean_token_accuracy": 0.6771172285079956, + "num_tokens": 176487684.0, + "step": 6823 + }, + { + "epoch": 0.7493960026356249, + "grad_norm": 2.033393144607544, + "learning_rate": 5e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.705327033996582, + "num_tokens": 176507838.0, + "step": 6824 + }, + { + "epoch": 0.7495058203382385, + "grad_norm": 1.7253661155700684, + "learning_rate": 5e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.700411319732666, + "num_tokens": 176537934.0, + "step": 6825 + }, + { + "epoch": 0.7496156380408522, + "grad_norm": 1.8083059787750244, + "learning_rate": 5e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7296965718269348, + "num_tokens": 176561917.0, + "step": 6826 + }, + { + "epoch": 0.7497254557434658, + "grad_norm": 1.8651481866836548, + "learning_rate": 5e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7176212072372437, + "num_tokens": 176586341.0, + "step": 6827 + }, + { + "epoch": 0.7498352734460795, + "grad_norm": 1.816069483757019, + "learning_rate": 5e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6919650435447693, + "num_tokens": 176614030.0, + "step": 6828 + }, + { + "epoch": 0.7499450911486931, + "grad_norm": 1.7688909769058228, + "learning_rate": 5e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7092468738555908, + "num_tokens": 176640715.0, + "step": 6829 + }, + { + "epoch": 0.7500549088513069, + "grad_norm": 1.7544825077056885, + "learning_rate": 5e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7376149892807007, + "num_tokens": 176666259.0, + "step": 6830 + }, + { + "epoch": 0.7501647265539205, + "grad_norm": 1.8890262842178345, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7293726205825806, + "num_tokens": 176689473.0, + "step": 6831 + }, + { + "epoch": 0.7502745442565342, + "grad_norm": 1.6548354625701904, + "learning_rate": 5e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6840274930000305, + "num_tokens": 176720725.0, + "step": 6832 + }, + { + "epoch": 0.7503843619591478, + "grad_norm": 1.6369072198867798, + "learning_rate": 5e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6937315464019775, + "num_tokens": 176751256.0, + "step": 6833 + }, + { + "epoch": 0.7504941796617615, + "grad_norm": 1.682088017463684, + "learning_rate": 5e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.706957995891571, + "num_tokens": 176779266.0, + "step": 6834 + }, + { + "epoch": 0.7506039973643751, + "grad_norm": 1.7396773099899292, + "learning_rate": 5e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7025409936904907, + "num_tokens": 176805999.0, + "step": 6835 + }, + { + "epoch": 0.7507138150669888, + "grad_norm": 2.1210732460021973, + "learning_rate": 5e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7196515798568726, + "num_tokens": 176825564.0, + "step": 6836 + }, + { + "epoch": 0.7508236327696025, + "grad_norm": 1.8952717781066895, + "learning_rate": 5e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7128725051879883, + "num_tokens": 176847619.0, + "step": 6837 + }, + { + "epoch": 0.7509334504722162, + "grad_norm": 1.755016803741455, + "learning_rate": 5e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7115780711174011, + "num_tokens": 176872828.0, + "step": 6838 + }, + { + "epoch": 0.7510432681748298, + "grad_norm": 1.667568325996399, + "learning_rate": 5e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7042442560195923, + "num_tokens": 176900436.0, + "step": 6839 + }, + { + "epoch": 0.7511530858774434, + "grad_norm": 1.7453322410583496, + "learning_rate": 5e-06, + "loss": 1.1191, + "mean_token_accuracy": 0.662452220916748, + "num_tokens": 176928407.0, + "step": 6840 + }, + { + "epoch": 0.7512629035800571, + "grad_norm": 1.5949792861938477, + "learning_rate": 5e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7187501788139343, + "num_tokens": 176962326.0, + "step": 6841 + }, + { + "epoch": 0.7513727212826707, + "grad_norm": 1.6782300472259521, + "learning_rate": 5e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7020512819290161, + "num_tokens": 176991132.0, + "step": 6842 + }, + { + "epoch": 0.7514825389852844, + "grad_norm": 1.7408326864242554, + "learning_rate": 5e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7107745409011841, + "num_tokens": 177018580.0, + "step": 6843 + }, + { + "epoch": 0.7515923566878981, + "grad_norm": 1.8323427438735962, + "learning_rate": 5e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6965451836585999, + "num_tokens": 177045607.0, + "step": 6844 + }, + { + "epoch": 0.7517021743905118, + "grad_norm": 1.6231262683868408, + "learning_rate": 5e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7093448638916016, + "num_tokens": 177081365.0, + "step": 6845 + }, + { + "epoch": 0.7518119920931254, + "grad_norm": 1.5529261827468872, + "learning_rate": 5e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7156413793563843, + "num_tokens": 177114958.0, + "step": 6846 + }, + { + "epoch": 0.7519218097957391, + "grad_norm": 1.8946627378463745, + "learning_rate": 5e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6886053085327148, + "num_tokens": 177139411.0, + "step": 6847 + }, + { + "epoch": 0.7520316274983527, + "grad_norm": 1.9415414333343506, + "learning_rate": 5e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.705314040184021, + "num_tokens": 177161243.0, + "step": 6848 + }, + { + "epoch": 0.7521414452009664, + "grad_norm": 1.7609103918075562, + "learning_rate": 5e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6934871673583984, + "num_tokens": 177187852.0, + "step": 6849 + }, + { + "epoch": 0.75225126290358, + "grad_norm": 1.8597519397735596, + "learning_rate": 5e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7346265316009521, + "num_tokens": 177210721.0, + "step": 6850 + }, + { + "epoch": 0.7523610806061937, + "grad_norm": 1.983247995376587, + "learning_rate": 5e-06, + "loss": 0.994, + "mean_token_accuracy": 0.6948528289794922, + "num_tokens": 177235328.0, + "step": 6851 + }, + { + "epoch": 0.7524708983088074, + "grad_norm": 2.0113327503204346, + "learning_rate": 5e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.703385055065155, + "num_tokens": 177258118.0, + "step": 6852 + }, + { + "epoch": 0.7525807160114211, + "grad_norm": 1.7854481935501099, + "learning_rate": 5e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7033364176750183, + "num_tokens": 177284084.0, + "step": 6853 + }, + { + "epoch": 0.7526905337140347, + "grad_norm": 1.7082455158233643, + "learning_rate": 5e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7131848335266113, + "num_tokens": 177311821.0, + "step": 6854 + }, + { + "epoch": 0.7528003514166484, + "grad_norm": 1.9808228015899658, + "learning_rate": 5e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7290942072868347, + "num_tokens": 177332994.0, + "step": 6855 + }, + { + "epoch": 0.752910169119262, + "grad_norm": 1.6858530044555664, + "learning_rate": 5e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7151048183441162, + "num_tokens": 177359700.0, + "step": 6856 + }, + { + "epoch": 0.7530199868218757, + "grad_norm": 1.7443053722381592, + "learning_rate": 5e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6746405363082886, + "num_tokens": 177390171.0, + "step": 6857 + }, + { + "epoch": 0.7531298045244893, + "grad_norm": 1.725106120109558, + "learning_rate": 5e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6921412348747253, + "num_tokens": 177416961.0, + "step": 6858 + }, + { + "epoch": 0.7532396222271031, + "grad_norm": 1.761033058166504, + "learning_rate": 5e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7200541496276855, + "num_tokens": 177442860.0, + "step": 6859 + }, + { + "epoch": 0.7533494399297167, + "grad_norm": 1.9654749631881714, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7330215573310852, + "num_tokens": 177464906.0, + "step": 6860 + }, + { + "epoch": 0.7534592576323303, + "grad_norm": 1.8900116682052612, + "learning_rate": 5e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.705001950263977, + "num_tokens": 177488156.0, + "step": 6861 + }, + { + "epoch": 0.753569075334944, + "grad_norm": 1.7568706274032593, + "learning_rate": 5e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7005125880241394, + "num_tokens": 177515403.0, + "step": 6862 + }, + { + "epoch": 0.7536788930375576, + "grad_norm": 1.8549609184265137, + "learning_rate": 5e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6987206935882568, + "num_tokens": 177539437.0, + "step": 6863 + }, + { + "epoch": 0.7537887107401713, + "grad_norm": 1.9231510162353516, + "learning_rate": 5e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7230798006057739, + "num_tokens": 177562007.0, + "step": 6864 + }, + { + "epoch": 0.7538985284427849, + "grad_norm": 1.98160719871521, + "learning_rate": 5e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7046068906784058, + "num_tokens": 177584680.0, + "step": 6865 + }, + { + "epoch": 0.7540083461453987, + "grad_norm": 1.9061510562896729, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7187884449958801, + "num_tokens": 177608769.0, + "step": 6866 + }, + { + "epoch": 0.7541181638480123, + "grad_norm": 1.692768931388855, + "learning_rate": 5e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7056112885475159, + "num_tokens": 177639535.0, + "step": 6867 + }, + { + "epoch": 0.754227981550626, + "grad_norm": 1.6701362133026123, + "learning_rate": 5e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7095625996589661, + "num_tokens": 177668038.0, + "step": 6868 + }, + { + "epoch": 0.7543377992532396, + "grad_norm": 1.7636809349060059, + "learning_rate": 5e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7162817120552063, + "num_tokens": 177692416.0, + "step": 6869 + }, + { + "epoch": 0.7544476169558533, + "grad_norm": 1.6889519691467285, + "learning_rate": 5e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7153698801994324, + "num_tokens": 177720490.0, + "step": 6870 + }, + { + "epoch": 0.7545574346584669, + "grad_norm": 1.6194372177124023, + "learning_rate": 5e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.675534725189209, + "num_tokens": 177751806.0, + "step": 6871 + }, + { + "epoch": 0.7546672523610806, + "grad_norm": 1.8632197380065918, + "learning_rate": 5e-06, + "loss": 1.1039, + "mean_token_accuracy": 0.6797692179679871, + "num_tokens": 177779488.0, + "step": 6872 + }, + { + "epoch": 0.7547770700636943, + "grad_norm": 1.8984391689300537, + "learning_rate": 5e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.706201434135437, + "num_tokens": 177803361.0, + "step": 6873 + }, + { + "epoch": 0.754886887766308, + "grad_norm": 1.8394216299057007, + "learning_rate": 5e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6822987794876099, + "num_tokens": 177831344.0, + "step": 6874 + }, + { + "epoch": 0.7549967054689216, + "grad_norm": 1.6720597743988037, + "learning_rate": 5e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7136589288711548, + "num_tokens": 177860691.0, + "step": 6875 + }, + { + "epoch": 0.7551065231715353, + "grad_norm": 1.8573768138885498, + "learning_rate": 5e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7134901285171509, + "num_tokens": 177884815.0, + "step": 6876 + }, + { + "epoch": 0.7552163408741489, + "grad_norm": 1.5803775787353516, + "learning_rate": 5e-06, + "loss": 1.0385, + "mean_token_accuracy": 0.6967251300811768, + "num_tokens": 177915614.0, + "step": 6877 + }, + { + "epoch": 0.7553261585767626, + "grad_norm": 1.853921890258789, + "learning_rate": 5e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7344043850898743, + "num_tokens": 177936422.0, + "step": 6878 + }, + { + "epoch": 0.7554359762793762, + "grad_norm": 1.6913368701934814, + "learning_rate": 5e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7194645404815674, + "num_tokens": 177963836.0, + "step": 6879 + }, + { + "epoch": 0.7555457939819898, + "grad_norm": 2.092468023300171, + "learning_rate": 5e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7094271183013916, + "num_tokens": 177983323.0, + "step": 6880 + }, + { + "epoch": 0.7556556116846036, + "grad_norm": 1.7183994054794312, + "learning_rate": 5e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7096778154373169, + "num_tokens": 178012334.0, + "step": 6881 + }, + { + "epoch": 0.7557654293872172, + "grad_norm": 1.6939507722854614, + "learning_rate": 5e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6735201478004456, + "num_tokens": 178043438.0, + "step": 6882 + }, + { + "epoch": 0.7558752470898309, + "grad_norm": 1.5506114959716797, + "learning_rate": 5e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7058813571929932, + "num_tokens": 178076495.0, + "step": 6883 + }, + { + "epoch": 0.7559850647924445, + "grad_norm": 1.8314130306243896, + "learning_rate": 5e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.698460578918457, + "num_tokens": 178103150.0, + "step": 6884 + }, + { + "epoch": 0.7560948824950582, + "grad_norm": 1.9608842134475708, + "learning_rate": 5e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7013875842094421, + "num_tokens": 178126594.0, + "step": 6885 + }, + { + "epoch": 0.7562047001976718, + "grad_norm": 1.891626238822937, + "learning_rate": 5e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7045555710792542, + "num_tokens": 178152180.0, + "step": 6886 + }, + { + "epoch": 0.7563145179002855, + "grad_norm": 1.5590620040893555, + "learning_rate": 5e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7053806781768799, + "num_tokens": 178184731.0, + "step": 6887 + }, + { + "epoch": 0.7564243356028992, + "grad_norm": 1.8881714344024658, + "learning_rate": 5e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7186286449432373, + "num_tokens": 178207074.0, + "step": 6888 + }, + { + "epoch": 0.7565341533055129, + "grad_norm": 1.8911654949188232, + "learning_rate": 5e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7220069766044617, + "num_tokens": 178228122.0, + "step": 6889 + }, + { + "epoch": 0.7566439710081265, + "grad_norm": 2.0470635890960693, + "learning_rate": 5e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7179737687110901, + "num_tokens": 178248447.0, + "step": 6890 + }, + { + "epoch": 0.7567537887107402, + "grad_norm": 2.0801494121551514, + "learning_rate": 5e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7086670994758606, + "num_tokens": 178269671.0, + "step": 6891 + }, + { + "epoch": 0.7568636064133538, + "grad_norm": 2.0074751377105713, + "learning_rate": 5e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7165809273719788, + "num_tokens": 178291590.0, + "step": 6892 + }, + { + "epoch": 0.7569734241159675, + "grad_norm": 1.9346222877502441, + "learning_rate": 5e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.696799099445343, + "num_tokens": 178317968.0, + "step": 6893 + }, + { + "epoch": 0.7570832418185811, + "grad_norm": 1.7935445308685303, + "learning_rate": 5e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7176920175552368, + "num_tokens": 178345487.0, + "step": 6894 + }, + { + "epoch": 0.7571930595211949, + "grad_norm": 1.6715401411056519, + "learning_rate": 5e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6918329000473022, + "num_tokens": 178377996.0, + "step": 6895 + }, + { + "epoch": 0.7573028772238085, + "grad_norm": 1.6335636377334595, + "learning_rate": 5e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7022499442100525, + "num_tokens": 178409419.0, + "step": 6896 + }, + { + "epoch": 0.7574126949264222, + "grad_norm": 1.7229793071746826, + "learning_rate": 5e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.691279411315918, + "num_tokens": 178436573.0, + "step": 6897 + }, + { + "epoch": 0.7575225126290358, + "grad_norm": 1.7871425151824951, + "learning_rate": 5e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7127758264541626, + "num_tokens": 178461719.0, + "step": 6898 + }, + { + "epoch": 0.7576323303316495, + "grad_norm": 1.8520245552062988, + "learning_rate": 5e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6909618973731995, + "num_tokens": 178488520.0, + "step": 6899 + }, + { + "epoch": 0.7577421480342631, + "grad_norm": 1.7861335277557373, + "learning_rate": 5e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.68392413854599, + "num_tokens": 178517335.0, + "step": 6900 + }, + { + "epoch": 0.7578519657368767, + "grad_norm": 1.5515789985656738, + "learning_rate": 5e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.721233606338501, + "num_tokens": 178551510.0, + "step": 6901 + }, + { + "epoch": 0.7579617834394905, + "grad_norm": 1.781494140625, + "learning_rate": 5e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7022720575332642, + "num_tokens": 178577674.0, + "step": 6902 + }, + { + "epoch": 0.7580716011421041, + "grad_norm": 1.9546489715576172, + "learning_rate": 5e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7386027574539185, + "num_tokens": 178598295.0, + "step": 6903 + }, + { + "epoch": 0.7581814188447178, + "grad_norm": 2.2007625102996826, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7391846179962158, + "num_tokens": 178615741.0, + "step": 6904 + }, + { + "epoch": 0.7582912365473314, + "grad_norm": 1.7941278219223022, + "learning_rate": 5e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7033331990242004, + "num_tokens": 178641049.0, + "step": 6905 + }, + { + "epoch": 0.7584010542499451, + "grad_norm": 1.7675660848617554, + "learning_rate": 5e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6867427825927734, + "num_tokens": 178668860.0, + "step": 6906 + }, + { + "epoch": 0.7585108719525587, + "grad_norm": 1.766654372215271, + "learning_rate": 5e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.6987738609313965, + "num_tokens": 178697245.0, + "step": 6907 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 1.85592782497406, + "learning_rate": 5e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.6929992437362671, + "num_tokens": 178723122.0, + "step": 6908 + }, + { + "epoch": 0.758730507357786, + "grad_norm": 1.604433298110962, + "learning_rate": 5e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7312772870063782, + "num_tokens": 178751626.0, + "step": 6909 + }, + { + "epoch": 0.7588403250603998, + "grad_norm": 1.9733941555023193, + "learning_rate": 5e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7374579906463623, + "num_tokens": 178770710.0, + "step": 6910 + }, + { + "epoch": 0.7589501427630134, + "grad_norm": 1.7412368059158325, + "learning_rate": 5e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7022294402122498, + "num_tokens": 178796855.0, + "step": 6911 + }, + { + "epoch": 0.7590599604656271, + "grad_norm": 1.692141056060791, + "learning_rate": 5e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6778305768966675, + "num_tokens": 178832025.0, + "step": 6912 + }, + { + "epoch": 0.7591697781682407, + "grad_norm": 1.8753000497817993, + "learning_rate": 5e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.6999745965003967, + "num_tokens": 178857868.0, + "step": 6913 + }, + { + "epoch": 0.7592795958708544, + "grad_norm": 1.7975811958312988, + "learning_rate": 5e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6958857178688049, + "num_tokens": 178885871.0, + "step": 6914 + }, + { + "epoch": 0.759389413573468, + "grad_norm": 1.8684879541397095, + "learning_rate": 5e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6823018789291382, + "num_tokens": 178909391.0, + "step": 6915 + }, + { + "epoch": 0.7594992312760817, + "grad_norm": 1.8973933458328247, + "learning_rate": 5e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7050548195838928, + "num_tokens": 178932338.0, + "step": 6916 + }, + { + "epoch": 0.7596090489786954, + "grad_norm": 1.9685380458831787, + "learning_rate": 5e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7283725142478943, + "num_tokens": 178952560.0, + "step": 6917 + }, + { + "epoch": 0.7597188666813091, + "grad_norm": 1.8643012046813965, + "learning_rate": 5e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7288870811462402, + "num_tokens": 178975333.0, + "step": 6918 + }, + { + "epoch": 0.7598286843839227, + "grad_norm": 1.7163563966751099, + "learning_rate": 5e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6969692707061768, + "num_tokens": 179005016.0, + "step": 6919 + }, + { + "epoch": 0.7599385020865363, + "grad_norm": 1.725588083267212, + "learning_rate": 5e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6984469890594482, + "num_tokens": 179034513.0, + "step": 6920 + }, + { + "epoch": 0.76004831978915, + "grad_norm": 1.820184588432312, + "learning_rate": 5e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6954519152641296, + "num_tokens": 179058126.0, + "step": 6921 + }, + { + "epoch": 0.7601581374917636, + "grad_norm": 1.6930584907531738, + "learning_rate": 5e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7079644799232483, + "num_tokens": 179088646.0, + "step": 6922 + }, + { + "epoch": 0.7602679551943773, + "grad_norm": 1.676195740699768, + "learning_rate": 5e-06, + "loss": 0.971, + "mean_token_accuracy": 0.6988094449043274, + "num_tokens": 179120402.0, + "step": 6923 + }, + { + "epoch": 0.760377772896991, + "grad_norm": 1.7791314125061035, + "learning_rate": 5e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.707748532295227, + "num_tokens": 179145419.0, + "step": 6924 + }, + { + "epoch": 0.7604875905996047, + "grad_norm": 1.7488459348678589, + "learning_rate": 5e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6894749402999878, + "num_tokens": 179171713.0, + "step": 6925 + }, + { + "epoch": 0.7605974083022183, + "grad_norm": 1.8020011186599731, + "learning_rate": 5e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.6961820125579834, + "num_tokens": 179196909.0, + "step": 6926 + }, + { + "epoch": 0.760707226004832, + "grad_norm": 1.910679817199707, + "learning_rate": 5e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7202812433242798, + "num_tokens": 179219618.0, + "step": 6927 + }, + { + "epoch": 0.7608170437074456, + "grad_norm": 1.8373241424560547, + "learning_rate": 5e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6956135630607605, + "num_tokens": 179246477.0, + "step": 6928 + }, + { + "epoch": 0.7609268614100593, + "grad_norm": 1.7487891912460327, + "learning_rate": 5e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7135334014892578, + "num_tokens": 179273640.0, + "step": 6929 + }, + { + "epoch": 0.7610366791126729, + "grad_norm": 2.0231773853302, + "learning_rate": 5e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6944320201873779, + "num_tokens": 179296357.0, + "step": 6930 + }, + { + "epoch": 0.7611464968152867, + "grad_norm": 1.807225227355957, + "learning_rate": 5e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7148017883300781, + "num_tokens": 179321851.0, + "step": 6931 + }, + { + "epoch": 0.7612563145179003, + "grad_norm": 1.699636697769165, + "learning_rate": 5e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6964395046234131, + "num_tokens": 179348964.0, + "step": 6932 + }, + { + "epoch": 0.761366132220514, + "grad_norm": 2.093494415283203, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7354710698127747, + "num_tokens": 179367331.0, + "step": 6933 + }, + { + "epoch": 0.7614759499231276, + "grad_norm": 1.7249557971954346, + "learning_rate": 5e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7057749032974243, + "num_tokens": 179396316.0, + "step": 6934 + }, + { + "epoch": 0.7615857676257413, + "grad_norm": 1.9062778949737549, + "learning_rate": 5e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6934741735458374, + "num_tokens": 179420898.0, + "step": 6935 + }, + { + "epoch": 0.7616955853283549, + "grad_norm": 1.7433950901031494, + "learning_rate": 5e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6918593645095825, + "num_tokens": 179452029.0, + "step": 6936 + }, + { + "epoch": 0.7618054030309686, + "grad_norm": 1.6697795391082764, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7093950510025024, + "num_tokens": 179484253.0, + "step": 6937 + }, + { + "epoch": 0.7619152207335822, + "grad_norm": 1.7751935720443726, + "learning_rate": 5e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7229800820350647, + "num_tokens": 179508215.0, + "step": 6938 + }, + { + "epoch": 0.762025038436196, + "grad_norm": 1.762913703918457, + "learning_rate": 5e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.722858190536499, + "num_tokens": 179535081.0, + "step": 6939 + }, + { + "epoch": 0.7621348561388096, + "grad_norm": 1.7409865856170654, + "learning_rate": 5e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7303656339645386, + "num_tokens": 179560542.0, + "step": 6940 + }, + { + "epoch": 0.7622446738414232, + "grad_norm": 1.9544098377227783, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7243475914001465, + "num_tokens": 179581432.0, + "step": 6941 + }, + { + "epoch": 0.7623544915440369, + "grad_norm": 1.7888840436935425, + "learning_rate": 5e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6779509782791138, + "num_tokens": 179610077.0, + "step": 6942 + }, + { + "epoch": 0.7624643092466505, + "grad_norm": 1.8872857093811035, + "learning_rate": 5e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7178899645805359, + "num_tokens": 179634425.0, + "step": 6943 + }, + { + "epoch": 0.7625741269492642, + "grad_norm": 1.627234697341919, + "learning_rate": 5e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6884382367134094, + "num_tokens": 179666041.0, + "step": 6944 + }, + { + "epoch": 0.7626839446518778, + "grad_norm": 1.831008791923523, + "learning_rate": 5e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6864849328994751, + "num_tokens": 179693235.0, + "step": 6945 + }, + { + "epoch": 0.7627937623544916, + "grad_norm": 1.6745051145553589, + "learning_rate": 5e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.707115888595581, + "num_tokens": 179722641.0, + "step": 6946 + }, + { + "epoch": 0.7629035800571052, + "grad_norm": 1.7541096210479736, + "learning_rate": 5e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7138733863830566, + "num_tokens": 179749012.0, + "step": 6947 + }, + { + "epoch": 0.7630133977597189, + "grad_norm": 1.8192932605743408, + "learning_rate": 5e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6967719197273254, + "num_tokens": 179773985.0, + "step": 6948 + }, + { + "epoch": 0.7631232154623325, + "grad_norm": 1.6698415279388428, + "learning_rate": 5e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6857209801673889, + "num_tokens": 179803630.0, + "step": 6949 + }, + { + "epoch": 0.7632330331649462, + "grad_norm": 2.035921335220337, + "learning_rate": 5e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7360116243362427, + "num_tokens": 179822433.0, + "step": 6950 + }, + { + "epoch": 0.7633428508675598, + "grad_norm": 1.730828046798706, + "learning_rate": 5e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7116419076919556, + "num_tokens": 179852237.0, + "step": 6951 + }, + { + "epoch": 0.7634526685701735, + "grad_norm": 1.8338122367858887, + "learning_rate": 5e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.6999897956848145, + "num_tokens": 179878990.0, + "step": 6952 + }, + { + "epoch": 0.7635624862727872, + "grad_norm": 1.9475548267364502, + "learning_rate": 5e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7091954946517944, + "num_tokens": 179902225.0, + "step": 6953 + }, + { + "epoch": 0.7636723039754009, + "grad_norm": 1.8867326974868774, + "learning_rate": 5e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6866564750671387, + "num_tokens": 179928615.0, + "step": 6954 + }, + { + "epoch": 0.7637821216780145, + "grad_norm": 1.6213018894195557, + "learning_rate": 5e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7004131078720093, + "num_tokens": 179957340.0, + "step": 6955 + }, + { + "epoch": 0.7638919393806282, + "grad_norm": 1.8825241327285767, + "learning_rate": 5e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7111519575119019, + "num_tokens": 179980404.0, + "step": 6956 + }, + { + "epoch": 0.7640017570832418, + "grad_norm": 1.7793407440185547, + "learning_rate": 5e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7485625743865967, + "num_tokens": 180005096.0, + "step": 6957 + }, + { + "epoch": 0.7641115747858555, + "grad_norm": 1.6811435222625732, + "learning_rate": 5e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.70320725440979, + "num_tokens": 180035145.0, + "step": 6958 + }, + { + "epoch": 0.7642213924884691, + "grad_norm": 1.8798487186431885, + "learning_rate": 5e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7135938405990601, + "num_tokens": 180065678.0, + "step": 6959 + }, + { + "epoch": 0.7643312101910829, + "grad_norm": 1.7386103868484497, + "learning_rate": 5e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6997552514076233, + "num_tokens": 180092225.0, + "step": 6960 + }, + { + "epoch": 0.7644410278936965, + "grad_norm": 1.8032748699188232, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.719335675239563, + "num_tokens": 180117476.0, + "step": 6961 + }, + { + "epoch": 0.7645508455963101, + "grad_norm": 1.7378380298614502, + "learning_rate": 5e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7088360786437988, + "num_tokens": 180147562.0, + "step": 6962 + }, + { + "epoch": 0.7646606632989238, + "grad_norm": 1.8504202365875244, + "learning_rate": 5e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7196086645126343, + "num_tokens": 180173765.0, + "step": 6963 + }, + { + "epoch": 0.7647704810015374, + "grad_norm": 1.7580206394195557, + "learning_rate": 5e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7065871953964233, + "num_tokens": 180198311.0, + "step": 6964 + }, + { + "epoch": 0.7648802987041511, + "grad_norm": 1.8034846782684326, + "learning_rate": 5e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.715116024017334, + "num_tokens": 180222791.0, + "step": 6965 + }, + { + "epoch": 0.7649901164067647, + "grad_norm": 1.7492822408676147, + "learning_rate": 5e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7056400775909424, + "num_tokens": 180250273.0, + "step": 6966 + }, + { + "epoch": 0.7650999341093784, + "grad_norm": 1.8877251148223877, + "learning_rate": 5e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7290017604827881, + "num_tokens": 180273225.0, + "step": 6967 + }, + { + "epoch": 0.7652097518119921, + "grad_norm": 1.9261600971221924, + "learning_rate": 5e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6873071193695068, + "num_tokens": 180298907.0, + "step": 6968 + }, + { + "epoch": 0.7653195695146058, + "grad_norm": 1.825056552886963, + "learning_rate": 5e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7106629610061646, + "num_tokens": 180323070.0, + "step": 6969 + }, + { + "epoch": 0.7654293872172194, + "grad_norm": 2.0495450496673584, + "learning_rate": 5e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7109370231628418, + "num_tokens": 180344251.0, + "step": 6970 + }, + { + "epoch": 0.7655392049198331, + "grad_norm": 1.582323431968689, + "learning_rate": 5e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7027866840362549, + "num_tokens": 180376678.0, + "step": 6971 + }, + { + "epoch": 0.7656490226224467, + "grad_norm": 1.8716959953308105, + "learning_rate": 5e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7197885513305664, + "num_tokens": 180398859.0, + "step": 6972 + }, + { + "epoch": 0.7657588403250604, + "grad_norm": 1.9092811346054077, + "learning_rate": 5e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7200396060943604, + "num_tokens": 180421990.0, + "step": 6973 + }, + { + "epoch": 0.765868658027674, + "grad_norm": 1.7659597396850586, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7040789127349854, + "num_tokens": 180449092.0, + "step": 6974 + }, + { + "epoch": 0.7659784757302878, + "grad_norm": 1.8463068008422852, + "learning_rate": 5e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7075915336608887, + "num_tokens": 180474008.0, + "step": 6975 + }, + { + "epoch": 0.7660882934329014, + "grad_norm": 1.8006128072738647, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.733031153678894, + "num_tokens": 180498123.0, + "step": 6976 + }, + { + "epoch": 0.7661981111355151, + "grad_norm": 1.6669657230377197, + "learning_rate": 5e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6927472949028015, + "num_tokens": 180528712.0, + "step": 6977 + }, + { + "epoch": 0.7663079288381287, + "grad_norm": 1.959481120109558, + "learning_rate": 5e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7050995826721191, + "num_tokens": 180550529.0, + "step": 6978 + }, + { + "epoch": 0.7664177465407424, + "grad_norm": 1.7054030895233154, + "learning_rate": 5e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6875970959663391, + "num_tokens": 180580175.0, + "step": 6979 + }, + { + "epoch": 0.766527564243356, + "grad_norm": 1.9559630155563354, + "learning_rate": 5e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7034175395965576, + "num_tokens": 180602605.0, + "step": 6980 + }, + { + "epoch": 0.7666373819459696, + "grad_norm": 1.821377158164978, + "learning_rate": 5e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7243896722793579, + "num_tokens": 180625841.0, + "step": 6981 + }, + { + "epoch": 0.7667471996485834, + "grad_norm": 1.830994963645935, + "learning_rate": 5e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6987591981887817, + "num_tokens": 180650876.0, + "step": 6982 + }, + { + "epoch": 0.766857017351197, + "grad_norm": 1.7949278354644775, + "learning_rate": 5e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7021088600158691, + "num_tokens": 180674696.0, + "step": 6983 + }, + { + "epoch": 0.7669668350538107, + "grad_norm": 1.7193704843521118, + "learning_rate": 5e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7342519164085388, + "num_tokens": 180702440.0, + "step": 6984 + }, + { + "epoch": 0.7670766527564243, + "grad_norm": 1.6916061639785767, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7296888828277588, + "num_tokens": 180729131.0, + "step": 6985 + }, + { + "epoch": 0.767186470459038, + "grad_norm": 2.259012460708618, + "learning_rate": 5e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7258861660957336, + "num_tokens": 180746783.0, + "step": 6986 + }, + { + "epoch": 0.7672962881616516, + "grad_norm": 1.7208888530731201, + "learning_rate": 5e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7396108508110046, + "num_tokens": 180771432.0, + "step": 6987 + }, + { + "epoch": 0.7674061058642653, + "grad_norm": 1.7474966049194336, + "learning_rate": 5e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7035731673240662, + "num_tokens": 180800106.0, + "step": 6988 + }, + { + "epoch": 0.767515923566879, + "grad_norm": 1.6860722303390503, + "learning_rate": 5e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7062321901321411, + "num_tokens": 180829121.0, + "step": 6989 + }, + { + "epoch": 0.7676257412694927, + "grad_norm": 1.872825026512146, + "learning_rate": 5e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6839627027511597, + "num_tokens": 180852728.0, + "step": 6990 + }, + { + "epoch": 0.7677355589721063, + "grad_norm": 1.8281465768814087, + "learning_rate": 5e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7025331258773804, + "num_tokens": 180879707.0, + "step": 6991 + }, + { + "epoch": 0.76784537667472, + "grad_norm": 1.7746385335922241, + "learning_rate": 5e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7017759680747986, + "num_tokens": 180904735.0, + "step": 6992 + }, + { + "epoch": 0.7679551943773336, + "grad_norm": 1.7642239332199097, + "learning_rate": 5e-06, + "loss": 1.032, + "mean_token_accuracy": 0.7029509544372559, + "num_tokens": 180931028.0, + "step": 6993 + }, + { + "epoch": 0.7680650120799473, + "grad_norm": 1.8546565771102905, + "learning_rate": 5e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7014080286026001, + "num_tokens": 180956290.0, + "step": 6994 + }, + { + "epoch": 0.7681748297825609, + "grad_norm": 1.7074010372161865, + "learning_rate": 5e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7059358358383179, + "num_tokens": 180982139.0, + "step": 6995 + }, + { + "epoch": 0.7682846474851747, + "grad_norm": 1.667467713356018, + "learning_rate": 5e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.6963096857070923, + "num_tokens": 181011497.0, + "step": 6996 + }, + { + "epoch": 0.7683944651877883, + "grad_norm": 2.1061301231384277, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7218796014785767, + "num_tokens": 181030873.0, + "step": 6997 + }, + { + "epoch": 0.768504282890402, + "grad_norm": 1.836942434310913, + "learning_rate": 5e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7036638259887695, + "num_tokens": 181056276.0, + "step": 6998 + }, + { + "epoch": 0.7686141005930156, + "grad_norm": 1.790897250175476, + "learning_rate": 5e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7136279344558716, + "num_tokens": 181082287.0, + "step": 6999 + }, + { + "epoch": 0.7687239182956292, + "grad_norm": 1.6786078214645386, + "learning_rate": 5e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.742289662361145, + "num_tokens": 181107859.0, + "step": 7000 + }, + { + "epoch": 0.7688337359982429, + "grad_norm": 2.036724805831909, + "learning_rate": 5e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7236140966415405, + "num_tokens": 181129680.0, + "step": 7001 + }, + { + "epoch": 0.7689435537008565, + "grad_norm": 1.9143239259719849, + "learning_rate": 5e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7036067247390747, + "num_tokens": 181155015.0, + "step": 7002 + }, + { + "epoch": 0.7690533714034702, + "grad_norm": 1.8990321159362793, + "learning_rate": 5e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7007474303245544, + "num_tokens": 181178997.0, + "step": 7003 + }, + { + "epoch": 0.7691631891060839, + "grad_norm": 1.9728718996047974, + "learning_rate": 5e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7058494091033936, + "num_tokens": 181200083.0, + "step": 7004 + }, + { + "epoch": 0.7692730068086976, + "grad_norm": 1.814975380897522, + "learning_rate": 5e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7099822163581848, + "num_tokens": 181226738.0, + "step": 7005 + }, + { + "epoch": 0.7693828245113112, + "grad_norm": 1.7138612270355225, + "learning_rate": 5e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7158241271972656, + "num_tokens": 181252737.0, + "step": 7006 + }, + { + "epoch": 0.7694926422139249, + "grad_norm": 2.0423197746276855, + "learning_rate": 5e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7085607051849365, + "num_tokens": 181272995.0, + "step": 7007 + }, + { + "epoch": 0.7696024599165385, + "grad_norm": 1.7926220893859863, + "learning_rate": 5e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7042847275733948, + "num_tokens": 181297827.0, + "step": 7008 + }, + { + "epoch": 0.7697122776191522, + "grad_norm": 1.8408137559890747, + "learning_rate": 5e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7369824647903442, + "num_tokens": 181322076.0, + "step": 7009 + }, + { + "epoch": 0.7698220953217658, + "grad_norm": 1.6879949569702148, + "learning_rate": 5e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6953343749046326, + "num_tokens": 181351513.0, + "step": 7010 + }, + { + "epoch": 0.7699319130243796, + "grad_norm": 1.8994837999343872, + "learning_rate": 5e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6972848176956177, + "num_tokens": 181376388.0, + "step": 7011 + }, + { + "epoch": 0.7700417307269932, + "grad_norm": 1.7692162990570068, + "learning_rate": 5e-06, + "loss": 0.978, + "mean_token_accuracy": 0.6922339200973511, + "num_tokens": 181402524.0, + "step": 7012 + }, + { + "epoch": 0.7701515484296069, + "grad_norm": 1.8743022680282593, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7198147773742676, + "num_tokens": 181425358.0, + "step": 7013 + }, + { + "epoch": 0.7702613661322205, + "grad_norm": 1.8695623874664307, + "learning_rate": 5e-06, + "loss": 1.0796, + "mean_token_accuracy": 0.6717137098312378, + "num_tokens": 181452516.0, + "step": 7014 + }, + { + "epoch": 0.7703711838348342, + "grad_norm": 1.93948495388031, + "learning_rate": 5e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.722102165222168, + "num_tokens": 181475432.0, + "step": 7015 + }, + { + "epoch": 0.7704810015374478, + "grad_norm": 1.8674383163452148, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7024335861206055, + "num_tokens": 181502364.0, + "step": 7016 + }, + { + "epoch": 0.7705908192400615, + "grad_norm": 1.6137547492980957, + "learning_rate": 5e-06, + "loss": 0.992, + "mean_token_accuracy": 0.695732593536377, + "num_tokens": 181531461.0, + "step": 7017 + }, + { + "epoch": 0.7707006369426752, + "grad_norm": 1.7068604230880737, + "learning_rate": 5e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7133564949035645, + "num_tokens": 181558428.0, + "step": 7018 + }, + { + "epoch": 0.7708104546452889, + "grad_norm": 1.7687370777130127, + "learning_rate": 5e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.6992225646972656, + "num_tokens": 181585447.0, + "step": 7019 + }, + { + "epoch": 0.7709202723479025, + "grad_norm": 1.8053230047225952, + "learning_rate": 5e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7174255847930908, + "num_tokens": 181610753.0, + "step": 7020 + }, + { + "epoch": 0.7710300900505161, + "grad_norm": 1.6668052673339844, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7280721068382263, + "num_tokens": 181637039.0, + "step": 7021 + }, + { + "epoch": 0.7711399077531298, + "grad_norm": 1.8891853094100952, + "learning_rate": 5e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6984712481498718, + "num_tokens": 181660619.0, + "step": 7022 + }, + { + "epoch": 0.7712497254557434, + "grad_norm": 1.7303211688995361, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7154198884963989, + "num_tokens": 181688737.0, + "step": 7023 + }, + { + "epoch": 0.7713595431583571, + "grad_norm": 1.6161173582077026, + "learning_rate": 5e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6826527118682861, + "num_tokens": 181722096.0, + "step": 7024 + }, + { + "epoch": 0.7714693608609708, + "grad_norm": 1.8211477994918823, + "learning_rate": 5e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7110394835472107, + "num_tokens": 181748940.0, + "step": 7025 + }, + { + "epoch": 0.7715791785635845, + "grad_norm": 2.0376124382019043, + "learning_rate": 5e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7149280905723572, + "num_tokens": 181769377.0, + "step": 7026 + }, + { + "epoch": 0.7716889962661981, + "grad_norm": 1.891018271446228, + "learning_rate": 5e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7213206887245178, + "num_tokens": 181792879.0, + "step": 7027 + }, + { + "epoch": 0.7717988139688118, + "grad_norm": 1.8622647523880005, + "learning_rate": 5e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7212539911270142, + "num_tokens": 181817101.0, + "step": 7028 + }, + { + "epoch": 0.7719086316714254, + "grad_norm": 2.01239013671875, + "learning_rate": 5e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7101242542266846, + "num_tokens": 181840096.0, + "step": 7029 + }, + { + "epoch": 0.7720184493740391, + "grad_norm": 1.7423356771469116, + "learning_rate": 5e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.704026460647583, + "num_tokens": 181868266.0, + "step": 7030 + }, + { + "epoch": 0.7721282670766527, + "grad_norm": 1.7229200601577759, + "learning_rate": 5e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7092471122741699, + "num_tokens": 181896225.0, + "step": 7031 + }, + { + "epoch": 0.7722380847792664, + "grad_norm": 1.9113560914993286, + "learning_rate": 5e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7137701511383057, + "num_tokens": 181920681.0, + "step": 7032 + }, + { + "epoch": 0.7723479024818801, + "grad_norm": 1.9302736520767212, + "learning_rate": 5e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7291638851165771, + "num_tokens": 181942515.0, + "step": 7033 + }, + { + "epoch": 0.7724577201844938, + "grad_norm": 1.8210527896881104, + "learning_rate": 5e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7193516492843628, + "num_tokens": 181966502.0, + "step": 7034 + }, + { + "epoch": 0.7725675378871074, + "grad_norm": 1.747807264328003, + "learning_rate": 5e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6993879079818726, + "num_tokens": 181994520.0, + "step": 7035 + }, + { + "epoch": 0.7726773555897211, + "grad_norm": 1.7712372541427612, + "learning_rate": 5e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7147573232650757, + "num_tokens": 182018897.0, + "step": 7036 + }, + { + "epoch": 0.7727871732923347, + "grad_norm": 1.7089178562164307, + "learning_rate": 5e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6924914121627808, + "num_tokens": 182047654.0, + "step": 7037 + }, + { + "epoch": 0.7728969909949484, + "grad_norm": 1.9279508590698242, + "learning_rate": 5e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6899269223213196, + "num_tokens": 182071467.0, + "step": 7038 + }, + { + "epoch": 0.773006808697562, + "grad_norm": 1.7464284896850586, + "learning_rate": 5e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7013236284255981, + "num_tokens": 182098347.0, + "step": 7039 + }, + { + "epoch": 0.7731166264001758, + "grad_norm": 1.7685455083847046, + "learning_rate": 5e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7152018547058105, + "num_tokens": 182124515.0, + "step": 7040 + }, + { + "epoch": 0.7732264441027894, + "grad_norm": 1.7634844779968262, + "learning_rate": 5e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7086086273193359, + "num_tokens": 182150633.0, + "step": 7041 + }, + { + "epoch": 0.773336261805403, + "grad_norm": 1.7589502334594727, + "learning_rate": 5e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7106459140777588, + "num_tokens": 182176003.0, + "step": 7042 + }, + { + "epoch": 0.7734460795080167, + "grad_norm": 1.788047194480896, + "learning_rate": 5e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6941285133361816, + "num_tokens": 182201889.0, + "step": 7043 + }, + { + "epoch": 0.7735558972106303, + "grad_norm": 1.895323395729065, + "learning_rate": 5e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7056360244750977, + "num_tokens": 182226329.0, + "step": 7044 + }, + { + "epoch": 0.773665714913244, + "grad_norm": 1.8888946771621704, + "learning_rate": 5e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6987311840057373, + "num_tokens": 182251635.0, + "step": 7045 + }, + { + "epoch": 0.7737755326158576, + "grad_norm": 1.5515542030334473, + "learning_rate": 5e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7015988230705261, + "num_tokens": 182285745.0, + "step": 7046 + }, + { + "epoch": 0.7738853503184714, + "grad_norm": 1.6736351251602173, + "learning_rate": 5e-06, + "loss": 1.074, + "mean_token_accuracy": 0.6790260076522827, + "num_tokens": 182315862.0, + "step": 7047 + }, + { + "epoch": 0.773995168021085, + "grad_norm": 1.724236249923706, + "learning_rate": 5e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7126172184944153, + "num_tokens": 182341685.0, + "step": 7048 + }, + { + "epoch": 0.7741049857236987, + "grad_norm": 1.858629584312439, + "learning_rate": 5e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6910102367401123, + "num_tokens": 182366945.0, + "step": 7049 + }, + { + "epoch": 0.7742148034263123, + "grad_norm": 1.8295096158981323, + "learning_rate": 5e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6997934579849243, + "num_tokens": 182392562.0, + "step": 7050 + }, + { + "epoch": 0.774324621128926, + "grad_norm": 1.826696515083313, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.718287467956543, + "num_tokens": 182419698.0, + "step": 7051 + }, + { + "epoch": 0.7744344388315396, + "grad_norm": 1.6019691228866577, + "learning_rate": 5e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7140745520591736, + "num_tokens": 182450319.0, + "step": 7052 + }, + { + "epoch": 0.7745442565341533, + "grad_norm": 2.2015795707702637, + "learning_rate": 5e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7091612219810486, + "num_tokens": 182469403.0, + "step": 7053 + }, + { + "epoch": 0.774654074236767, + "grad_norm": 1.7138805389404297, + "learning_rate": 5e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.688785970211029, + "num_tokens": 182497872.0, + "step": 7054 + }, + { + "epoch": 0.7747638919393807, + "grad_norm": 1.9226405620574951, + "learning_rate": 5e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7101413607597351, + "num_tokens": 182520264.0, + "step": 7055 + }, + { + "epoch": 0.7748737096419943, + "grad_norm": 1.867989182472229, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7236814498901367, + "num_tokens": 182542265.0, + "step": 7056 + }, + { + "epoch": 0.774983527344608, + "grad_norm": 1.9834376573562622, + "learning_rate": 5e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7096904516220093, + "num_tokens": 182564840.0, + "step": 7057 + }, + { + "epoch": 0.7750933450472216, + "grad_norm": 1.738811731338501, + "learning_rate": 5e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.6976490020751953, + "num_tokens": 182592846.0, + "step": 7058 + }, + { + "epoch": 0.7752031627498353, + "grad_norm": 1.7498570680618286, + "learning_rate": 5e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7200512290000916, + "num_tokens": 182618696.0, + "step": 7059 + }, + { + "epoch": 0.7753129804524489, + "grad_norm": 1.9019548892974854, + "learning_rate": 5e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.718003511428833, + "num_tokens": 182640753.0, + "step": 7060 + }, + { + "epoch": 0.7754227981550625, + "grad_norm": 1.913577675819397, + "learning_rate": 5e-06, + "loss": 1.025, + "mean_token_accuracy": 0.7057541012763977, + "num_tokens": 182665125.0, + "step": 7061 + }, + { + "epoch": 0.7755326158576763, + "grad_norm": 1.725426197052002, + "learning_rate": 5e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7031102776527405, + "num_tokens": 182694839.0, + "step": 7062 + }, + { + "epoch": 0.77564243356029, + "grad_norm": 1.631765604019165, + "learning_rate": 5e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6973472237586975, + "num_tokens": 182724258.0, + "step": 7063 + }, + { + "epoch": 0.7757522512629036, + "grad_norm": 1.6869573593139648, + "learning_rate": 5e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6990329623222351, + "num_tokens": 182752554.0, + "step": 7064 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 2.037043809890747, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7111577987670898, + "num_tokens": 182772571.0, + "step": 7065 + }, + { + "epoch": 0.7759718866681309, + "grad_norm": 2.066619634628296, + "learning_rate": 5e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7126361131668091, + "num_tokens": 182793416.0, + "step": 7066 + }, + { + "epoch": 0.7760817043707445, + "grad_norm": 1.9629855155944824, + "learning_rate": 5e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7015199661254883, + "num_tokens": 182818235.0, + "step": 7067 + }, + { + "epoch": 0.7761915220733582, + "grad_norm": 2.0518879890441895, + "learning_rate": 5e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7015063166618347, + "num_tokens": 182841986.0, + "step": 7068 + }, + { + "epoch": 0.7763013397759719, + "grad_norm": 1.7666772603988647, + "learning_rate": 5e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7203474044799805, + "num_tokens": 182867693.0, + "step": 7069 + }, + { + "epoch": 0.7764111574785856, + "grad_norm": 1.923389196395874, + "learning_rate": 5e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7246154546737671, + "num_tokens": 182890350.0, + "step": 7070 + }, + { + "epoch": 0.7765209751811992, + "grad_norm": 1.6579622030258179, + "learning_rate": 5e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7270690202713013, + "num_tokens": 182916105.0, + "step": 7071 + }, + { + "epoch": 0.7766307928838129, + "grad_norm": 1.869675636291504, + "learning_rate": 5e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6967565417289734, + "num_tokens": 182942368.0, + "step": 7072 + }, + { + "epoch": 0.7767406105864265, + "grad_norm": 1.8184823989868164, + "learning_rate": 5e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.6970263123512268, + "num_tokens": 182970206.0, + "step": 7073 + }, + { + "epoch": 0.7768504282890402, + "grad_norm": 1.8047431707382202, + "learning_rate": 5e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6810114979743958, + "num_tokens": 182999682.0, + "step": 7074 + }, + { + "epoch": 0.7769602459916538, + "grad_norm": 1.7999427318572998, + "learning_rate": 5e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7078759670257568, + "num_tokens": 183023973.0, + "step": 7075 + }, + { + "epoch": 0.7770700636942676, + "grad_norm": 2.0895838737487793, + "learning_rate": 5e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7234514951705933, + "num_tokens": 183042388.0, + "step": 7076 + }, + { + "epoch": 0.7771798813968812, + "grad_norm": 2.004669427871704, + "learning_rate": 5e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6922675967216492, + "num_tokens": 183065253.0, + "step": 7077 + }, + { + "epoch": 0.7772896990994949, + "grad_norm": 1.8431850671768188, + "learning_rate": 5e-06, + "loss": 0.875, + "mean_token_accuracy": 0.724474310874939, + "num_tokens": 183089269.0, + "step": 7078 + }, + { + "epoch": 0.7773995168021085, + "grad_norm": 1.6355512142181396, + "learning_rate": 5e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7024874687194824, + "num_tokens": 183119398.0, + "step": 7079 + }, + { + "epoch": 0.7775093345047221, + "grad_norm": 1.830830693244934, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7355824708938599, + "num_tokens": 183140064.0, + "step": 7080 + }, + { + "epoch": 0.7776191522073358, + "grad_norm": 1.8982621431350708, + "learning_rate": 5e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6868451833724976, + "num_tokens": 183165038.0, + "step": 7081 + }, + { + "epoch": 0.7777289699099494, + "grad_norm": 1.585839033126831, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7200969457626343, + "num_tokens": 183196352.0, + "step": 7082 + }, + { + "epoch": 0.7778387876125632, + "grad_norm": 1.882746934890747, + "learning_rate": 5e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6779384613037109, + "num_tokens": 183220592.0, + "step": 7083 + }, + { + "epoch": 0.7779486053151768, + "grad_norm": 1.8905658721923828, + "learning_rate": 5e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7245807647705078, + "num_tokens": 183245132.0, + "step": 7084 + }, + { + "epoch": 0.7780584230177905, + "grad_norm": 1.6162313222885132, + "learning_rate": 5e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7016964554786682, + "num_tokens": 183273746.0, + "step": 7085 + }, + { + "epoch": 0.7781682407204041, + "grad_norm": 1.863259196281433, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7230266332626343, + "num_tokens": 183296072.0, + "step": 7086 + }, + { + "epoch": 0.7782780584230178, + "grad_norm": 1.6942217350006104, + "learning_rate": 5e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7151806354522705, + "num_tokens": 183322964.0, + "step": 7087 + }, + { + "epoch": 0.7783878761256314, + "grad_norm": 1.792465090751648, + "learning_rate": 5e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6909937858581543, + "num_tokens": 183350652.0, + "step": 7088 + }, + { + "epoch": 0.7784976938282451, + "grad_norm": 1.8825408220291138, + "learning_rate": 5e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.698864221572876, + "num_tokens": 183375943.0, + "step": 7089 + }, + { + "epoch": 0.7786075115308587, + "grad_norm": 1.7138450145721436, + "learning_rate": 5e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7090674638748169, + "num_tokens": 183403003.0, + "step": 7090 + }, + { + "epoch": 0.7787173292334725, + "grad_norm": 1.8237518072128296, + "learning_rate": 5e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.707717776298523, + "num_tokens": 183428526.0, + "step": 7091 + }, + { + "epoch": 0.7788271469360861, + "grad_norm": 1.6816550493240356, + "learning_rate": 5e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7012171745300293, + "num_tokens": 183457557.0, + "step": 7092 + }, + { + "epoch": 0.7789369646386998, + "grad_norm": 1.9558212757110596, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7318772077560425, + "num_tokens": 183479955.0, + "step": 7093 + }, + { + "epoch": 0.7790467823413134, + "grad_norm": 1.8338267803192139, + "learning_rate": 5e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7059987783432007, + "num_tokens": 183504578.0, + "step": 7094 + }, + { + "epoch": 0.7791566000439271, + "grad_norm": 1.827938199043274, + "learning_rate": 5e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6913554668426514, + "num_tokens": 183528760.0, + "step": 7095 + }, + { + "epoch": 0.7792664177465407, + "grad_norm": 1.794751524925232, + "learning_rate": 5e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6913163661956787, + "num_tokens": 183555669.0, + "step": 7096 + }, + { + "epoch": 0.7793762354491544, + "grad_norm": 1.6391619443893433, + "learning_rate": 5e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6851258277893066, + "num_tokens": 183588454.0, + "step": 7097 + }, + { + "epoch": 0.7794860531517681, + "grad_norm": 1.72970449924469, + "learning_rate": 5e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7023298740386963, + "num_tokens": 183617540.0, + "step": 7098 + }, + { + "epoch": 0.7795958708543818, + "grad_norm": 1.5369232892990112, + "learning_rate": 5e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7131742238998413, + "num_tokens": 183650719.0, + "step": 7099 + }, + { + "epoch": 0.7797056885569954, + "grad_norm": 2.0374038219451904, + "learning_rate": 5e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7028316855430603, + "num_tokens": 183670631.0, + "step": 7100 + }, + { + "epoch": 0.779815506259609, + "grad_norm": 1.533882737159729, + "learning_rate": 5e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7220712304115295, + "num_tokens": 183704487.0, + "step": 7101 + }, + { + "epoch": 0.7799253239622227, + "grad_norm": 1.7117927074432373, + "learning_rate": 5e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6957258582115173, + "num_tokens": 183733016.0, + "step": 7102 + }, + { + "epoch": 0.7800351416648363, + "grad_norm": 1.8363347053527832, + "learning_rate": 5e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.725036084651947, + "num_tokens": 183756712.0, + "step": 7103 + }, + { + "epoch": 0.78014495936745, + "grad_norm": 1.6706818342208862, + "learning_rate": 5e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6962410807609558, + "num_tokens": 183786290.0, + "step": 7104 + }, + { + "epoch": 0.7802547770700637, + "grad_norm": 1.706180214881897, + "learning_rate": 5e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7125371098518372, + "num_tokens": 183814488.0, + "step": 7105 + }, + { + "epoch": 0.7803645947726774, + "grad_norm": 1.918223261833191, + "learning_rate": 5e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7018195390701294, + "num_tokens": 183836445.0, + "step": 7106 + }, + { + "epoch": 0.780474412475291, + "grad_norm": 2.04272198677063, + "learning_rate": 5e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7200341820716858, + "num_tokens": 183856452.0, + "step": 7107 + }, + { + "epoch": 0.7805842301779047, + "grad_norm": 1.81462824344635, + "learning_rate": 5e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7253510355949402, + "num_tokens": 183878933.0, + "step": 7108 + }, + { + "epoch": 0.7806940478805183, + "grad_norm": 1.718352198600769, + "learning_rate": 5e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7186155319213867, + "num_tokens": 183907109.0, + "step": 7109 + }, + { + "epoch": 0.780803865583132, + "grad_norm": 1.9610382318496704, + "learning_rate": 5e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7196204662322998, + "num_tokens": 183927847.0, + "step": 7110 + }, + { + "epoch": 0.7809136832857456, + "grad_norm": 1.7230688333511353, + "learning_rate": 5e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6885345578193665, + "num_tokens": 183957092.0, + "step": 7111 + }, + { + "epoch": 0.7810235009883594, + "grad_norm": 2.203886032104492, + "learning_rate": 5e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.703713059425354, + "num_tokens": 183975297.0, + "step": 7112 + }, + { + "epoch": 0.781133318690973, + "grad_norm": 1.9283268451690674, + "learning_rate": 5e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.711571991443634, + "num_tokens": 183996485.0, + "step": 7113 + }, + { + "epoch": 0.7812431363935867, + "grad_norm": 1.6562328338623047, + "learning_rate": 5e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6981734037399292, + "num_tokens": 184027251.0, + "step": 7114 + }, + { + "epoch": 0.7813529540962003, + "grad_norm": 1.7755365371704102, + "learning_rate": 5e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7095382213592529, + "num_tokens": 184055291.0, + "step": 7115 + }, + { + "epoch": 0.781462771798814, + "grad_norm": 1.8490902185440063, + "learning_rate": 5e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7014440298080444, + "num_tokens": 184080611.0, + "step": 7116 + }, + { + "epoch": 0.7815725895014276, + "grad_norm": 1.8015590906143188, + "learning_rate": 5e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7054435014724731, + "num_tokens": 184107017.0, + "step": 7117 + }, + { + "epoch": 0.7816824072040413, + "grad_norm": 1.5639748573303223, + "learning_rate": 5e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6910138130187988, + "num_tokens": 184140612.0, + "step": 7118 + }, + { + "epoch": 0.7817922249066549, + "grad_norm": 1.6718941926956177, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7106406688690186, + "num_tokens": 184168531.0, + "step": 7119 + }, + { + "epoch": 0.7819020426092687, + "grad_norm": 1.80953049659729, + "learning_rate": 5e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7185574173927307, + "num_tokens": 184191985.0, + "step": 7120 + }, + { + "epoch": 0.7820118603118823, + "grad_norm": 1.7432070970535278, + "learning_rate": 5e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7018587589263916, + "num_tokens": 184217894.0, + "step": 7121 + }, + { + "epoch": 0.782121678014496, + "grad_norm": 1.921492099761963, + "learning_rate": 5e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7085806131362915, + "num_tokens": 184239193.0, + "step": 7122 + }, + { + "epoch": 0.7822314957171096, + "grad_norm": 1.8699915409088135, + "learning_rate": 5e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7030575275421143, + "num_tokens": 184264336.0, + "step": 7123 + }, + { + "epoch": 0.7823413134197232, + "grad_norm": 1.8260828256607056, + "learning_rate": 5e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6988279819488525, + "num_tokens": 184290313.0, + "step": 7124 + }, + { + "epoch": 0.7824511311223369, + "grad_norm": 1.5212147235870361, + "learning_rate": 5e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7104552984237671, + "num_tokens": 184323641.0, + "step": 7125 + }, + { + "epoch": 0.7825609488249505, + "grad_norm": 1.8220851421356201, + "learning_rate": 5e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6934385299682617, + "num_tokens": 184349383.0, + "step": 7126 + }, + { + "epoch": 0.7826707665275643, + "grad_norm": 1.7396384477615356, + "learning_rate": 5e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6906895637512207, + "num_tokens": 184377800.0, + "step": 7127 + }, + { + "epoch": 0.7827805842301779, + "grad_norm": 1.8283805847167969, + "learning_rate": 5e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6996071338653564, + "num_tokens": 184402518.0, + "step": 7128 + }, + { + "epoch": 0.7828904019327916, + "grad_norm": 1.6821589469909668, + "learning_rate": 5e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7096381783485413, + "num_tokens": 184430953.0, + "step": 7129 + }, + { + "epoch": 0.7830002196354052, + "grad_norm": 1.7368754148483276, + "learning_rate": 5e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6926833987236023, + "num_tokens": 184458541.0, + "step": 7130 + }, + { + "epoch": 0.7831100373380189, + "grad_norm": 1.6783719062805176, + "learning_rate": 5e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6914322376251221, + "num_tokens": 184489274.0, + "step": 7131 + }, + { + "epoch": 0.7832198550406325, + "grad_norm": 1.7103849649429321, + "learning_rate": 5e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7144855260848999, + "num_tokens": 184516817.0, + "step": 7132 + }, + { + "epoch": 0.7833296727432462, + "grad_norm": 2.08293080329895, + "learning_rate": 5e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6953686475753784, + "num_tokens": 184537425.0, + "step": 7133 + }, + { + "epoch": 0.7834394904458599, + "grad_norm": 1.749735951423645, + "learning_rate": 5e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7151034474372864, + "num_tokens": 184562914.0, + "step": 7134 + }, + { + "epoch": 0.7835493081484736, + "grad_norm": 1.6923328638076782, + "learning_rate": 5e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.719876766204834, + "num_tokens": 184590485.0, + "step": 7135 + }, + { + "epoch": 0.7836591258510872, + "grad_norm": 1.7102484703063965, + "learning_rate": 5e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7151251435279846, + "num_tokens": 184617087.0, + "step": 7136 + }, + { + "epoch": 0.7837689435537009, + "grad_norm": 1.600221037864685, + "learning_rate": 5e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6974542140960693, + "num_tokens": 184648480.0, + "step": 7137 + }, + { + "epoch": 0.7838787612563145, + "grad_norm": 1.8659601211547852, + "learning_rate": 5e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7207140326499939, + "num_tokens": 184671386.0, + "step": 7138 + }, + { + "epoch": 0.7839885789589282, + "grad_norm": 1.8013986349105835, + "learning_rate": 5e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6866674423217773, + "num_tokens": 184699047.0, + "step": 7139 + }, + { + "epoch": 0.7840983966615418, + "grad_norm": 1.754757046699524, + "learning_rate": 5e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7098407745361328, + "num_tokens": 184726186.0, + "step": 7140 + }, + { + "epoch": 0.7842082143641556, + "grad_norm": 1.7638084888458252, + "learning_rate": 5e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7157352566719055, + "num_tokens": 184751333.0, + "step": 7141 + }, + { + "epoch": 0.7843180320667692, + "grad_norm": 1.6460485458374023, + "learning_rate": 5e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.6992286443710327, + "num_tokens": 184781657.0, + "step": 7142 + }, + { + "epoch": 0.7844278497693828, + "grad_norm": 1.566340446472168, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7368038892745972, + "num_tokens": 184810008.0, + "step": 7143 + }, + { + "epoch": 0.7845376674719965, + "grad_norm": 2.0663509368896484, + "learning_rate": 5e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7115235328674316, + "num_tokens": 184832810.0, + "step": 7144 + }, + { + "epoch": 0.7846474851746101, + "grad_norm": 1.5887919664382935, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7101078033447266, + "num_tokens": 184864667.0, + "step": 7145 + }, + { + "epoch": 0.7847573028772238, + "grad_norm": 1.6605178117752075, + "learning_rate": 5e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7221596837043762, + "num_tokens": 184892997.0, + "step": 7146 + }, + { + "epoch": 0.7848671205798374, + "grad_norm": 1.826104998588562, + "learning_rate": 5e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7137830853462219, + "num_tokens": 184918643.0, + "step": 7147 + }, + { + "epoch": 0.7849769382824512, + "grad_norm": 1.831010103225708, + "learning_rate": 5e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7019121646881104, + "num_tokens": 184942813.0, + "step": 7148 + }, + { + "epoch": 0.7850867559850648, + "grad_norm": 1.660406231880188, + "learning_rate": 5e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7010664939880371, + "num_tokens": 184971097.0, + "step": 7149 + }, + { + "epoch": 0.7851965736876785, + "grad_norm": 1.909079909324646, + "learning_rate": 5e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7075436115264893, + "num_tokens": 184993808.0, + "step": 7150 + }, + { + "epoch": 0.7853063913902921, + "grad_norm": 1.9739410877227783, + "learning_rate": 5e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6931922435760498, + "num_tokens": 185019571.0, + "step": 7151 + }, + { + "epoch": 0.7854162090929058, + "grad_norm": 1.7634201049804688, + "learning_rate": 5e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6932171583175659, + "num_tokens": 185046113.0, + "step": 7152 + }, + { + "epoch": 0.7855260267955194, + "grad_norm": 2.01438045501709, + "learning_rate": 5e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7000076770782471, + "num_tokens": 185066986.0, + "step": 7153 + }, + { + "epoch": 0.7856358444981331, + "grad_norm": 2.037837266921997, + "learning_rate": 5e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7222197651863098, + "num_tokens": 185086769.0, + "step": 7154 + }, + { + "epoch": 0.7857456622007467, + "grad_norm": 1.7326645851135254, + "learning_rate": 5e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7034191489219666, + "num_tokens": 185113199.0, + "step": 7155 + }, + { + "epoch": 0.7858554799033605, + "grad_norm": 1.8557294607162476, + "learning_rate": 5e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7176152467727661, + "num_tokens": 185136103.0, + "step": 7156 + }, + { + "epoch": 0.7859652976059741, + "grad_norm": 1.6280544996261597, + "learning_rate": 5e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6949678659439087, + "num_tokens": 185167000.0, + "step": 7157 + }, + { + "epoch": 0.7860751153085878, + "grad_norm": 1.9981529712677002, + "learning_rate": 5e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7229669690132141, + "num_tokens": 185187414.0, + "step": 7158 + }, + { + "epoch": 0.7861849330112014, + "grad_norm": 1.7751097679138184, + "learning_rate": 5e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7335872650146484, + "num_tokens": 185213610.0, + "step": 7159 + }, + { + "epoch": 0.786294750713815, + "grad_norm": 1.7595175504684448, + "learning_rate": 5e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7173547744750977, + "num_tokens": 185239693.0, + "step": 7160 + }, + { + "epoch": 0.7864045684164287, + "grad_norm": 1.8827557563781738, + "learning_rate": 5e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6932806372642517, + "num_tokens": 185265288.0, + "step": 7161 + }, + { + "epoch": 0.7865143861190423, + "grad_norm": 1.8281867504119873, + "learning_rate": 5e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7141952514648438, + "num_tokens": 185290253.0, + "step": 7162 + }, + { + "epoch": 0.7866242038216561, + "grad_norm": 2.1159136295318604, + "learning_rate": 5e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7135003805160522, + "num_tokens": 185315223.0, + "step": 7163 + }, + { + "epoch": 0.7867340215242697, + "grad_norm": 1.7167794704437256, + "learning_rate": 5e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7050125598907471, + "num_tokens": 185342803.0, + "step": 7164 + }, + { + "epoch": 0.7868438392268834, + "grad_norm": 1.7723597288131714, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.737680196762085, + "num_tokens": 185366868.0, + "step": 7165 + }, + { + "epoch": 0.786953656929497, + "grad_norm": 1.8874297142028809, + "learning_rate": 5e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7098227739334106, + "num_tokens": 185389552.0, + "step": 7166 + }, + { + "epoch": 0.7870634746321107, + "grad_norm": 1.6695927381515503, + "learning_rate": 5e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.729967474937439, + "num_tokens": 185416565.0, + "step": 7167 + }, + { + "epoch": 0.7871732923347243, + "grad_norm": 1.909454107284546, + "learning_rate": 5e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7015928030014038, + "num_tokens": 185440210.0, + "step": 7168 + }, + { + "epoch": 0.787283110037338, + "grad_norm": 1.683255910873413, + "learning_rate": 5e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7255417704582214, + "num_tokens": 185469118.0, + "step": 7169 + }, + { + "epoch": 0.7873929277399517, + "grad_norm": 1.7679837942123413, + "learning_rate": 5e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.6955719590187073, + "num_tokens": 185495365.0, + "step": 7170 + }, + { + "epoch": 0.7875027454425654, + "grad_norm": 1.8960418701171875, + "learning_rate": 5e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7174343466758728, + "num_tokens": 185517084.0, + "step": 7171 + }, + { + "epoch": 0.787612563145179, + "grad_norm": 1.6853652000427246, + "learning_rate": 5e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7108331322669983, + "num_tokens": 185546525.0, + "step": 7172 + }, + { + "epoch": 0.7877223808477927, + "grad_norm": 1.6292496919631958, + "learning_rate": 5e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7001858949661255, + "num_tokens": 185578082.0, + "step": 7173 + }, + { + "epoch": 0.7878321985504063, + "grad_norm": 1.7014312744140625, + "learning_rate": 5e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7043395042419434, + "num_tokens": 185608334.0, + "step": 7174 + }, + { + "epoch": 0.78794201625302, + "grad_norm": 2.045011520385742, + "learning_rate": 5e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7121042013168335, + "num_tokens": 185629731.0, + "step": 7175 + }, + { + "epoch": 0.7880518339556336, + "grad_norm": 2.1507112979888916, + "learning_rate": 5e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7160646319389343, + "num_tokens": 185648826.0, + "step": 7176 + }, + { + "epoch": 0.7881616516582474, + "grad_norm": 1.6016044616699219, + "learning_rate": 5e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.699342668056488, + "num_tokens": 185678010.0, + "step": 7177 + }, + { + "epoch": 0.788271469360861, + "grad_norm": 1.8379290103912354, + "learning_rate": 5e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6984524130821228, + "num_tokens": 185703840.0, + "step": 7178 + }, + { + "epoch": 0.7883812870634747, + "grad_norm": 1.835850477218628, + "learning_rate": 5e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7007695436477661, + "num_tokens": 185732375.0, + "step": 7179 + }, + { + "epoch": 0.7884911047660883, + "grad_norm": 1.7305241823196411, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7067936658859253, + "num_tokens": 185758764.0, + "step": 7180 + }, + { + "epoch": 0.788600922468702, + "grad_norm": 2.210430860519409, + "learning_rate": 5e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6886557340621948, + "num_tokens": 185786957.0, + "step": 7181 + }, + { + "epoch": 0.7887107401713156, + "grad_norm": 1.8076744079589844, + "learning_rate": 5e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6843186616897583, + "num_tokens": 185812529.0, + "step": 7182 + }, + { + "epoch": 0.7888205578739292, + "grad_norm": 1.8443098068237305, + "learning_rate": 5e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.688262403011322, + "num_tokens": 185838809.0, + "step": 7183 + }, + { + "epoch": 0.7889303755765429, + "grad_norm": 1.826969027519226, + "learning_rate": 5e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6890276670455933, + "num_tokens": 185866240.0, + "step": 7184 + }, + { + "epoch": 0.7890401932791566, + "grad_norm": 1.7713968753814697, + "learning_rate": 5e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7245416641235352, + "num_tokens": 185893185.0, + "step": 7185 + }, + { + "epoch": 0.7891500109817703, + "grad_norm": 1.6496095657348633, + "learning_rate": 5e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6786453723907471, + "num_tokens": 185925376.0, + "step": 7186 + }, + { + "epoch": 0.7892598286843839, + "grad_norm": 1.7547647953033447, + "learning_rate": 5e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6990870237350464, + "num_tokens": 185952200.0, + "step": 7187 + }, + { + "epoch": 0.7893696463869976, + "grad_norm": 2.0159246921539307, + "learning_rate": 5e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7240793704986572, + "num_tokens": 185972474.0, + "step": 7188 + }, + { + "epoch": 0.7894794640896112, + "grad_norm": 1.6577249765396118, + "learning_rate": 5e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7074549198150635, + "num_tokens": 186002702.0, + "step": 7189 + }, + { + "epoch": 0.7895892817922249, + "grad_norm": 1.9064441919326782, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7164144515991211, + "num_tokens": 186025016.0, + "step": 7190 + }, + { + "epoch": 0.7896990994948385, + "grad_norm": 1.7797306776046753, + "learning_rate": 5e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7466155290603638, + "num_tokens": 186047262.0, + "step": 7191 + }, + { + "epoch": 0.7898089171974523, + "grad_norm": 1.7249478101730347, + "learning_rate": 5e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7129256129264832, + "num_tokens": 186077218.0, + "step": 7192 + }, + { + "epoch": 0.7899187349000659, + "grad_norm": 2.163219928741455, + "learning_rate": 5e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.727336049079895, + "num_tokens": 186094890.0, + "step": 7193 + }, + { + "epoch": 0.7900285526026796, + "grad_norm": 1.7219460010528564, + "learning_rate": 5e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7335975170135498, + "num_tokens": 186120004.0, + "step": 7194 + }, + { + "epoch": 0.7901383703052932, + "grad_norm": 2.1350250244140625, + "learning_rate": 5e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7176953554153442, + "num_tokens": 186139605.0, + "step": 7195 + }, + { + "epoch": 0.7902481880079069, + "grad_norm": 1.8053960800170898, + "learning_rate": 5e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6958534121513367, + "num_tokens": 186164455.0, + "step": 7196 + }, + { + "epoch": 0.7903580057105205, + "grad_norm": 1.7253806591033936, + "learning_rate": 5e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6948311924934387, + "num_tokens": 186192368.0, + "step": 7197 + }, + { + "epoch": 0.7904678234131342, + "grad_norm": 1.8704428672790527, + "learning_rate": 5e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7178149223327637, + "num_tokens": 186215893.0, + "step": 7198 + }, + { + "epoch": 0.7905776411157479, + "grad_norm": 1.942968726158142, + "learning_rate": 5e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7048763036727905, + "num_tokens": 186239489.0, + "step": 7199 + }, + { + "epoch": 0.7906874588183616, + "grad_norm": 1.8057098388671875, + "learning_rate": 5e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7147451639175415, + "num_tokens": 186262559.0, + "step": 7200 + }, + { + "epoch": 0.7907972765209752, + "grad_norm": 1.8080484867095947, + "learning_rate": 5e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6874920129776001, + "num_tokens": 186289346.0, + "step": 7201 + }, + { + "epoch": 0.7909070942235888, + "grad_norm": 1.7733604907989502, + "learning_rate": 5e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6982932686805725, + "num_tokens": 186316946.0, + "step": 7202 + }, + { + "epoch": 0.7910169119262025, + "grad_norm": 1.9153679609298706, + "learning_rate": 5e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7105327844619751, + "num_tokens": 186340524.0, + "step": 7203 + }, + { + "epoch": 0.7911267296288161, + "grad_norm": 1.8330615758895874, + "learning_rate": 5e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7079219818115234, + "num_tokens": 186366460.0, + "step": 7204 + }, + { + "epoch": 0.7912365473314298, + "grad_norm": 1.7592544555664062, + "learning_rate": 5e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6891307830810547, + "num_tokens": 186396932.0, + "step": 7205 + }, + { + "epoch": 0.7913463650340435, + "grad_norm": 1.7528473138809204, + "learning_rate": 5e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.691368818283081, + "num_tokens": 186427160.0, + "step": 7206 + }, + { + "epoch": 0.7914561827366572, + "grad_norm": 1.9831576347351074, + "learning_rate": 5e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7225663661956787, + "num_tokens": 186448509.0, + "step": 7207 + }, + { + "epoch": 0.7915660004392708, + "grad_norm": 1.6723496913909912, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7165460586547852, + "num_tokens": 186478749.0, + "step": 7208 + }, + { + "epoch": 0.7916758181418845, + "grad_norm": 1.8259639739990234, + "learning_rate": 5e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7350319623947144, + "num_tokens": 186502148.0, + "step": 7209 + }, + { + "epoch": 0.7917856358444981, + "grad_norm": 2.1332597732543945, + "learning_rate": 5e-06, + "loss": 0.959, + "mean_token_accuracy": 0.70975661277771, + "num_tokens": 186521971.0, + "step": 7210 + }, + { + "epoch": 0.7918954535471118, + "grad_norm": 2.058619499206543, + "learning_rate": 5e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7097381353378296, + "num_tokens": 186542964.0, + "step": 7211 + }, + { + "epoch": 0.7920052712497254, + "grad_norm": 1.8358451128005981, + "learning_rate": 5e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6871435642242432, + "num_tokens": 186569477.0, + "step": 7212 + }, + { + "epoch": 0.7921150889523391, + "grad_norm": 1.5545991659164429, + "learning_rate": 5e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6856805682182312, + "num_tokens": 186605150.0, + "step": 7213 + }, + { + "epoch": 0.7922249066549528, + "grad_norm": 1.9284476041793823, + "learning_rate": 5e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.694841206073761, + "num_tokens": 186628236.0, + "step": 7214 + }, + { + "epoch": 0.7923347243575665, + "grad_norm": 1.9210352897644043, + "learning_rate": 5e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7040100693702698, + "num_tokens": 186651937.0, + "step": 7215 + }, + { + "epoch": 0.7924445420601801, + "grad_norm": 1.8320833444595337, + "learning_rate": 5e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7413724660873413, + "num_tokens": 186677017.0, + "step": 7216 + }, + { + "epoch": 0.7925543597627938, + "grad_norm": 1.8537427186965942, + "learning_rate": 5e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6883007287979126, + "num_tokens": 186702732.0, + "step": 7217 + }, + { + "epoch": 0.7926641774654074, + "grad_norm": 1.889634132385254, + "learning_rate": 5e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7147889733314514, + "num_tokens": 186726988.0, + "step": 7218 + }, + { + "epoch": 0.792773995168021, + "grad_norm": 1.8774994611740112, + "learning_rate": 5e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7003909349441528, + "num_tokens": 186752739.0, + "step": 7219 + }, + { + "epoch": 0.7928838128706347, + "grad_norm": 1.6203734874725342, + "learning_rate": 5e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6877775192260742, + "num_tokens": 186784383.0, + "step": 7220 + }, + { + "epoch": 0.7929936305732485, + "grad_norm": 1.7469311952590942, + "learning_rate": 5e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6990644335746765, + "num_tokens": 186812735.0, + "step": 7221 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 1.7620840072631836, + "learning_rate": 5e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6996810436248779, + "num_tokens": 186839223.0, + "step": 7222 + }, + { + "epoch": 0.7932132659784757, + "grad_norm": 1.7601964473724365, + "learning_rate": 5e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7170526385307312, + "num_tokens": 186865512.0, + "step": 7223 + }, + { + "epoch": 0.7933230836810894, + "grad_norm": 1.7296276092529297, + "learning_rate": 5e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6887748837471008, + "num_tokens": 186895588.0, + "step": 7224 + }, + { + "epoch": 0.793432901383703, + "grad_norm": 1.611606478691101, + "learning_rate": 5e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6893836259841919, + "num_tokens": 186926424.0, + "step": 7225 + }, + { + "epoch": 0.7935427190863167, + "grad_norm": 1.705552339553833, + "learning_rate": 5e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7357263565063477, + "num_tokens": 186953364.0, + "step": 7226 + }, + { + "epoch": 0.7936525367889303, + "grad_norm": 1.7038441896438599, + "learning_rate": 5e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7123962044715881, + "num_tokens": 186979582.0, + "step": 7227 + }, + { + "epoch": 0.7937623544915441, + "grad_norm": 1.8997491598129272, + "learning_rate": 5e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6919823884963989, + "num_tokens": 187003920.0, + "step": 7228 + }, + { + "epoch": 0.7938721721941577, + "grad_norm": 1.7708944082260132, + "learning_rate": 5e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.715054988861084, + "num_tokens": 187030369.0, + "step": 7229 + }, + { + "epoch": 0.7939819898967714, + "grad_norm": 1.763267993927002, + "learning_rate": 5e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.730373740196228, + "num_tokens": 187056415.0, + "step": 7230 + }, + { + "epoch": 0.794091807599385, + "grad_norm": 1.8896619081497192, + "learning_rate": 5e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7238248586654663, + "num_tokens": 187078329.0, + "step": 7231 + }, + { + "epoch": 0.7942016253019987, + "grad_norm": 1.7259877920150757, + "learning_rate": 5e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7051703333854675, + "num_tokens": 187108135.0, + "step": 7232 + }, + { + "epoch": 0.7943114430046123, + "grad_norm": 1.8170472383499146, + "learning_rate": 5e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7092794179916382, + "num_tokens": 187131863.0, + "step": 7233 + }, + { + "epoch": 0.794421260707226, + "grad_norm": 1.7394227981567383, + "learning_rate": 5e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.702777087688446, + "num_tokens": 187159055.0, + "step": 7234 + }, + { + "epoch": 0.7945310784098397, + "grad_norm": 1.8989098072052002, + "learning_rate": 5e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7217382192611694, + "num_tokens": 187181609.0, + "step": 7235 + }, + { + "epoch": 0.7946408961124534, + "grad_norm": 1.88768470287323, + "learning_rate": 5e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6877124309539795, + "num_tokens": 187206861.0, + "step": 7236 + }, + { + "epoch": 0.794750713815067, + "grad_norm": 1.90456223487854, + "learning_rate": 5e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7011196613311768, + "num_tokens": 187229514.0, + "step": 7237 + }, + { + "epoch": 0.7948605315176807, + "grad_norm": 1.7860723733901978, + "learning_rate": 5e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6966882944107056, + "num_tokens": 187260311.0, + "step": 7238 + }, + { + "epoch": 0.7949703492202943, + "grad_norm": 1.5533415079116821, + "learning_rate": 5e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6935237646102905, + "num_tokens": 187295439.0, + "step": 7239 + }, + { + "epoch": 0.795080166922908, + "grad_norm": 1.6556413173675537, + "learning_rate": 5e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7197589874267578, + "num_tokens": 187321991.0, + "step": 7240 + }, + { + "epoch": 0.7951899846255216, + "grad_norm": 1.5426504611968994, + "learning_rate": 5e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7155998945236206, + "num_tokens": 187355487.0, + "step": 7241 + }, + { + "epoch": 0.7952998023281352, + "grad_norm": 1.6798943281173706, + "learning_rate": 5e-06, + "loss": 0.962, + "mean_token_accuracy": 0.6989172697067261, + "num_tokens": 187382653.0, + "step": 7242 + }, + { + "epoch": 0.795409620030749, + "grad_norm": 1.9862034320831299, + "learning_rate": 5e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7224197387695312, + "num_tokens": 187404467.0, + "step": 7243 + }, + { + "epoch": 0.7955194377333626, + "grad_norm": 1.7774602174758911, + "learning_rate": 5e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6889818906784058, + "num_tokens": 187431536.0, + "step": 7244 + }, + { + "epoch": 0.7956292554359763, + "grad_norm": 2.018944263458252, + "learning_rate": 5e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7117280960083008, + "num_tokens": 187452868.0, + "step": 7245 + }, + { + "epoch": 0.7957390731385899, + "grad_norm": 1.692978024482727, + "learning_rate": 5e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7225571870803833, + "num_tokens": 187482752.0, + "step": 7246 + }, + { + "epoch": 0.7958488908412036, + "grad_norm": 1.8025996685028076, + "learning_rate": 5e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7132529020309448, + "num_tokens": 187507051.0, + "step": 7247 + }, + { + "epoch": 0.7959587085438172, + "grad_norm": 1.7805653810501099, + "learning_rate": 5e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7278573513031006, + "num_tokens": 187533913.0, + "step": 7248 + }, + { + "epoch": 0.7960685262464309, + "grad_norm": 1.7246780395507812, + "learning_rate": 5e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.6967312097549438, + "num_tokens": 187559884.0, + "step": 7249 + }, + { + "epoch": 0.7961783439490446, + "grad_norm": 1.6891765594482422, + "learning_rate": 5e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7014705538749695, + "num_tokens": 187588931.0, + "step": 7250 + }, + { + "epoch": 0.7962881616516583, + "grad_norm": 1.6823835372924805, + "learning_rate": 5e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7109710574150085, + "num_tokens": 187616560.0, + "step": 7251 + }, + { + "epoch": 0.7963979793542719, + "grad_norm": 1.8491897583007812, + "learning_rate": 5e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.6977176666259766, + "num_tokens": 187640983.0, + "step": 7252 + }, + { + "epoch": 0.7965077970568856, + "grad_norm": 1.8982832431793213, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7118597626686096, + "num_tokens": 187665278.0, + "step": 7253 + }, + { + "epoch": 0.7966176147594992, + "grad_norm": 1.7426918745040894, + "learning_rate": 5e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6831507682800293, + "num_tokens": 187695861.0, + "step": 7254 + }, + { + "epoch": 0.7967274324621129, + "grad_norm": 1.979543924331665, + "learning_rate": 5e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7087182998657227, + "num_tokens": 187719766.0, + "step": 7255 + }, + { + "epoch": 0.7968372501647265, + "grad_norm": 1.847591757774353, + "learning_rate": 5e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7107985019683838, + "num_tokens": 187743360.0, + "step": 7256 + }, + { + "epoch": 0.7969470678673403, + "grad_norm": 1.8131557703018188, + "learning_rate": 5e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6851340532302856, + "num_tokens": 187770700.0, + "step": 7257 + }, + { + "epoch": 0.7970568855699539, + "grad_norm": 1.725511908531189, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7301366329193115, + "num_tokens": 187799145.0, + "step": 7258 + }, + { + "epoch": 0.7971667032725676, + "grad_norm": 2.0121967792510986, + "learning_rate": 5e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6893460154533386, + "num_tokens": 187822542.0, + "step": 7259 + }, + { + "epoch": 0.7972765209751812, + "grad_norm": 1.7395758628845215, + "learning_rate": 5e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6912674903869629, + "num_tokens": 187850321.0, + "step": 7260 + }, + { + "epoch": 0.7973863386777948, + "grad_norm": 1.8285621404647827, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7098920345306396, + "num_tokens": 187871973.0, + "step": 7261 + }, + { + "epoch": 0.7974961563804085, + "grad_norm": 1.7994351387023926, + "learning_rate": 5e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6932090520858765, + "num_tokens": 187896467.0, + "step": 7262 + }, + { + "epoch": 0.7976059740830221, + "grad_norm": 1.9648807048797607, + "learning_rate": 5e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7317448258399963, + "num_tokens": 187915244.0, + "step": 7263 + }, + { + "epoch": 0.7977157917856359, + "grad_norm": 1.6248953342437744, + "learning_rate": 5e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.709671139717102, + "num_tokens": 187943777.0, + "step": 7264 + }, + { + "epoch": 0.7978256094882495, + "grad_norm": 1.7711631059646606, + "learning_rate": 5e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.706979513168335, + "num_tokens": 187971006.0, + "step": 7265 + }, + { + "epoch": 0.7979354271908632, + "grad_norm": 1.918999433517456, + "learning_rate": 5e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7142171859741211, + "num_tokens": 187993999.0, + "step": 7266 + }, + { + "epoch": 0.7980452448934768, + "grad_norm": 1.6595991849899292, + "learning_rate": 5e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6935073137283325, + "num_tokens": 188022530.0, + "step": 7267 + }, + { + "epoch": 0.7981550625960905, + "grad_norm": 1.7159467935562134, + "learning_rate": 5e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7237808704376221, + "num_tokens": 188049624.0, + "step": 7268 + }, + { + "epoch": 0.7982648802987041, + "grad_norm": 1.8057191371917725, + "learning_rate": 5e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7300109267234802, + "num_tokens": 188072901.0, + "step": 7269 + }, + { + "epoch": 0.7983746980013178, + "grad_norm": 1.8468098640441895, + "learning_rate": 5e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7184357643127441, + "num_tokens": 188096765.0, + "step": 7270 + }, + { + "epoch": 0.7984845157039314, + "grad_norm": 1.6118475198745728, + "learning_rate": 5e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7181233167648315, + "num_tokens": 188128128.0, + "step": 7271 + }, + { + "epoch": 0.7985943334065452, + "grad_norm": 1.9648696184158325, + "learning_rate": 5e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7148075699806213, + "num_tokens": 188150989.0, + "step": 7272 + }, + { + "epoch": 0.7987041511091588, + "grad_norm": 1.8270686864852905, + "learning_rate": 5e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7023705840110779, + "num_tokens": 188176868.0, + "step": 7273 + }, + { + "epoch": 0.7988139688117725, + "grad_norm": 1.7176780700683594, + "learning_rate": 5e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7130886316299438, + "num_tokens": 188204855.0, + "step": 7274 + }, + { + "epoch": 0.7989237865143861, + "grad_norm": 1.7527891397476196, + "learning_rate": 5e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7053328156471252, + "num_tokens": 188231710.0, + "step": 7275 + }, + { + "epoch": 0.7990336042169998, + "grad_norm": 1.7818416357040405, + "learning_rate": 5e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7132009267807007, + "num_tokens": 188255849.0, + "step": 7276 + }, + { + "epoch": 0.7991434219196134, + "grad_norm": 1.6108590364456177, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7037788033485413, + "num_tokens": 188288622.0, + "step": 7277 + }, + { + "epoch": 0.799253239622227, + "grad_norm": 1.9103466272354126, + "learning_rate": 5e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6975415945053101, + "num_tokens": 188312837.0, + "step": 7278 + }, + { + "epoch": 0.7993630573248408, + "grad_norm": 1.7558956146240234, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.695833683013916, + "num_tokens": 188340659.0, + "step": 7279 + }, + { + "epoch": 0.7994728750274545, + "grad_norm": 1.8409076929092407, + "learning_rate": 5e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7037050724029541, + "num_tokens": 188365013.0, + "step": 7280 + }, + { + "epoch": 0.7995826927300681, + "grad_norm": 1.6820166110992432, + "learning_rate": 5e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6854521036148071, + "num_tokens": 188394619.0, + "step": 7281 + }, + { + "epoch": 0.7996925104326817, + "grad_norm": 1.8020817041397095, + "learning_rate": 5e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7230876684188843, + "num_tokens": 188418398.0, + "step": 7282 + }, + { + "epoch": 0.7998023281352954, + "grad_norm": 1.885933756828308, + "learning_rate": 5e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7301676869392395, + "num_tokens": 188439918.0, + "step": 7283 + }, + { + "epoch": 0.799912145837909, + "grad_norm": 1.616840124130249, + "learning_rate": 5e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7201109528541565, + "num_tokens": 188471594.0, + "step": 7284 + }, + { + "epoch": 0.8000219635405227, + "grad_norm": 1.9537477493286133, + "learning_rate": 5e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7256778478622437, + "num_tokens": 188492738.0, + "step": 7285 + }, + { + "epoch": 0.8001317812431364, + "grad_norm": 1.6488001346588135, + "learning_rate": 5e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7056561708450317, + "num_tokens": 188522357.0, + "step": 7286 + }, + { + "epoch": 0.8002415989457501, + "grad_norm": 1.8752429485321045, + "learning_rate": 5e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7095973491668701, + "num_tokens": 188545902.0, + "step": 7287 + }, + { + "epoch": 0.8003514166483637, + "grad_norm": 1.7426791191101074, + "learning_rate": 5e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7080097198486328, + "num_tokens": 188573288.0, + "step": 7288 + }, + { + "epoch": 0.8004612343509774, + "grad_norm": 1.8025115728378296, + "learning_rate": 5e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7098469734191895, + "num_tokens": 188600098.0, + "step": 7289 + }, + { + "epoch": 0.800571052053591, + "grad_norm": 1.640067458152771, + "learning_rate": 5e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7036325931549072, + "num_tokens": 188629260.0, + "step": 7290 + }, + { + "epoch": 0.8006808697562047, + "grad_norm": 1.692908525466919, + "learning_rate": 5e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7240805625915527, + "num_tokens": 188658163.0, + "step": 7291 + }, + { + "epoch": 0.8007906874588183, + "grad_norm": 1.6824536323547363, + "learning_rate": 5e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7022627592086792, + "num_tokens": 188692146.0, + "step": 7292 + }, + { + "epoch": 0.8009005051614321, + "grad_norm": 1.4845623970031738, + "learning_rate": 5e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7076786160469055, + "num_tokens": 188728606.0, + "step": 7293 + }, + { + "epoch": 0.8010103228640457, + "grad_norm": 1.6620875597000122, + "learning_rate": 5e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.696921169757843, + "num_tokens": 188757311.0, + "step": 7294 + }, + { + "epoch": 0.8011201405666594, + "grad_norm": 1.7940025329589844, + "learning_rate": 5e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.736159086227417, + "num_tokens": 188781801.0, + "step": 7295 + }, + { + "epoch": 0.801229958269273, + "grad_norm": 1.935713529586792, + "learning_rate": 5e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6954728364944458, + "num_tokens": 188805546.0, + "step": 7296 + }, + { + "epoch": 0.8013397759718867, + "grad_norm": 1.7575947046279907, + "learning_rate": 5e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7026582956314087, + "num_tokens": 188835958.0, + "step": 7297 + }, + { + "epoch": 0.8014495936745003, + "grad_norm": 1.8015421628952026, + "learning_rate": 5e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7137119770050049, + "num_tokens": 188861288.0, + "step": 7298 + }, + { + "epoch": 0.801559411377114, + "grad_norm": 1.754601240158081, + "learning_rate": 5e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7073599100112915, + "num_tokens": 188885617.0, + "step": 7299 + }, + { + "epoch": 0.8016692290797276, + "grad_norm": 1.654895544052124, + "learning_rate": 5e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.734754204750061, + "num_tokens": 188912258.0, + "step": 7300 + }, + { + "epoch": 0.8017790467823414, + "grad_norm": 2.0705790519714355, + "learning_rate": 5e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7272597551345825, + "num_tokens": 188931456.0, + "step": 7301 + }, + { + "epoch": 0.801888864484955, + "grad_norm": 1.9533913135528564, + "learning_rate": 5e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7011733055114746, + "num_tokens": 188954866.0, + "step": 7302 + }, + { + "epoch": 0.8019986821875686, + "grad_norm": 1.7387696504592896, + "learning_rate": 5e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7055530548095703, + "num_tokens": 188982084.0, + "step": 7303 + }, + { + "epoch": 0.8021084998901823, + "grad_norm": 2.213951826095581, + "learning_rate": 5e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7130041122436523, + "num_tokens": 188999711.0, + "step": 7304 + }, + { + "epoch": 0.8022183175927959, + "grad_norm": 1.9355584383010864, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7142208814620972, + "num_tokens": 189021328.0, + "step": 7305 + }, + { + "epoch": 0.8023281352954096, + "grad_norm": 1.9409780502319336, + "learning_rate": 5e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7118242383003235, + "num_tokens": 189045564.0, + "step": 7306 + }, + { + "epoch": 0.8024379529980232, + "grad_norm": 1.6544795036315918, + "learning_rate": 5e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7118892073631287, + "num_tokens": 189071909.0, + "step": 7307 + }, + { + "epoch": 0.802547770700637, + "grad_norm": 1.7206355333328247, + "learning_rate": 5e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7056827545166016, + "num_tokens": 189098725.0, + "step": 7308 + }, + { + "epoch": 0.8026575884032506, + "grad_norm": 1.9150010347366333, + "learning_rate": 5e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7406368851661682, + "num_tokens": 189121972.0, + "step": 7309 + }, + { + "epoch": 0.8027674061058643, + "grad_norm": 1.9231181144714355, + "learning_rate": 5e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6867061257362366, + "num_tokens": 189146553.0, + "step": 7310 + }, + { + "epoch": 0.8028772238084779, + "grad_norm": 1.5924659967422485, + "learning_rate": 5e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.6981934309005737, + "num_tokens": 189178273.0, + "step": 7311 + }, + { + "epoch": 0.8029870415110916, + "grad_norm": 1.727815866470337, + "learning_rate": 5e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7255228161811829, + "num_tokens": 189204261.0, + "step": 7312 + }, + { + "epoch": 0.8030968592137052, + "grad_norm": 1.894590973854065, + "learning_rate": 5e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7092863917350769, + "num_tokens": 189228473.0, + "step": 7313 + }, + { + "epoch": 0.8032066769163189, + "grad_norm": 1.6372578144073486, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7333614826202393, + "num_tokens": 189257251.0, + "step": 7314 + }, + { + "epoch": 0.8033164946189326, + "grad_norm": 1.649552822113037, + "learning_rate": 5e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.709356427192688, + "num_tokens": 189286338.0, + "step": 7315 + }, + { + "epoch": 0.8034263123215463, + "grad_norm": 1.761531114578247, + "learning_rate": 5e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7039560079574585, + "num_tokens": 189311047.0, + "step": 7316 + }, + { + "epoch": 0.8035361300241599, + "grad_norm": 1.9311473369598389, + "learning_rate": 5e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7182226777076721, + "num_tokens": 189332766.0, + "step": 7317 + }, + { + "epoch": 0.8036459477267736, + "grad_norm": 1.7818069458007812, + "learning_rate": 5e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6985945701599121, + "num_tokens": 189359031.0, + "step": 7318 + }, + { + "epoch": 0.8037557654293872, + "grad_norm": 1.930469036102295, + "learning_rate": 5e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6953926086425781, + "num_tokens": 189382273.0, + "step": 7319 + }, + { + "epoch": 0.8038655831320008, + "grad_norm": 1.6895384788513184, + "learning_rate": 5e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7248415946960449, + "num_tokens": 189410685.0, + "step": 7320 + }, + { + "epoch": 0.8039754008346145, + "grad_norm": 1.6157702207565308, + "learning_rate": 5e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6885516047477722, + "num_tokens": 189444700.0, + "step": 7321 + }, + { + "epoch": 0.8040852185372283, + "grad_norm": 2.1040265560150146, + "learning_rate": 5e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7219390869140625, + "num_tokens": 189463270.0, + "step": 7322 + }, + { + "epoch": 0.8041950362398419, + "grad_norm": 1.6886779069900513, + "learning_rate": 5e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7051973342895508, + "num_tokens": 189491367.0, + "step": 7323 + }, + { + "epoch": 0.8043048539424555, + "grad_norm": 1.879809021949768, + "learning_rate": 5e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.701296329498291, + "num_tokens": 189515835.0, + "step": 7324 + }, + { + "epoch": 0.8044146716450692, + "grad_norm": 1.8660088777542114, + "learning_rate": 5e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6969475746154785, + "num_tokens": 189540201.0, + "step": 7325 + }, + { + "epoch": 0.8045244893476828, + "grad_norm": 2.057102918624878, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7276168465614319, + "num_tokens": 189559643.0, + "step": 7326 + }, + { + "epoch": 0.8046343070502965, + "grad_norm": 1.8815009593963623, + "learning_rate": 5e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7162143588066101, + "num_tokens": 189582534.0, + "step": 7327 + }, + { + "epoch": 0.8047441247529101, + "grad_norm": 1.8224395513534546, + "learning_rate": 5e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7125879526138306, + "num_tokens": 189609457.0, + "step": 7328 + }, + { + "epoch": 0.8048539424555239, + "grad_norm": 1.796164631843567, + "learning_rate": 5e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6953963041305542, + "num_tokens": 189634490.0, + "step": 7329 + }, + { + "epoch": 0.8049637601581375, + "grad_norm": 1.650033712387085, + "learning_rate": 5e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6782498359680176, + "num_tokens": 189667112.0, + "step": 7330 + }, + { + "epoch": 0.8050735778607512, + "grad_norm": 1.997921347618103, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.724149227142334, + "num_tokens": 189687504.0, + "step": 7331 + }, + { + "epoch": 0.8051833955633648, + "grad_norm": 1.7744725942611694, + "learning_rate": 5e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7199744582176208, + "num_tokens": 189714313.0, + "step": 7332 + }, + { + "epoch": 0.8052932132659785, + "grad_norm": 1.9713876247406006, + "learning_rate": 5e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7303377389907837, + "num_tokens": 189736198.0, + "step": 7333 + }, + { + "epoch": 0.8054030309685921, + "grad_norm": 1.6563949584960938, + "learning_rate": 5e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.68681800365448, + "num_tokens": 189767041.0, + "step": 7334 + }, + { + "epoch": 0.8055128486712058, + "grad_norm": 1.7382233142852783, + "learning_rate": 5e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7112040519714355, + "num_tokens": 189796100.0, + "step": 7335 + }, + { + "epoch": 0.8056226663738194, + "grad_norm": 1.858322024345398, + "learning_rate": 5e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.6985020637512207, + "num_tokens": 189820042.0, + "step": 7336 + }, + { + "epoch": 0.8057324840764332, + "grad_norm": 1.8991000652313232, + "learning_rate": 5e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.6984701156616211, + "num_tokens": 189842731.0, + "step": 7337 + }, + { + "epoch": 0.8058423017790468, + "grad_norm": 1.7373005151748657, + "learning_rate": 5e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7060361504554749, + "num_tokens": 189868457.0, + "step": 7338 + }, + { + "epoch": 0.8059521194816605, + "grad_norm": 1.5071280002593994, + "learning_rate": 5e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7091864943504333, + "num_tokens": 189900945.0, + "step": 7339 + }, + { + "epoch": 0.8060619371842741, + "grad_norm": 1.7052767276763916, + "learning_rate": 5e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6959513425827026, + "num_tokens": 189930633.0, + "step": 7340 + }, + { + "epoch": 0.8061717548868877, + "grad_norm": 2.039687395095825, + "learning_rate": 5e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7044141292572021, + "num_tokens": 189951952.0, + "step": 7341 + }, + { + "epoch": 0.8062815725895014, + "grad_norm": 1.8024076223373413, + "learning_rate": 5e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7040428519248962, + "num_tokens": 189978001.0, + "step": 7342 + }, + { + "epoch": 0.806391390292115, + "grad_norm": 1.8664501905441284, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7248146533966064, + "num_tokens": 190000910.0, + "step": 7343 + }, + { + "epoch": 0.8065012079947288, + "grad_norm": 1.8575007915496826, + "learning_rate": 5e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6939728260040283, + "num_tokens": 190024904.0, + "step": 7344 + }, + { + "epoch": 0.8066110256973424, + "grad_norm": 1.8748584985733032, + "learning_rate": 5e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7107114791870117, + "num_tokens": 190048012.0, + "step": 7345 + }, + { + "epoch": 0.8067208433999561, + "grad_norm": 1.7689603567123413, + "learning_rate": 5e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7151646614074707, + "num_tokens": 190075549.0, + "step": 7346 + }, + { + "epoch": 0.8068306611025697, + "grad_norm": 1.7811810970306396, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7045801281929016, + "num_tokens": 190101869.0, + "step": 7347 + }, + { + "epoch": 0.8069404788051834, + "grad_norm": 1.8577204942703247, + "learning_rate": 5e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7254197597503662, + "num_tokens": 190124947.0, + "step": 7348 + }, + { + "epoch": 0.807050296507797, + "grad_norm": 1.7712597846984863, + "learning_rate": 5e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7005773782730103, + "num_tokens": 190151025.0, + "step": 7349 + }, + { + "epoch": 0.8071601142104107, + "grad_norm": 1.9521892070770264, + "learning_rate": 5e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7170408964157104, + "num_tokens": 190172186.0, + "step": 7350 + }, + { + "epoch": 0.8072699319130244, + "grad_norm": 1.5500534772872925, + "learning_rate": 5e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6861298084259033, + "num_tokens": 190206135.0, + "step": 7351 + }, + { + "epoch": 0.8073797496156381, + "grad_norm": 1.7441577911376953, + "learning_rate": 5e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6950675845146179, + "num_tokens": 190233494.0, + "step": 7352 + }, + { + "epoch": 0.8074895673182517, + "grad_norm": 1.7555642127990723, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.727519690990448, + "num_tokens": 190260275.0, + "step": 7353 + }, + { + "epoch": 0.8075993850208654, + "grad_norm": 2.047433853149414, + "learning_rate": 5e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7037308216094971, + "num_tokens": 190280106.0, + "step": 7354 + }, + { + "epoch": 0.807709202723479, + "grad_norm": 1.9402765035629272, + "learning_rate": 5e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7171898484230042, + "num_tokens": 190303309.0, + "step": 7355 + }, + { + "epoch": 0.8078190204260927, + "grad_norm": 1.7205954790115356, + "learning_rate": 5e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6995079517364502, + "num_tokens": 190332194.0, + "step": 7356 + }, + { + "epoch": 0.8079288381287063, + "grad_norm": 1.7677580118179321, + "learning_rate": 5e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7122751474380493, + "num_tokens": 190359756.0, + "step": 7357 + }, + { + "epoch": 0.8080386558313201, + "grad_norm": 1.671460509300232, + "learning_rate": 5e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6928220391273499, + "num_tokens": 190388310.0, + "step": 7358 + }, + { + "epoch": 0.8081484735339337, + "grad_norm": 1.7098950147628784, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7251679301261902, + "num_tokens": 190413631.0, + "step": 7359 + }, + { + "epoch": 0.8082582912365474, + "grad_norm": 1.869972825050354, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7250281572341919, + "num_tokens": 190435843.0, + "step": 7360 + }, + { + "epoch": 0.808368108939161, + "grad_norm": 1.6857653856277466, + "learning_rate": 5e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7385587096214294, + "num_tokens": 190462457.0, + "step": 7361 + }, + { + "epoch": 0.8084779266417746, + "grad_norm": 2.0974433422088623, + "learning_rate": 5e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6911606192588806, + "num_tokens": 190484990.0, + "step": 7362 + }, + { + "epoch": 0.8085877443443883, + "grad_norm": 2.3327109813690186, + "learning_rate": 5e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7015884518623352, + "num_tokens": 190502053.0, + "step": 7363 + }, + { + "epoch": 0.8086975620470019, + "grad_norm": 1.6753735542297363, + "learning_rate": 5e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7020679712295532, + "num_tokens": 190533198.0, + "step": 7364 + }, + { + "epoch": 0.8088073797496156, + "grad_norm": 1.7682805061340332, + "learning_rate": 5e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7052491307258606, + "num_tokens": 190558165.0, + "step": 7365 + }, + { + "epoch": 0.8089171974522293, + "grad_norm": 2.1076109409332275, + "learning_rate": 5e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7167597413063049, + "num_tokens": 190580143.0, + "step": 7366 + }, + { + "epoch": 0.809027015154843, + "grad_norm": 1.7186062335968018, + "learning_rate": 5e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6961698532104492, + "num_tokens": 190608474.0, + "step": 7367 + }, + { + "epoch": 0.8091368328574566, + "grad_norm": 2.1870315074920654, + "learning_rate": 5e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7098504900932312, + "num_tokens": 190627246.0, + "step": 7368 + }, + { + "epoch": 0.8092466505600703, + "grad_norm": 1.683398723602295, + "learning_rate": 5e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7125909924507141, + "num_tokens": 190654238.0, + "step": 7369 + }, + { + "epoch": 0.8093564682626839, + "grad_norm": 1.730625033378601, + "learning_rate": 5e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7005621194839478, + "num_tokens": 190681355.0, + "step": 7370 + }, + { + "epoch": 0.8094662859652976, + "grad_norm": 1.6497271060943604, + "learning_rate": 5e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.676395833492279, + "num_tokens": 190714066.0, + "step": 7371 + }, + { + "epoch": 0.8095761036679112, + "grad_norm": 1.7770123481750488, + "learning_rate": 5e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7174410820007324, + "num_tokens": 190739689.0, + "step": 7372 + }, + { + "epoch": 0.809685921370525, + "grad_norm": 1.8497533798217773, + "learning_rate": 5e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7108343839645386, + "num_tokens": 190762330.0, + "step": 7373 + }, + { + "epoch": 0.8097957390731386, + "grad_norm": 1.6561610698699951, + "learning_rate": 5e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7229914665222168, + "num_tokens": 190789921.0, + "step": 7374 + }, + { + "epoch": 0.8099055567757523, + "grad_norm": 1.784658432006836, + "learning_rate": 5e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7126756310462952, + "num_tokens": 190816523.0, + "step": 7375 + }, + { + "epoch": 0.8100153744783659, + "grad_norm": 1.6701644659042358, + "learning_rate": 5e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7118868827819824, + "num_tokens": 190844879.0, + "step": 7376 + }, + { + "epoch": 0.8101251921809796, + "grad_norm": 1.8769410848617554, + "learning_rate": 5e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7043904066085815, + "num_tokens": 190869659.0, + "step": 7377 + }, + { + "epoch": 0.8102350098835932, + "grad_norm": 1.687223196029663, + "learning_rate": 5e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7255786061286926, + "num_tokens": 190897874.0, + "step": 7378 + }, + { + "epoch": 0.8103448275862069, + "grad_norm": 1.9683187007904053, + "learning_rate": 5e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7399580478668213, + "num_tokens": 190917999.0, + "step": 7379 + }, + { + "epoch": 0.8104546452888206, + "grad_norm": 1.711845874786377, + "learning_rate": 5e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7203512787818909, + "num_tokens": 190944894.0, + "step": 7380 + }, + { + "epoch": 0.8105644629914343, + "grad_norm": 1.8383585214614868, + "learning_rate": 5e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7153140306472778, + "num_tokens": 190967581.0, + "step": 7381 + }, + { + "epoch": 0.8106742806940479, + "grad_norm": 2.830089807510376, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6989401578903198, + "num_tokens": 190995054.0, + "step": 7382 + }, + { + "epoch": 0.8107840983966615, + "grad_norm": 1.665030598640442, + "learning_rate": 5e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6922160387039185, + "num_tokens": 191025149.0, + "step": 7383 + }, + { + "epoch": 0.8108939160992752, + "grad_norm": 1.8821066617965698, + "learning_rate": 5e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7012404203414917, + "num_tokens": 191048482.0, + "step": 7384 + }, + { + "epoch": 0.8110037338018888, + "grad_norm": 1.7094120979309082, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7066183090209961, + "num_tokens": 191076288.0, + "step": 7385 + }, + { + "epoch": 0.8111135515045025, + "grad_norm": 1.6230255365371704, + "learning_rate": 5e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6892914772033691, + "num_tokens": 191106422.0, + "step": 7386 + }, + { + "epoch": 0.8112233692071162, + "grad_norm": 1.9893875122070312, + "learning_rate": 5e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7233871221542358, + "num_tokens": 191127857.0, + "step": 7387 + }, + { + "epoch": 0.8113331869097299, + "grad_norm": 2.0105249881744385, + "learning_rate": 5e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7283807992935181, + "num_tokens": 191148597.0, + "step": 7388 + }, + { + "epoch": 0.8114430046123435, + "grad_norm": 1.9417729377746582, + "learning_rate": 5e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7267000079154968, + "num_tokens": 191169839.0, + "step": 7389 + }, + { + "epoch": 0.8115528223149572, + "grad_norm": 1.7019777297973633, + "learning_rate": 5e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6889883279800415, + "num_tokens": 191196042.0, + "step": 7390 + }, + { + "epoch": 0.8116626400175708, + "grad_norm": 1.8740661144256592, + "learning_rate": 5e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7108035087585449, + "num_tokens": 191219479.0, + "step": 7391 + }, + { + "epoch": 0.8117724577201845, + "grad_norm": 1.9826934337615967, + "learning_rate": 5e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7340273261070251, + "num_tokens": 191240668.0, + "step": 7392 + }, + { + "epoch": 0.8118822754227981, + "grad_norm": 1.762310266494751, + "learning_rate": 5e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6997363567352295, + "num_tokens": 191269845.0, + "step": 7393 + }, + { + "epoch": 0.8119920931254118, + "grad_norm": 1.6869120597839355, + "learning_rate": 5e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7169591188430786, + "num_tokens": 191297469.0, + "step": 7394 + }, + { + "epoch": 0.8121019108280255, + "grad_norm": 1.6599355936050415, + "learning_rate": 5e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7011585235595703, + "num_tokens": 191326277.0, + "step": 7395 + }, + { + "epoch": 0.8122117285306392, + "grad_norm": 1.6926804780960083, + "learning_rate": 5e-06, + "loss": 1.084, + "mean_token_accuracy": 0.6780256032943726, + "num_tokens": 191356147.0, + "step": 7396 + }, + { + "epoch": 0.8123215462332528, + "grad_norm": 1.9171401262283325, + "learning_rate": 5e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7040364742279053, + "num_tokens": 191377325.0, + "step": 7397 + }, + { + "epoch": 0.8124313639358665, + "grad_norm": 1.807893991470337, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7271618843078613, + "num_tokens": 191400337.0, + "step": 7398 + }, + { + "epoch": 0.8125411816384801, + "grad_norm": 1.5624499320983887, + "learning_rate": 5e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7015965580940247, + "num_tokens": 191430412.0, + "step": 7399 + }, + { + "epoch": 0.8126509993410937, + "grad_norm": 1.9204789400100708, + "learning_rate": 5e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6969512104988098, + "num_tokens": 191452967.0, + "step": 7400 + }, + { + "epoch": 0.8127608170437074, + "grad_norm": 1.8061505556106567, + "learning_rate": 5e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7164140939712524, + "num_tokens": 191476278.0, + "step": 7401 + }, + { + "epoch": 0.8128706347463212, + "grad_norm": 1.7243568897247314, + "learning_rate": 5e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7348836660385132, + "num_tokens": 191500972.0, + "step": 7402 + }, + { + "epoch": 0.8129804524489348, + "grad_norm": 2.0758562088012695, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7233254909515381, + "num_tokens": 191518851.0, + "step": 7403 + }, + { + "epoch": 0.8130902701515484, + "grad_norm": 1.9102352857589722, + "learning_rate": 5e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7038881182670593, + "num_tokens": 191543903.0, + "step": 7404 + }, + { + "epoch": 0.8132000878541621, + "grad_norm": 1.7797000408172607, + "learning_rate": 5e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6887267827987671, + "num_tokens": 191569444.0, + "step": 7405 + }, + { + "epoch": 0.8133099055567757, + "grad_norm": 1.8143242597579956, + "learning_rate": 5e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7183838486671448, + "num_tokens": 191594614.0, + "step": 7406 + }, + { + "epoch": 0.8134197232593894, + "grad_norm": 1.7713841199874878, + "learning_rate": 5e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7085848450660706, + "num_tokens": 191619953.0, + "step": 7407 + }, + { + "epoch": 0.813529540962003, + "grad_norm": 1.721691370010376, + "learning_rate": 5e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7122970223426819, + "num_tokens": 191649912.0, + "step": 7408 + }, + { + "epoch": 0.8136393586646168, + "grad_norm": 1.825622797012329, + "learning_rate": 5e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7100735902786255, + "num_tokens": 191675609.0, + "step": 7409 + }, + { + "epoch": 0.8137491763672304, + "grad_norm": 1.745554804801941, + "learning_rate": 5e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7062526941299438, + "num_tokens": 191703833.0, + "step": 7410 + }, + { + "epoch": 0.8138589940698441, + "grad_norm": 1.6998047828674316, + "learning_rate": 5e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7226005792617798, + "num_tokens": 191729781.0, + "step": 7411 + }, + { + "epoch": 0.8139688117724577, + "grad_norm": 1.7156026363372803, + "learning_rate": 5e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7174664735794067, + "num_tokens": 191755628.0, + "step": 7412 + }, + { + "epoch": 0.8140786294750714, + "grad_norm": 2.093585252761841, + "learning_rate": 5e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7209156155586243, + "num_tokens": 191775981.0, + "step": 7413 + }, + { + "epoch": 0.814188447177685, + "grad_norm": 1.7559632062911987, + "learning_rate": 5e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7242289781570435, + "num_tokens": 191802289.0, + "step": 7414 + }, + { + "epoch": 0.8142982648802987, + "grad_norm": 2.03765869140625, + "learning_rate": 5e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7389931082725525, + "num_tokens": 191824263.0, + "step": 7415 + }, + { + "epoch": 0.8144080825829124, + "grad_norm": 1.6625207662582397, + "learning_rate": 5e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6980078220367432, + "num_tokens": 191855284.0, + "step": 7416 + }, + { + "epoch": 0.8145179002855261, + "grad_norm": 1.8026326894760132, + "learning_rate": 5e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7050960063934326, + "num_tokens": 191882389.0, + "step": 7417 + }, + { + "epoch": 0.8146277179881397, + "grad_norm": 1.812851071357727, + "learning_rate": 5e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.714290976524353, + "num_tokens": 191907514.0, + "step": 7418 + }, + { + "epoch": 0.8147375356907534, + "grad_norm": 1.5490868091583252, + "learning_rate": 5e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7150974869728088, + "num_tokens": 191942427.0, + "step": 7419 + }, + { + "epoch": 0.814847353393367, + "grad_norm": 1.862433910369873, + "learning_rate": 5e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7125768661499023, + "num_tokens": 191965656.0, + "step": 7420 + }, + { + "epoch": 0.8149571710959806, + "grad_norm": 1.9195843935012817, + "learning_rate": 5e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7072195410728455, + "num_tokens": 191989000.0, + "step": 7421 + }, + { + "epoch": 0.8150669887985943, + "grad_norm": 1.9482452869415283, + "learning_rate": 5e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7080223560333252, + "num_tokens": 192012008.0, + "step": 7422 + }, + { + "epoch": 0.8151768065012079, + "grad_norm": 1.864022135734558, + "learning_rate": 5e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7110685706138611, + "num_tokens": 192035464.0, + "step": 7423 + }, + { + "epoch": 0.8152866242038217, + "grad_norm": 1.5836522579193115, + "learning_rate": 5e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6945204734802246, + "num_tokens": 192065093.0, + "step": 7424 + }, + { + "epoch": 0.8153964419064353, + "grad_norm": 1.9423538446426392, + "learning_rate": 5e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7109533548355103, + "num_tokens": 192087143.0, + "step": 7425 + }, + { + "epoch": 0.815506259609049, + "grad_norm": 1.932753324508667, + "learning_rate": 5e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7297793626785278, + "num_tokens": 192108691.0, + "step": 7426 + }, + { + "epoch": 0.8156160773116626, + "grad_norm": 1.8747491836547852, + "learning_rate": 5e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7106362581253052, + "num_tokens": 192130962.0, + "step": 7427 + }, + { + "epoch": 0.8157258950142763, + "grad_norm": 1.7714241743087769, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7131178379058838, + "num_tokens": 192157826.0, + "step": 7428 + }, + { + "epoch": 0.8158357127168899, + "grad_norm": 2.1192800998687744, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7189310193061829, + "num_tokens": 192178275.0, + "step": 7429 + }, + { + "epoch": 0.8159455304195036, + "grad_norm": 2.205744743347168, + "learning_rate": 5e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7032063007354736, + "num_tokens": 192196472.0, + "step": 7430 + }, + { + "epoch": 0.8160553481221173, + "grad_norm": 1.8128993511199951, + "learning_rate": 5e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7010980844497681, + "num_tokens": 192222036.0, + "step": 7431 + }, + { + "epoch": 0.816165165824731, + "grad_norm": 2.013183355331421, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7193722724914551, + "num_tokens": 192242368.0, + "step": 7432 + }, + { + "epoch": 0.8162749835273446, + "grad_norm": 1.6786112785339355, + "learning_rate": 5e-06, + "loss": 1.0967, + "mean_token_accuracy": 0.6840593814849854, + "num_tokens": 192274525.0, + "step": 7433 + }, + { + "epoch": 0.8163848012299583, + "grad_norm": 1.877206563949585, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.715591311454773, + "num_tokens": 192297150.0, + "step": 7434 + }, + { + "epoch": 0.8164946189325719, + "grad_norm": 1.711327075958252, + "learning_rate": 5e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.705249547958374, + "num_tokens": 192325938.0, + "step": 7435 + }, + { + "epoch": 0.8166044366351856, + "grad_norm": 1.8451361656188965, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7237352728843689, + "num_tokens": 192348490.0, + "step": 7436 + }, + { + "epoch": 0.8167142543377992, + "grad_norm": 1.8182717561721802, + "learning_rate": 5e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6790664792060852, + "num_tokens": 192374918.0, + "step": 7437 + }, + { + "epoch": 0.816824072040413, + "grad_norm": 1.8833473920822144, + "learning_rate": 5e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7226225733757019, + "num_tokens": 192398850.0, + "step": 7438 + }, + { + "epoch": 0.8169338897430266, + "grad_norm": 1.5336638689041138, + "learning_rate": 5e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.722379207611084, + "num_tokens": 192429471.0, + "step": 7439 + }, + { + "epoch": 0.8170437074456403, + "grad_norm": 1.642011046409607, + "learning_rate": 5e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6884045004844666, + "num_tokens": 192458897.0, + "step": 7440 + }, + { + "epoch": 0.8171535251482539, + "grad_norm": 1.7926214933395386, + "learning_rate": 5e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7007893919944763, + "num_tokens": 192485298.0, + "step": 7441 + }, + { + "epoch": 0.8172633428508675, + "grad_norm": 1.9345194101333618, + "learning_rate": 5e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7096047401428223, + "num_tokens": 192507168.0, + "step": 7442 + }, + { + "epoch": 0.8173731605534812, + "grad_norm": 1.5964407920837402, + "learning_rate": 5e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7072300314903259, + "num_tokens": 192536594.0, + "step": 7443 + }, + { + "epoch": 0.8174829782560948, + "grad_norm": 1.8107839822769165, + "learning_rate": 5e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7198160886764526, + "num_tokens": 192559931.0, + "step": 7444 + }, + { + "epoch": 0.8175927959587086, + "grad_norm": 1.6806389093399048, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7352539896965027, + "num_tokens": 192586455.0, + "step": 7445 + }, + { + "epoch": 0.8177026136613222, + "grad_norm": 1.8051353693008423, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7199341654777527, + "num_tokens": 192611221.0, + "step": 7446 + }, + { + "epoch": 0.8178124313639359, + "grad_norm": 1.7581901550292969, + "learning_rate": 5e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6979638338088989, + "num_tokens": 192637578.0, + "step": 7447 + }, + { + "epoch": 0.8179222490665495, + "grad_norm": 2.0951199531555176, + "learning_rate": 5e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7119749784469604, + "num_tokens": 192657774.0, + "step": 7448 + }, + { + "epoch": 0.8180320667691632, + "grad_norm": 1.6241073608398438, + "learning_rate": 5e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.705723762512207, + "num_tokens": 192689119.0, + "step": 7449 + }, + { + "epoch": 0.8181418844717768, + "grad_norm": 1.8686479330062866, + "learning_rate": 5e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7006364464759827, + "num_tokens": 192716971.0, + "step": 7450 + }, + { + "epoch": 0.8182517021743905, + "grad_norm": 1.949634075164795, + "learning_rate": 5e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7035727500915527, + "num_tokens": 192740509.0, + "step": 7451 + }, + { + "epoch": 0.8183615198770041, + "grad_norm": 2.2044100761413574, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7325705885887146, + "num_tokens": 192757866.0, + "step": 7452 + }, + { + "epoch": 0.8184713375796179, + "grad_norm": 1.580639123916626, + "learning_rate": 5e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6947692632675171, + "num_tokens": 192788470.0, + "step": 7453 + }, + { + "epoch": 0.8185811552822315, + "grad_norm": 1.9584376811981201, + "learning_rate": 5e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7088001370429993, + "num_tokens": 192811117.0, + "step": 7454 + }, + { + "epoch": 0.8186909729848452, + "grad_norm": 1.6597840785980225, + "learning_rate": 5e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.6836698055267334, + "num_tokens": 192842376.0, + "step": 7455 + }, + { + "epoch": 0.8188007906874588, + "grad_norm": 1.7248684167861938, + "learning_rate": 5e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6891534328460693, + "num_tokens": 192871837.0, + "step": 7456 + }, + { + "epoch": 0.8189106083900725, + "grad_norm": 1.6268260478973389, + "learning_rate": 5e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6932915449142456, + "num_tokens": 192901556.0, + "step": 7457 + }, + { + "epoch": 0.8190204260926861, + "grad_norm": 1.5041710138320923, + "learning_rate": 5e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7101803421974182, + "num_tokens": 192936084.0, + "step": 7458 + }, + { + "epoch": 0.8191302437952998, + "grad_norm": 1.717685341835022, + "learning_rate": 5e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7273333072662354, + "num_tokens": 192961498.0, + "step": 7459 + }, + { + "epoch": 0.8192400614979135, + "grad_norm": 1.8278441429138184, + "learning_rate": 5e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.708297610282898, + "num_tokens": 192985238.0, + "step": 7460 + }, + { + "epoch": 0.8193498792005272, + "grad_norm": 1.8345462083816528, + "learning_rate": 5e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7162034511566162, + "num_tokens": 193009987.0, + "step": 7461 + }, + { + "epoch": 0.8194596969031408, + "grad_norm": 1.6276038885116577, + "learning_rate": 5e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7283324003219604, + "num_tokens": 193039005.0, + "step": 7462 + }, + { + "epoch": 0.8195695146057544, + "grad_norm": 1.8762634992599487, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7201027274131775, + "num_tokens": 193061388.0, + "step": 7463 + }, + { + "epoch": 0.8196793323083681, + "grad_norm": 1.5929116010665894, + "learning_rate": 5e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7016759514808655, + "num_tokens": 193095428.0, + "step": 7464 + }, + { + "epoch": 0.8197891500109817, + "grad_norm": 1.78522527217865, + "learning_rate": 5e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6967146992683411, + "num_tokens": 193120533.0, + "step": 7465 + }, + { + "epoch": 0.8198989677135954, + "grad_norm": 1.8827625513076782, + "learning_rate": 5e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7259204983711243, + "num_tokens": 193142465.0, + "step": 7466 + }, + { + "epoch": 0.8200087854162091, + "grad_norm": 1.8037080764770508, + "learning_rate": 5e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.708015501499176, + "num_tokens": 193168782.0, + "step": 7467 + }, + { + "epoch": 0.8201186031188228, + "grad_norm": 1.966395616531372, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7106056213378906, + "num_tokens": 193188909.0, + "step": 7468 + }, + { + "epoch": 0.8202284208214364, + "grad_norm": 1.8275586366653442, + "learning_rate": 5e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7134795188903809, + "num_tokens": 193213567.0, + "step": 7469 + }, + { + "epoch": 0.8203382385240501, + "grad_norm": 1.864475965499878, + "learning_rate": 5e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6994789242744446, + "num_tokens": 193237114.0, + "step": 7470 + }, + { + "epoch": 0.8204480562266637, + "grad_norm": 1.654731035232544, + "learning_rate": 5e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6985588669776917, + "num_tokens": 193268478.0, + "step": 7471 + }, + { + "epoch": 0.8205578739292774, + "grad_norm": 1.6803869009017944, + "learning_rate": 5e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7062080502510071, + "num_tokens": 193297231.0, + "step": 7472 + }, + { + "epoch": 0.820667691631891, + "grad_norm": 1.9158053398132324, + "learning_rate": 5e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7131550312042236, + "num_tokens": 193319763.0, + "step": 7473 + }, + { + "epoch": 0.8207775093345048, + "grad_norm": 1.9673813581466675, + "learning_rate": 5e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7296801209449768, + "num_tokens": 193340826.0, + "step": 7474 + }, + { + "epoch": 0.8208873270371184, + "grad_norm": 1.89731764793396, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7275617122650146, + "num_tokens": 193362517.0, + "step": 7475 + }, + { + "epoch": 0.8209971447397321, + "grad_norm": 1.7309116125106812, + "learning_rate": 5e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7070006132125854, + "num_tokens": 193388331.0, + "step": 7476 + }, + { + "epoch": 0.8211069624423457, + "grad_norm": 1.742889642715454, + "learning_rate": 5e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.735874593257904, + "num_tokens": 193413324.0, + "step": 7477 + }, + { + "epoch": 0.8212167801449594, + "grad_norm": 1.927252173423767, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7312347292900085, + "num_tokens": 193435442.0, + "step": 7478 + }, + { + "epoch": 0.821326597847573, + "grad_norm": 2.0245745182037354, + "learning_rate": 5e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7109522819519043, + "num_tokens": 193455516.0, + "step": 7479 + }, + { + "epoch": 0.8214364155501866, + "grad_norm": 1.8557199239730835, + "learning_rate": 5e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6943016648292542, + "num_tokens": 193480820.0, + "step": 7480 + }, + { + "epoch": 0.8215462332528004, + "grad_norm": 1.5580910444259644, + "learning_rate": 5e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6896960735321045, + "num_tokens": 193515987.0, + "step": 7481 + }, + { + "epoch": 0.821656050955414, + "grad_norm": 1.8930678367614746, + "learning_rate": 5e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.740105152130127, + "num_tokens": 193538490.0, + "step": 7482 + }, + { + "epoch": 0.8217658686580277, + "grad_norm": 2.05898380279541, + "learning_rate": 5e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7180389165878296, + "num_tokens": 193558154.0, + "step": 7483 + }, + { + "epoch": 0.8218756863606413, + "grad_norm": 1.7889691591262817, + "learning_rate": 5e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7068122625350952, + "num_tokens": 193583168.0, + "step": 7484 + }, + { + "epoch": 0.821985504063255, + "grad_norm": 1.6065442562103271, + "learning_rate": 5e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7056886553764343, + "num_tokens": 193615129.0, + "step": 7485 + }, + { + "epoch": 0.8220953217658686, + "grad_norm": 2.2149717807769775, + "learning_rate": 5e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7373881340026855, + "num_tokens": 193631960.0, + "step": 7486 + }, + { + "epoch": 0.8222051394684823, + "grad_norm": 1.8795950412750244, + "learning_rate": 5e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7153902053833008, + "num_tokens": 193655253.0, + "step": 7487 + }, + { + "epoch": 0.8223149571710959, + "grad_norm": 1.651556134223938, + "learning_rate": 5e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7159736156463623, + "num_tokens": 193684403.0, + "step": 7488 + }, + { + "epoch": 0.8224247748737097, + "grad_norm": 1.7580337524414062, + "learning_rate": 5e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7099490761756897, + "num_tokens": 193711748.0, + "step": 7489 + }, + { + "epoch": 0.8225345925763233, + "grad_norm": 1.6734410524368286, + "learning_rate": 5e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6897790431976318, + "num_tokens": 193743304.0, + "step": 7490 + }, + { + "epoch": 0.822644410278937, + "grad_norm": 1.7390395402908325, + "learning_rate": 5e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6978215575218201, + "num_tokens": 193772620.0, + "step": 7491 + }, + { + "epoch": 0.8227542279815506, + "grad_norm": 1.7032450437545776, + "learning_rate": 5e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6955195665359497, + "num_tokens": 193801416.0, + "step": 7492 + }, + { + "epoch": 0.8228640456841643, + "grad_norm": 1.7525596618652344, + "learning_rate": 5e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6994030475616455, + "num_tokens": 193827326.0, + "step": 7493 + }, + { + "epoch": 0.8229738633867779, + "grad_norm": 1.9135810136795044, + "learning_rate": 5e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6911891102790833, + "num_tokens": 193854784.0, + "step": 7494 + }, + { + "epoch": 0.8230836810893916, + "grad_norm": 1.8999615907669067, + "learning_rate": 5e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7238823771476746, + "num_tokens": 193876308.0, + "step": 7495 + }, + { + "epoch": 0.8231934987920053, + "grad_norm": 1.7176125049591064, + "learning_rate": 5e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6936928629875183, + "num_tokens": 193904550.0, + "step": 7496 + }, + { + "epoch": 0.823303316494619, + "grad_norm": 1.7474125623703003, + "learning_rate": 5e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7109956741333008, + "num_tokens": 193929510.0, + "step": 7497 + }, + { + "epoch": 0.8234131341972326, + "grad_norm": 1.7621382474899292, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7268826365470886, + "num_tokens": 193954632.0, + "step": 7498 + }, + { + "epoch": 0.8235229518998463, + "grad_norm": 1.7555066347122192, + "learning_rate": 5e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6923509836196899, + "num_tokens": 193979471.0, + "step": 7499 + }, + { + "epoch": 0.8236327696024599, + "grad_norm": 1.638906478881836, + "learning_rate": 5e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7039994597434998, + "num_tokens": 194008101.0, + "step": 7500 + }, + { + "epoch": 0.8237425873050735, + "grad_norm": 1.8107075691223145, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7189545631408691, + "num_tokens": 194033845.0, + "step": 7501 + }, + { + "epoch": 0.8238524050076872, + "grad_norm": 1.7275110483169556, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7159979939460754, + "num_tokens": 194059932.0, + "step": 7502 + }, + { + "epoch": 0.823962222710301, + "grad_norm": 1.6744384765625, + "learning_rate": 5e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6890619993209839, + "num_tokens": 194089162.0, + "step": 7503 + }, + { + "epoch": 0.8240720404129146, + "grad_norm": 1.9101693630218506, + "learning_rate": 5e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7142353653907776, + "num_tokens": 194113357.0, + "step": 7504 + }, + { + "epoch": 0.8241818581155282, + "grad_norm": 1.680140495300293, + "learning_rate": 5e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7120929956436157, + "num_tokens": 194140820.0, + "step": 7505 + }, + { + "epoch": 0.8242916758181419, + "grad_norm": 1.8703868389129639, + "learning_rate": 5e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7148239612579346, + "num_tokens": 194165931.0, + "step": 7506 + }, + { + "epoch": 0.8244014935207555, + "grad_norm": 1.8592782020568848, + "learning_rate": 5e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6986093521118164, + "num_tokens": 194190841.0, + "step": 7507 + }, + { + "epoch": 0.8245113112233692, + "grad_norm": 1.6590213775634766, + "learning_rate": 5e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.702297568321228, + "num_tokens": 194221655.0, + "step": 7508 + }, + { + "epoch": 0.8246211289259828, + "grad_norm": 2.106050729751587, + "learning_rate": 5e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7314303517341614, + "num_tokens": 194239783.0, + "step": 7509 + }, + { + "epoch": 0.8247309466285966, + "grad_norm": 1.6899641752243042, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7101812362670898, + "num_tokens": 194269702.0, + "step": 7510 + }, + { + "epoch": 0.8248407643312102, + "grad_norm": 1.7510350942611694, + "learning_rate": 5e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.697712242603302, + "num_tokens": 194296765.0, + "step": 7511 + }, + { + "epoch": 0.8249505820338239, + "grad_norm": 1.6755260229110718, + "learning_rate": 5e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7129597663879395, + "num_tokens": 194326505.0, + "step": 7512 + }, + { + "epoch": 0.8250603997364375, + "grad_norm": 1.840819001197815, + "learning_rate": 5e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.721429705619812, + "num_tokens": 194351746.0, + "step": 7513 + }, + { + "epoch": 0.8251702174390512, + "grad_norm": 1.8134788274765015, + "learning_rate": 5e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6991628408432007, + "num_tokens": 194377626.0, + "step": 7514 + }, + { + "epoch": 0.8252800351416648, + "grad_norm": 1.6501998901367188, + "learning_rate": 5e-06, + "loss": 0.917, + "mean_token_accuracy": 0.716037392616272, + "num_tokens": 194404783.0, + "step": 7515 + }, + { + "epoch": 0.8253898528442785, + "grad_norm": 1.7841622829437256, + "learning_rate": 5e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7170804738998413, + "num_tokens": 194429923.0, + "step": 7516 + }, + { + "epoch": 0.8254996705468921, + "grad_norm": 2.109889507293701, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7470561861991882, + "num_tokens": 194448346.0, + "step": 7517 + }, + { + "epoch": 0.8256094882495059, + "grad_norm": 1.8216246366500854, + "learning_rate": 5e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7011961936950684, + "num_tokens": 194475127.0, + "step": 7518 + }, + { + "epoch": 0.8257193059521195, + "grad_norm": 2.002821445465088, + "learning_rate": 5e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6904715299606323, + "num_tokens": 194499498.0, + "step": 7519 + }, + { + "epoch": 0.8258291236547332, + "grad_norm": 1.8324397802352905, + "learning_rate": 5e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6930776834487915, + "num_tokens": 194526469.0, + "step": 7520 + }, + { + "epoch": 0.8259389413573468, + "grad_norm": 1.7807273864746094, + "learning_rate": 5e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7228161096572876, + "num_tokens": 194553558.0, + "step": 7521 + }, + { + "epoch": 0.8260487590599604, + "grad_norm": 1.6999661922454834, + "learning_rate": 5e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6903870701789856, + "num_tokens": 194582171.0, + "step": 7522 + }, + { + "epoch": 0.8261585767625741, + "grad_norm": 1.987252950668335, + "learning_rate": 5e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7067843675613403, + "num_tokens": 194603283.0, + "step": 7523 + }, + { + "epoch": 0.8262683944651877, + "grad_norm": 1.6966615915298462, + "learning_rate": 5e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6814473271369934, + "num_tokens": 194633078.0, + "step": 7524 + }, + { + "epoch": 0.8263782121678015, + "grad_norm": 1.7537208795547485, + "learning_rate": 5e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7003260850906372, + "num_tokens": 194664257.0, + "step": 7525 + }, + { + "epoch": 0.8264880298704151, + "grad_norm": 1.75711989402771, + "learning_rate": 5e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7005226016044617, + "num_tokens": 194690424.0, + "step": 7526 + }, + { + "epoch": 0.8265978475730288, + "grad_norm": 1.8659213781356812, + "learning_rate": 5e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7114208936691284, + "num_tokens": 194713770.0, + "step": 7527 + }, + { + "epoch": 0.8267076652756424, + "grad_norm": 1.7891367673873901, + "learning_rate": 5e-06, + "loss": 1.1126, + "mean_token_accuracy": 0.670133113861084, + "num_tokens": 194742856.0, + "step": 7528 + }, + { + "epoch": 0.8268174829782561, + "grad_norm": 1.7535321712493896, + "learning_rate": 5e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6902099847793579, + "num_tokens": 194769868.0, + "step": 7529 + }, + { + "epoch": 0.8269273006808697, + "grad_norm": 1.8035639524459839, + "learning_rate": 5e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6869224905967712, + "num_tokens": 194797117.0, + "step": 7530 + }, + { + "epoch": 0.8270371183834834, + "grad_norm": 1.774170160293579, + "learning_rate": 5e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.678623616695404, + "num_tokens": 194825235.0, + "step": 7531 + }, + { + "epoch": 0.8271469360860971, + "grad_norm": 1.740838885307312, + "learning_rate": 5e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7155998945236206, + "num_tokens": 194851001.0, + "step": 7532 + }, + { + "epoch": 0.8272567537887108, + "grad_norm": 1.7343182563781738, + "learning_rate": 5e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7207823395729065, + "num_tokens": 194875000.0, + "step": 7533 + }, + { + "epoch": 0.8273665714913244, + "grad_norm": 1.6675537824630737, + "learning_rate": 5e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7108926773071289, + "num_tokens": 194901905.0, + "step": 7534 + }, + { + "epoch": 0.8274763891939381, + "grad_norm": 1.6644542217254639, + "learning_rate": 5e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7318190932273865, + "num_tokens": 194926151.0, + "step": 7535 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 1.676466703414917, + "learning_rate": 5e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7016189098358154, + "num_tokens": 194953909.0, + "step": 7536 + }, + { + "epoch": 0.8276960245991654, + "grad_norm": 1.834397315979004, + "learning_rate": 5e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6751342415809631, + "num_tokens": 194982299.0, + "step": 7537 + }, + { + "epoch": 0.827805842301779, + "grad_norm": 1.7561283111572266, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.740366518497467, + "num_tokens": 195005450.0, + "step": 7538 + }, + { + "epoch": 0.8279156600043928, + "grad_norm": 1.763962984085083, + "learning_rate": 5e-06, + "loss": 0.973, + "mean_token_accuracy": 0.6972446441650391, + "num_tokens": 195031579.0, + "step": 7539 + }, + { + "epoch": 0.8280254777070064, + "grad_norm": 1.6113742589950562, + "learning_rate": 5e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6960906982421875, + "num_tokens": 195065918.0, + "step": 7540 + }, + { + "epoch": 0.82813529540962, + "grad_norm": 1.9487452507019043, + "learning_rate": 5e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7116173505783081, + "num_tokens": 195088676.0, + "step": 7541 + }, + { + "epoch": 0.8282451131122337, + "grad_norm": 1.606131672859192, + "learning_rate": 5e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6953019499778748, + "num_tokens": 195120527.0, + "step": 7542 + }, + { + "epoch": 0.8283549308148473, + "grad_norm": 1.6575316190719604, + "learning_rate": 5e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6908892393112183, + "num_tokens": 195151524.0, + "step": 7543 + }, + { + "epoch": 0.828464748517461, + "grad_norm": 1.5956735610961914, + "learning_rate": 5e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.721623420715332, + "num_tokens": 195179675.0, + "step": 7544 + }, + { + "epoch": 0.8285745662200746, + "grad_norm": 1.7410258054733276, + "learning_rate": 5e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6953803896903992, + "num_tokens": 195207893.0, + "step": 7545 + }, + { + "epoch": 0.8286843839226883, + "grad_norm": 1.692467451095581, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7323088049888611, + "num_tokens": 195234332.0, + "step": 7546 + }, + { + "epoch": 0.828794201625302, + "grad_norm": 1.864575743675232, + "learning_rate": 5e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6967648267745972, + "num_tokens": 195257616.0, + "step": 7547 + }, + { + "epoch": 0.8289040193279157, + "grad_norm": 1.6905254125595093, + "learning_rate": 5e-06, + "loss": 1.002, + "mean_token_accuracy": 0.693202555179596, + "num_tokens": 195286989.0, + "step": 7548 + }, + { + "epoch": 0.8290138370305293, + "grad_norm": 1.6761890649795532, + "learning_rate": 5e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6867066621780396, + "num_tokens": 195319075.0, + "step": 7549 + }, + { + "epoch": 0.829123654733143, + "grad_norm": 1.995406985282898, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7377574443817139, + "num_tokens": 195339849.0, + "step": 7550 + }, + { + "epoch": 0.8292334724357566, + "grad_norm": 1.8174842596054077, + "learning_rate": 5e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7273749113082886, + "num_tokens": 195362925.0, + "step": 7551 + }, + { + "epoch": 0.8293432901383703, + "grad_norm": 1.793877363204956, + "learning_rate": 5e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6909655332565308, + "num_tokens": 195388769.0, + "step": 7552 + }, + { + "epoch": 0.8294531078409839, + "grad_norm": 1.9754747152328491, + "learning_rate": 5e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6987027525901794, + "num_tokens": 195413719.0, + "step": 7553 + }, + { + "epoch": 0.8295629255435977, + "grad_norm": 1.6913273334503174, + "learning_rate": 5e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.707589864730835, + "num_tokens": 195440107.0, + "step": 7554 + }, + { + "epoch": 0.8296727432462113, + "grad_norm": 1.5927503108978271, + "learning_rate": 5e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7006962299346924, + "num_tokens": 195472007.0, + "step": 7555 + }, + { + "epoch": 0.829782560948825, + "grad_norm": 1.7261149883270264, + "learning_rate": 5e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6958506107330322, + "num_tokens": 195500814.0, + "step": 7556 + }, + { + "epoch": 0.8298923786514386, + "grad_norm": 1.7148680686950684, + "learning_rate": 5e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7100441455841064, + "num_tokens": 195527367.0, + "step": 7557 + }, + { + "epoch": 0.8300021963540523, + "grad_norm": 1.7149537801742554, + "learning_rate": 5e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6905357837677002, + "num_tokens": 195555321.0, + "step": 7558 + }, + { + "epoch": 0.8301120140566659, + "grad_norm": 1.6998244524002075, + "learning_rate": 5e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6931160688400269, + "num_tokens": 195585556.0, + "step": 7559 + }, + { + "epoch": 0.8302218317592795, + "grad_norm": 1.8254263401031494, + "learning_rate": 5e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7053263187408447, + "num_tokens": 195609634.0, + "step": 7560 + }, + { + "epoch": 0.8303316494618933, + "grad_norm": 1.9171839952468872, + "learning_rate": 5e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.7007757425308228, + "num_tokens": 195635756.0, + "step": 7561 + }, + { + "epoch": 0.830441467164507, + "grad_norm": 1.7630056142807007, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7189633846282959, + "num_tokens": 195660886.0, + "step": 7562 + }, + { + "epoch": 0.8305512848671206, + "grad_norm": 1.7951154708862305, + "learning_rate": 5e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6824808716773987, + "num_tokens": 195687758.0, + "step": 7563 + }, + { + "epoch": 0.8306611025697342, + "grad_norm": 1.806606411933899, + "learning_rate": 5e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7085961699485779, + "num_tokens": 195712803.0, + "step": 7564 + }, + { + "epoch": 0.8307709202723479, + "grad_norm": 1.5947227478027344, + "learning_rate": 5e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7061043381690979, + "num_tokens": 195743314.0, + "step": 7565 + }, + { + "epoch": 0.8308807379749615, + "grad_norm": 1.6952028274536133, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7263420820236206, + "num_tokens": 195769237.0, + "step": 7566 + }, + { + "epoch": 0.8309905556775752, + "grad_norm": 1.8524707555770874, + "learning_rate": 5e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6787984371185303, + "num_tokens": 195796694.0, + "step": 7567 + }, + { + "epoch": 0.8311003733801889, + "grad_norm": 2.112861394882202, + "learning_rate": 5e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7105104923248291, + "num_tokens": 195818916.0, + "step": 7568 + }, + { + "epoch": 0.8312101910828026, + "grad_norm": 1.9703071117401123, + "learning_rate": 5e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7015529870986938, + "num_tokens": 195841419.0, + "step": 7569 + }, + { + "epoch": 0.8313200087854162, + "grad_norm": 1.6482768058776855, + "learning_rate": 5e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6928901672363281, + "num_tokens": 195870793.0, + "step": 7570 + }, + { + "epoch": 0.8314298264880299, + "grad_norm": 1.836241364479065, + "learning_rate": 5e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7458387613296509, + "num_tokens": 195893300.0, + "step": 7571 + }, + { + "epoch": 0.8315396441906435, + "grad_norm": 1.6167998313903809, + "learning_rate": 5e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6818567514419556, + "num_tokens": 195924552.0, + "step": 7572 + }, + { + "epoch": 0.8316494618932572, + "grad_norm": 1.625609278678894, + "learning_rate": 5e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6944202780723572, + "num_tokens": 195956565.0, + "step": 7573 + }, + { + "epoch": 0.8317592795958708, + "grad_norm": 1.6591107845306396, + "learning_rate": 5e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7411019206047058, + "num_tokens": 195982383.0, + "step": 7574 + }, + { + "epoch": 0.8318690972984845, + "grad_norm": 2.0850021839141846, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7194539308547974, + "num_tokens": 196001938.0, + "step": 7575 + }, + { + "epoch": 0.8319789150010982, + "grad_norm": 1.7763851881027222, + "learning_rate": 5e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7116954326629639, + "num_tokens": 196027067.0, + "step": 7576 + }, + { + "epoch": 0.8320887327037119, + "grad_norm": 1.8135662078857422, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.731158971786499, + "num_tokens": 196050797.0, + "step": 7577 + }, + { + "epoch": 0.8321985504063255, + "grad_norm": 1.8503942489624023, + "learning_rate": 5e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.695623517036438, + "num_tokens": 196076504.0, + "step": 7578 + }, + { + "epoch": 0.8323083681089392, + "grad_norm": 1.8656718730926514, + "learning_rate": 5e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7141067981719971, + "num_tokens": 196101491.0, + "step": 7579 + }, + { + "epoch": 0.8324181858115528, + "grad_norm": 2.0512025356292725, + "learning_rate": 5e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7277061939239502, + "num_tokens": 196123430.0, + "step": 7580 + }, + { + "epoch": 0.8325280035141664, + "grad_norm": 1.7574084997177124, + "learning_rate": 5e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6984784603118896, + "num_tokens": 196151181.0, + "step": 7581 + }, + { + "epoch": 0.8326378212167801, + "grad_norm": 1.6199257373809814, + "learning_rate": 5e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6816328763961792, + "num_tokens": 196180942.0, + "step": 7582 + }, + { + "epoch": 0.8327476389193939, + "grad_norm": 1.7827907800674438, + "learning_rate": 5e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7118825316429138, + "num_tokens": 196206079.0, + "step": 7583 + }, + { + "epoch": 0.8328574566220075, + "grad_norm": 1.724161148071289, + "learning_rate": 5e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7120860815048218, + "num_tokens": 196233034.0, + "step": 7584 + }, + { + "epoch": 0.8329672743246211, + "grad_norm": 1.8535526990890503, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7189986705780029, + "num_tokens": 196254676.0, + "step": 7585 + }, + { + "epoch": 0.8330770920272348, + "grad_norm": 1.5967739820480347, + "learning_rate": 5e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6769180297851562, + "num_tokens": 196284745.0, + "step": 7586 + }, + { + "epoch": 0.8331869097298484, + "grad_norm": 1.5875955820083618, + "learning_rate": 5e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7028287649154663, + "num_tokens": 196318013.0, + "step": 7587 + }, + { + "epoch": 0.8332967274324621, + "grad_norm": 1.7030019760131836, + "learning_rate": 5e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.6935189962387085, + "num_tokens": 196347577.0, + "step": 7588 + }, + { + "epoch": 0.8334065451350757, + "grad_norm": 1.6898033618927002, + "learning_rate": 5e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.6972275972366333, + "num_tokens": 196379951.0, + "step": 7589 + }, + { + "epoch": 0.8335163628376895, + "grad_norm": 1.747214674949646, + "learning_rate": 5e-06, + "loss": 1.0789, + "mean_token_accuracy": 0.6833872199058533, + "num_tokens": 196408712.0, + "step": 7590 + }, + { + "epoch": 0.8336261805403031, + "grad_norm": 1.9187724590301514, + "learning_rate": 5e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7018673419952393, + "num_tokens": 196429030.0, + "step": 7591 + }, + { + "epoch": 0.8337359982429168, + "grad_norm": 1.8441568613052368, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.6992114782333374, + "num_tokens": 196454356.0, + "step": 7592 + }, + { + "epoch": 0.8338458159455304, + "grad_norm": 1.8922710418701172, + "learning_rate": 5e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6928666830062866, + "num_tokens": 196480728.0, + "step": 7593 + }, + { + "epoch": 0.8339556336481441, + "grad_norm": 2.0499677658081055, + "learning_rate": 5e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.717497706413269, + "num_tokens": 196500796.0, + "step": 7594 + }, + { + "epoch": 0.8340654513507577, + "grad_norm": 1.6789957284927368, + "learning_rate": 5e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.6776414513587952, + "num_tokens": 196533512.0, + "step": 7595 + }, + { + "epoch": 0.8341752690533714, + "grad_norm": 1.9256891012191772, + "learning_rate": 5e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7152444124221802, + "num_tokens": 196554319.0, + "step": 7596 + }, + { + "epoch": 0.8342850867559851, + "grad_norm": 1.6694939136505127, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7075710296630859, + "num_tokens": 196581841.0, + "step": 7597 + }, + { + "epoch": 0.8343949044585988, + "grad_norm": 1.8975727558135986, + "learning_rate": 5e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7096244096755981, + "num_tokens": 196608228.0, + "step": 7598 + }, + { + "epoch": 0.8345047221612124, + "grad_norm": 1.717107892036438, + "learning_rate": 5e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.69761061668396, + "num_tokens": 196636844.0, + "step": 7599 + }, + { + "epoch": 0.834614539863826, + "grad_norm": 1.9302715063095093, + "learning_rate": 5e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7121423482894897, + "num_tokens": 196660034.0, + "step": 7600 + }, + { + "epoch": 0.8347243575664397, + "grad_norm": 1.703731894493103, + "learning_rate": 5e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7015448212623596, + "num_tokens": 196688195.0, + "step": 7601 + }, + { + "epoch": 0.8348341752690533, + "grad_norm": 1.7435050010681152, + "learning_rate": 5e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7250629663467407, + "num_tokens": 196714709.0, + "step": 7602 + }, + { + "epoch": 0.834943992971667, + "grad_norm": 1.7390884160995483, + "learning_rate": 5e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7066513299942017, + "num_tokens": 196740439.0, + "step": 7603 + }, + { + "epoch": 0.8350538106742806, + "grad_norm": 1.733608365058899, + "learning_rate": 5e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7096596956253052, + "num_tokens": 196769637.0, + "step": 7604 + }, + { + "epoch": 0.8351636283768944, + "grad_norm": 1.8329228162765503, + "learning_rate": 5e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7101122140884399, + "num_tokens": 196792957.0, + "step": 7605 + }, + { + "epoch": 0.835273446079508, + "grad_norm": 2.0029635429382324, + "learning_rate": 5e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.6955124139785767, + "num_tokens": 196814896.0, + "step": 7606 + }, + { + "epoch": 0.8353832637821217, + "grad_norm": 1.9639577865600586, + "learning_rate": 5e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7176860570907593, + "num_tokens": 196837631.0, + "step": 7607 + }, + { + "epoch": 0.8354930814847353, + "grad_norm": 1.7732288837432861, + "learning_rate": 5e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6966614723205566, + "num_tokens": 196864656.0, + "step": 7608 + }, + { + "epoch": 0.835602899187349, + "grad_norm": 1.9597309827804565, + "learning_rate": 5e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7098289132118225, + "num_tokens": 196886485.0, + "step": 7609 + }, + { + "epoch": 0.8357127168899626, + "grad_norm": 1.7279731035232544, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7165725827217102, + "num_tokens": 196911396.0, + "step": 7610 + }, + { + "epoch": 0.8358225345925763, + "grad_norm": 1.9582033157348633, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7235451340675354, + "num_tokens": 196933497.0, + "step": 7611 + }, + { + "epoch": 0.83593235229519, + "grad_norm": 1.5366055965423584, + "learning_rate": 5e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.690815806388855, + "num_tokens": 196967170.0, + "step": 7612 + }, + { + "epoch": 0.8360421699978037, + "grad_norm": 1.810152292251587, + "learning_rate": 5e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6865847706794739, + "num_tokens": 196992629.0, + "step": 7613 + }, + { + "epoch": 0.8361519877004173, + "grad_norm": 1.9606857299804688, + "learning_rate": 5e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7170629501342773, + "num_tokens": 197014365.0, + "step": 7614 + }, + { + "epoch": 0.836261805403031, + "grad_norm": 1.6249336004257202, + "learning_rate": 5e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6898863315582275, + "num_tokens": 197046324.0, + "step": 7615 + }, + { + "epoch": 0.8363716231056446, + "grad_norm": 1.630352258682251, + "learning_rate": 5e-06, + "loss": 1.1267, + "mean_token_accuracy": 0.6610451936721802, + "num_tokens": 197080673.0, + "step": 7616 + }, + { + "epoch": 0.8364814408082583, + "grad_norm": 1.834403395652771, + "learning_rate": 5e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6905804872512817, + "num_tokens": 197107226.0, + "step": 7617 + }, + { + "epoch": 0.8365912585108719, + "grad_norm": 1.8122797012329102, + "learning_rate": 5e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7259191274642944, + "num_tokens": 197129188.0, + "step": 7618 + }, + { + "epoch": 0.8367010762134857, + "grad_norm": 1.6836206912994385, + "learning_rate": 5e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6857641339302063, + "num_tokens": 197159974.0, + "step": 7619 + }, + { + "epoch": 0.8368108939160993, + "grad_norm": 1.6068024635314941, + "learning_rate": 5e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7132053971290588, + "num_tokens": 197188844.0, + "step": 7620 + }, + { + "epoch": 0.836920711618713, + "grad_norm": 1.8545048236846924, + "learning_rate": 5e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7105754017829895, + "num_tokens": 197211605.0, + "step": 7621 + }, + { + "epoch": 0.8370305293213266, + "grad_norm": 1.7377244234085083, + "learning_rate": 5e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7285565137863159, + "num_tokens": 197236873.0, + "step": 7622 + }, + { + "epoch": 0.8371403470239402, + "grad_norm": 1.8048841953277588, + "learning_rate": 5e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.716420590877533, + "num_tokens": 197262331.0, + "step": 7623 + }, + { + "epoch": 0.8372501647265539, + "grad_norm": 1.6276811361312866, + "learning_rate": 5e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7335399985313416, + "num_tokens": 197289913.0, + "step": 7624 + }, + { + "epoch": 0.8373599824291675, + "grad_norm": 1.8423352241516113, + "learning_rate": 5e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7097798585891724, + "num_tokens": 197315487.0, + "step": 7625 + }, + { + "epoch": 0.8374698001317813, + "grad_norm": 1.6659576892852783, + "learning_rate": 5e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6942944526672363, + "num_tokens": 197346097.0, + "step": 7626 + }, + { + "epoch": 0.8375796178343949, + "grad_norm": 1.9007784128189087, + "learning_rate": 5e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7210072875022888, + "num_tokens": 197371021.0, + "step": 7627 + }, + { + "epoch": 0.8376894355370086, + "grad_norm": 1.796290636062622, + "learning_rate": 5e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6942205429077148, + "num_tokens": 197396251.0, + "step": 7628 + }, + { + "epoch": 0.8377992532396222, + "grad_norm": 2.190732955932617, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.739591121673584, + "num_tokens": 197414113.0, + "step": 7629 + }, + { + "epoch": 0.8379090709422359, + "grad_norm": 1.7831997871398926, + "learning_rate": 5e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7035542726516724, + "num_tokens": 197438787.0, + "step": 7630 + }, + { + "epoch": 0.8380188886448495, + "grad_norm": 1.8916914463043213, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7212734222412109, + "num_tokens": 197461020.0, + "step": 7631 + }, + { + "epoch": 0.8381287063474632, + "grad_norm": 1.6529228687286377, + "learning_rate": 5e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.7022226452827454, + "num_tokens": 197493003.0, + "step": 7632 + }, + { + "epoch": 0.8382385240500769, + "grad_norm": 1.7236213684082031, + "learning_rate": 5e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7071616053581238, + "num_tokens": 197523313.0, + "step": 7633 + }, + { + "epoch": 0.8383483417526906, + "grad_norm": 1.8152296543121338, + "learning_rate": 5e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.693570613861084, + "num_tokens": 197549334.0, + "step": 7634 + }, + { + "epoch": 0.8384581594553042, + "grad_norm": 1.6281018257141113, + "learning_rate": 5e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7079242467880249, + "num_tokens": 197579831.0, + "step": 7635 + }, + { + "epoch": 0.8385679771579179, + "grad_norm": 1.8509159088134766, + "learning_rate": 5e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.718511700630188, + "num_tokens": 197602857.0, + "step": 7636 + }, + { + "epoch": 0.8386777948605315, + "grad_norm": 1.6860696077346802, + "learning_rate": 5e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.6940504908561707, + "num_tokens": 197631762.0, + "step": 7637 + }, + { + "epoch": 0.8387876125631452, + "grad_norm": 1.9827163219451904, + "learning_rate": 5e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7043024301528931, + "num_tokens": 197652515.0, + "step": 7638 + }, + { + "epoch": 0.8388974302657588, + "grad_norm": 1.8874783515930176, + "learning_rate": 5e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7026785612106323, + "num_tokens": 197675677.0, + "step": 7639 + }, + { + "epoch": 0.8390072479683724, + "grad_norm": 1.77503502368927, + "learning_rate": 5e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.709041178226471, + "num_tokens": 197703257.0, + "step": 7640 + }, + { + "epoch": 0.8391170656709862, + "grad_norm": 1.776540756225586, + "learning_rate": 5e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7105196714401245, + "num_tokens": 197732603.0, + "step": 7641 + }, + { + "epoch": 0.8392268833735999, + "grad_norm": 1.743637204170227, + "learning_rate": 5e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7049677968025208, + "num_tokens": 197760744.0, + "step": 7642 + }, + { + "epoch": 0.8393367010762135, + "grad_norm": 1.6657435894012451, + "learning_rate": 5e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7443218231201172, + "num_tokens": 197787862.0, + "step": 7643 + }, + { + "epoch": 0.8394465187788271, + "grad_norm": 1.61849045753479, + "learning_rate": 5e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7073186635971069, + "num_tokens": 197819589.0, + "step": 7644 + }, + { + "epoch": 0.8395563364814408, + "grad_norm": 1.5435221195220947, + "learning_rate": 5e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6803103685379028, + "num_tokens": 197853701.0, + "step": 7645 + }, + { + "epoch": 0.8396661541840544, + "grad_norm": 1.7924164533615112, + "learning_rate": 5e-06, + "loss": 0.96, + "mean_token_accuracy": 0.707840085029602, + "num_tokens": 197879627.0, + "step": 7646 + }, + { + "epoch": 0.8397759718866681, + "grad_norm": 2.2126481533050537, + "learning_rate": 5e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6947447061538696, + "num_tokens": 197900297.0, + "step": 7647 + }, + { + "epoch": 0.8398857895892818, + "grad_norm": 1.923011064529419, + "learning_rate": 5e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7007411122322083, + "num_tokens": 197923635.0, + "step": 7648 + }, + { + "epoch": 0.8399956072918955, + "grad_norm": 1.7370299100875854, + "learning_rate": 5e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6970584392547607, + "num_tokens": 197949877.0, + "step": 7649 + }, + { + "epoch": 0.8401054249945091, + "grad_norm": 1.7193355560302734, + "learning_rate": 5e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7071384191513062, + "num_tokens": 197976320.0, + "step": 7650 + }, + { + "epoch": 0.8402152426971228, + "grad_norm": 1.9545015096664429, + "learning_rate": 5e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7126679420471191, + "num_tokens": 197998365.0, + "step": 7651 + }, + { + "epoch": 0.8403250603997364, + "grad_norm": 1.667618989944458, + "learning_rate": 5e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.6915785074234009, + "num_tokens": 198030935.0, + "step": 7652 + }, + { + "epoch": 0.8404348781023501, + "grad_norm": 1.6401258707046509, + "learning_rate": 5e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7003411054611206, + "num_tokens": 198061732.0, + "step": 7653 + }, + { + "epoch": 0.8405446958049637, + "grad_norm": 1.7416976690292358, + "learning_rate": 5e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.690698504447937, + "num_tokens": 198088356.0, + "step": 7654 + }, + { + "epoch": 0.8406545135075775, + "grad_norm": 2.0915865898132324, + "learning_rate": 5e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7087056040763855, + "num_tokens": 198111834.0, + "step": 7655 + }, + { + "epoch": 0.8407643312101911, + "grad_norm": 2.0921783447265625, + "learning_rate": 5e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7263236045837402, + "num_tokens": 198133410.0, + "step": 7656 + }, + { + "epoch": 0.8408741489128048, + "grad_norm": 1.7043856382369995, + "learning_rate": 5e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7191940546035767, + "num_tokens": 198157712.0, + "step": 7657 + }, + { + "epoch": 0.8409839666154184, + "grad_norm": 1.7238121032714844, + "learning_rate": 5e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7164239883422852, + "num_tokens": 198182657.0, + "step": 7658 + }, + { + "epoch": 0.8410937843180321, + "grad_norm": 1.8277621269226074, + "learning_rate": 5e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.701257586479187, + "num_tokens": 198208742.0, + "step": 7659 + }, + { + "epoch": 0.8412036020206457, + "grad_norm": 1.7546498775482178, + "learning_rate": 5e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7072559595108032, + "num_tokens": 198234814.0, + "step": 7660 + }, + { + "epoch": 0.8413134197232593, + "grad_norm": 1.9696904420852661, + "learning_rate": 5e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.707033634185791, + "num_tokens": 198259136.0, + "step": 7661 + }, + { + "epoch": 0.8414232374258731, + "grad_norm": 1.6555277109146118, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7034116387367249, + "num_tokens": 198286913.0, + "step": 7662 + }, + { + "epoch": 0.8415330551284868, + "grad_norm": 1.793588399887085, + "learning_rate": 5e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7252006530761719, + "num_tokens": 198311348.0, + "step": 7663 + }, + { + "epoch": 0.8416428728311004, + "grad_norm": 1.6998534202575684, + "learning_rate": 5e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7099815607070923, + "num_tokens": 198339854.0, + "step": 7664 + }, + { + "epoch": 0.841752690533714, + "grad_norm": 1.7902811765670776, + "learning_rate": 5e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7070574760437012, + "num_tokens": 198366200.0, + "step": 7665 + }, + { + "epoch": 0.8418625082363277, + "grad_norm": 1.6163631677627563, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7014198899269104, + "num_tokens": 198395522.0, + "step": 7666 + }, + { + "epoch": 0.8419723259389413, + "grad_norm": 1.9906469583511353, + "learning_rate": 5e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7122057676315308, + "num_tokens": 198416663.0, + "step": 7667 + }, + { + "epoch": 0.842082143641555, + "grad_norm": 1.7902255058288574, + "learning_rate": 5e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6926999092102051, + "num_tokens": 198441986.0, + "step": 7668 + }, + { + "epoch": 0.8421919613441686, + "grad_norm": 1.8311501741409302, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7232615947723389, + "num_tokens": 198466229.0, + "step": 7669 + }, + { + "epoch": 0.8423017790467824, + "grad_norm": 1.7394980192184448, + "learning_rate": 5e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6876881718635559, + "num_tokens": 198494197.0, + "step": 7670 + }, + { + "epoch": 0.842411596749396, + "grad_norm": 1.6172263622283936, + "learning_rate": 5e-06, + "loss": 1.029, + "mean_token_accuracy": 0.697074294090271, + "num_tokens": 198527005.0, + "step": 7671 + }, + { + "epoch": 0.8425214144520097, + "grad_norm": 1.746861457824707, + "learning_rate": 5e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6758009195327759, + "num_tokens": 198555891.0, + "step": 7672 + }, + { + "epoch": 0.8426312321546233, + "grad_norm": 1.9671052694320679, + "learning_rate": 5e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7007051110267639, + "num_tokens": 198581580.0, + "step": 7673 + }, + { + "epoch": 0.842741049857237, + "grad_norm": 1.4830349683761597, + "learning_rate": 5e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6950058341026306, + "num_tokens": 198612927.0, + "step": 7674 + }, + { + "epoch": 0.8428508675598506, + "grad_norm": 1.8954287767410278, + "learning_rate": 5e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7200285792350769, + "num_tokens": 198635558.0, + "step": 7675 + }, + { + "epoch": 0.8429606852624643, + "grad_norm": 1.9072918891906738, + "learning_rate": 5e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6973928213119507, + "num_tokens": 198661624.0, + "step": 7676 + }, + { + "epoch": 0.843070502965078, + "grad_norm": 1.6318292617797852, + "learning_rate": 5e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.7001445889472961, + "num_tokens": 198693724.0, + "step": 7677 + }, + { + "epoch": 0.8431803206676917, + "grad_norm": 1.9468846321105957, + "learning_rate": 5e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6967851519584656, + "num_tokens": 198718422.0, + "step": 7678 + }, + { + "epoch": 0.8432901383703053, + "grad_norm": 1.7392163276672363, + "learning_rate": 5e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7154219150543213, + "num_tokens": 198743521.0, + "step": 7679 + }, + { + "epoch": 0.843399956072919, + "grad_norm": 1.744294285774231, + "learning_rate": 5e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6937611699104309, + "num_tokens": 198768483.0, + "step": 7680 + }, + { + "epoch": 0.8435097737755326, + "grad_norm": 2.014885425567627, + "learning_rate": 5e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.73834627866745, + "num_tokens": 198788851.0, + "step": 7681 + }, + { + "epoch": 0.8436195914781462, + "grad_norm": 1.8029879331588745, + "learning_rate": 5e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.684154748916626, + "num_tokens": 198816908.0, + "step": 7682 + }, + { + "epoch": 0.8437294091807599, + "grad_norm": 1.755321979522705, + "learning_rate": 5e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6895231008529663, + "num_tokens": 198845938.0, + "step": 7683 + }, + { + "epoch": 0.8438392268833736, + "grad_norm": 1.7629603147506714, + "learning_rate": 5e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6899110078811646, + "num_tokens": 198875802.0, + "step": 7684 + }, + { + "epoch": 0.8439490445859873, + "grad_norm": 1.8229854106903076, + "learning_rate": 5e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7149044275283813, + "num_tokens": 198900567.0, + "step": 7685 + }, + { + "epoch": 0.8440588622886009, + "grad_norm": 1.7446327209472656, + "learning_rate": 5e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6959227919578552, + "num_tokens": 198925165.0, + "step": 7686 + }, + { + "epoch": 0.8441686799912146, + "grad_norm": 1.6469764709472656, + "learning_rate": 5e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.6989953517913818, + "num_tokens": 198955685.0, + "step": 7687 + }, + { + "epoch": 0.8442784976938282, + "grad_norm": 1.6504170894622803, + "learning_rate": 5e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7000513076782227, + "num_tokens": 198987857.0, + "step": 7688 + }, + { + "epoch": 0.8443883153964419, + "grad_norm": 1.8370224237442017, + "learning_rate": 5e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7254328727722168, + "num_tokens": 199010641.0, + "step": 7689 + }, + { + "epoch": 0.8444981330990555, + "grad_norm": 2.0139613151550293, + "learning_rate": 5e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7079473733901978, + "num_tokens": 199030595.0, + "step": 7690 + }, + { + "epoch": 0.8446079508016693, + "grad_norm": 1.603803277015686, + "learning_rate": 5e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.706017255783081, + "num_tokens": 199061982.0, + "step": 7691 + }, + { + "epoch": 0.8447177685042829, + "grad_norm": 1.9502707719802856, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.706669807434082, + "num_tokens": 199083978.0, + "step": 7692 + }, + { + "epoch": 0.8448275862068966, + "grad_norm": 2.029447078704834, + "learning_rate": 5e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.6970170736312866, + "num_tokens": 199111747.0, + "step": 7693 + }, + { + "epoch": 0.8449374039095102, + "grad_norm": 1.6109470129013062, + "learning_rate": 5e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7256886959075928, + "num_tokens": 199139193.0, + "step": 7694 + }, + { + "epoch": 0.8450472216121239, + "grad_norm": 1.7950944900512695, + "learning_rate": 5e-06, + "loss": 1.0818, + "mean_token_accuracy": 0.6745716333389282, + "num_tokens": 199167696.0, + "step": 7695 + }, + { + "epoch": 0.8451570393147375, + "grad_norm": 1.9652221202850342, + "learning_rate": 5e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6986879706382751, + "num_tokens": 199190842.0, + "step": 7696 + }, + { + "epoch": 0.8452668570173512, + "grad_norm": 1.813402771949768, + "learning_rate": 5e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7067075967788696, + "num_tokens": 199214300.0, + "step": 7697 + }, + { + "epoch": 0.8453766747199648, + "grad_norm": 1.9840112924575806, + "learning_rate": 5e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7074398994445801, + "num_tokens": 199236998.0, + "step": 7698 + }, + { + "epoch": 0.8454864924225786, + "grad_norm": 1.8663729429244995, + "learning_rate": 5e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6953204870223999, + "num_tokens": 199260663.0, + "step": 7699 + }, + { + "epoch": 0.8455963101251922, + "grad_norm": 1.7005467414855957, + "learning_rate": 5e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7162163853645325, + "num_tokens": 199290471.0, + "step": 7700 + }, + { + "epoch": 0.8457061278278059, + "grad_norm": 1.6856276988983154, + "learning_rate": 5e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7039585113525391, + "num_tokens": 199319849.0, + "step": 7701 + }, + { + "epoch": 0.8458159455304195, + "grad_norm": 1.7090378999710083, + "learning_rate": 5e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7275291681289673, + "num_tokens": 199345560.0, + "step": 7702 + }, + { + "epoch": 0.8459257632330331, + "grad_norm": 1.7469969987869263, + "learning_rate": 5e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7037830352783203, + "num_tokens": 199372998.0, + "step": 7703 + }, + { + "epoch": 0.8460355809356468, + "grad_norm": 1.6201581954956055, + "learning_rate": 5e-06, + "loss": 1.048, + "mean_token_accuracy": 0.689750611782074, + "num_tokens": 199405770.0, + "step": 7704 + }, + { + "epoch": 0.8461453986382604, + "grad_norm": 1.7732744216918945, + "learning_rate": 5e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7375257611274719, + "num_tokens": 199428421.0, + "step": 7705 + }, + { + "epoch": 0.8462552163408742, + "grad_norm": 1.7767366170883179, + "learning_rate": 5e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6933835744857788, + "num_tokens": 199453315.0, + "step": 7706 + }, + { + "epoch": 0.8463650340434878, + "grad_norm": 1.703260898590088, + "learning_rate": 5e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.6995055079460144, + "num_tokens": 199481915.0, + "step": 7707 + }, + { + "epoch": 0.8464748517461015, + "grad_norm": 1.7763885259628296, + "learning_rate": 5e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7125864624977112, + "num_tokens": 199507770.0, + "step": 7708 + }, + { + "epoch": 0.8465846694487151, + "grad_norm": 1.7319458723068237, + "learning_rate": 5e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7109540700912476, + "num_tokens": 199535590.0, + "step": 7709 + }, + { + "epoch": 0.8466944871513288, + "grad_norm": 1.7606899738311768, + "learning_rate": 5e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7034229636192322, + "num_tokens": 199562615.0, + "step": 7710 + }, + { + "epoch": 0.8468043048539424, + "grad_norm": 1.782792329788208, + "learning_rate": 5e-06, + "loss": 1.1226, + "mean_token_accuracy": 0.66875159740448, + "num_tokens": 199591990.0, + "step": 7711 + }, + { + "epoch": 0.8469141225565561, + "grad_norm": 1.750158667564392, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.6980994939804077, + "num_tokens": 199620649.0, + "step": 7712 + }, + { + "epoch": 0.8470239402591698, + "grad_norm": 1.873761773109436, + "learning_rate": 5e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7049404382705688, + "num_tokens": 199644034.0, + "step": 7713 + }, + { + "epoch": 0.8471337579617835, + "grad_norm": 1.7717465162277222, + "learning_rate": 5e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6946780681610107, + "num_tokens": 199672201.0, + "step": 7714 + }, + { + "epoch": 0.8472435756643971, + "grad_norm": 1.8472191095352173, + "learning_rate": 5e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6933009028434753, + "num_tokens": 199698129.0, + "step": 7715 + }, + { + "epoch": 0.8473533933670108, + "grad_norm": 1.8890979290008545, + "learning_rate": 5e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7298027276992798, + "num_tokens": 199720628.0, + "step": 7716 + }, + { + "epoch": 0.8474632110696244, + "grad_norm": 1.8374987840652466, + "learning_rate": 5e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7026581168174744, + "num_tokens": 199745727.0, + "step": 7717 + }, + { + "epoch": 0.8475730287722381, + "grad_norm": 1.8258734941482544, + "learning_rate": 5e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7224959135055542, + "num_tokens": 199769066.0, + "step": 7718 + }, + { + "epoch": 0.8476828464748517, + "grad_norm": 1.8416160345077515, + "learning_rate": 5e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6936614513397217, + "num_tokens": 199795000.0, + "step": 7719 + }, + { + "epoch": 0.8477926641774655, + "grad_norm": 1.7814922332763672, + "learning_rate": 5e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6935194730758667, + "num_tokens": 199819787.0, + "step": 7720 + }, + { + "epoch": 0.8479024818800791, + "grad_norm": 1.9563485383987427, + "learning_rate": 5e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7052510380744934, + "num_tokens": 199842146.0, + "step": 7721 + }, + { + "epoch": 0.8480122995826928, + "grad_norm": 1.554352879524231, + "learning_rate": 5e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7008252143859863, + "num_tokens": 199876423.0, + "step": 7722 + }, + { + "epoch": 0.8481221172853064, + "grad_norm": 2.069068193435669, + "learning_rate": 5e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7437714338302612, + "num_tokens": 199894966.0, + "step": 7723 + }, + { + "epoch": 0.84823193498792, + "grad_norm": 1.6933640241622925, + "learning_rate": 5e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7088065147399902, + "num_tokens": 199922243.0, + "step": 7724 + }, + { + "epoch": 0.8483417526905337, + "grad_norm": 1.868064045906067, + "learning_rate": 5e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6874186992645264, + "num_tokens": 199945759.0, + "step": 7725 + }, + { + "epoch": 0.8484515703931473, + "grad_norm": 1.8129686117172241, + "learning_rate": 5e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7068331837654114, + "num_tokens": 199970837.0, + "step": 7726 + }, + { + "epoch": 0.848561388095761, + "grad_norm": 2.1323769092559814, + "learning_rate": 5e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.719538688659668, + "num_tokens": 199987909.0, + "step": 7727 + }, + { + "epoch": 0.8486712057983747, + "grad_norm": 1.4501253366470337, + "learning_rate": 5e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7302781343460083, + "num_tokens": 200023406.0, + "step": 7728 + }, + { + "epoch": 0.8487810235009884, + "grad_norm": 1.8556036949157715, + "learning_rate": 5e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7019984722137451, + "num_tokens": 200048261.0, + "step": 7729 + }, + { + "epoch": 0.848890841203602, + "grad_norm": 1.6443790197372437, + "learning_rate": 5e-06, + "loss": 0.937, + "mean_token_accuracy": 0.712897002696991, + "num_tokens": 200077119.0, + "step": 7730 + }, + { + "epoch": 0.8490006589062157, + "grad_norm": 1.9544072151184082, + "learning_rate": 5e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.6999223232269287, + "num_tokens": 200099106.0, + "step": 7731 + }, + { + "epoch": 0.8491104766088293, + "grad_norm": 1.6420130729675293, + "learning_rate": 5e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6910289525985718, + "num_tokens": 200128551.0, + "step": 7732 + }, + { + "epoch": 0.849220294311443, + "grad_norm": 1.6796433925628662, + "learning_rate": 5e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6965590715408325, + "num_tokens": 200158957.0, + "step": 7733 + }, + { + "epoch": 0.8493301120140566, + "grad_norm": 2.1382226943969727, + "learning_rate": 5e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7270002961158752, + "num_tokens": 200177743.0, + "step": 7734 + }, + { + "epoch": 0.8494399297166704, + "grad_norm": 1.8869656324386597, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7339646220207214, + "num_tokens": 200200658.0, + "step": 7735 + }, + { + "epoch": 0.849549747419284, + "grad_norm": 1.6995232105255127, + "learning_rate": 5e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7308986186981201, + "num_tokens": 200227050.0, + "step": 7736 + }, + { + "epoch": 0.8496595651218977, + "grad_norm": 1.870392918586731, + "learning_rate": 5e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7019385695457458, + "num_tokens": 200253617.0, + "step": 7737 + }, + { + "epoch": 0.8497693828245113, + "grad_norm": 1.721322774887085, + "learning_rate": 5e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7054088115692139, + "num_tokens": 200284687.0, + "step": 7738 + }, + { + "epoch": 0.849879200527125, + "grad_norm": 2.038961410522461, + "learning_rate": 5e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7158881425857544, + "num_tokens": 200306100.0, + "step": 7739 + }, + { + "epoch": 0.8499890182297386, + "grad_norm": 1.9681819677352905, + "learning_rate": 5e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7075208425521851, + "num_tokens": 200327554.0, + "step": 7740 + }, + { + "epoch": 0.8500988359323522, + "grad_norm": 1.6847739219665527, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.70039302110672, + "num_tokens": 200359115.0, + "step": 7741 + }, + { + "epoch": 0.850208653634966, + "grad_norm": 1.7684067487716675, + "learning_rate": 5e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7052675485610962, + "num_tokens": 200385951.0, + "step": 7742 + }, + { + "epoch": 0.8503184713375797, + "grad_norm": 1.9383111000061035, + "learning_rate": 5e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7119873762130737, + "num_tokens": 200408320.0, + "step": 7743 + }, + { + "epoch": 0.8504282890401933, + "grad_norm": 1.8432748317718506, + "learning_rate": 5e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7297643423080444, + "num_tokens": 200430239.0, + "step": 7744 + }, + { + "epoch": 0.8505381067428069, + "grad_norm": 1.6724802255630493, + "learning_rate": 5e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7063174843788147, + "num_tokens": 200459135.0, + "step": 7745 + }, + { + "epoch": 0.8506479244454206, + "grad_norm": 1.8470540046691895, + "learning_rate": 5e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7037709355354309, + "num_tokens": 200482570.0, + "step": 7746 + }, + { + "epoch": 0.8507577421480342, + "grad_norm": 1.99677574634552, + "learning_rate": 5e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.723669171333313, + "num_tokens": 200504061.0, + "step": 7747 + }, + { + "epoch": 0.8508675598506479, + "grad_norm": 1.6704412698745728, + "learning_rate": 5e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6803258061408997, + "num_tokens": 200536545.0, + "step": 7748 + }, + { + "epoch": 0.8509773775532616, + "grad_norm": 1.9271405935287476, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7115171551704407, + "num_tokens": 200561569.0, + "step": 7749 + }, + { + "epoch": 0.8510871952558753, + "grad_norm": 1.6919147968292236, + "learning_rate": 5e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7257939577102661, + "num_tokens": 200589161.0, + "step": 7750 + }, + { + "epoch": 0.8511970129584889, + "grad_norm": 1.8925031423568726, + "learning_rate": 5e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6975752115249634, + "num_tokens": 200614833.0, + "step": 7751 + }, + { + "epoch": 0.8513068306611026, + "grad_norm": 2.110957622528076, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7312251329421997, + "num_tokens": 200632916.0, + "step": 7752 + }, + { + "epoch": 0.8514166483637162, + "grad_norm": 1.9299219846725464, + "learning_rate": 5e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7105562090873718, + "num_tokens": 200655628.0, + "step": 7753 + }, + { + "epoch": 0.8515264660663299, + "grad_norm": 1.723899006843567, + "learning_rate": 5e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7241324186325073, + "num_tokens": 200682090.0, + "step": 7754 + }, + { + "epoch": 0.8516362837689435, + "grad_norm": 1.7657703161239624, + "learning_rate": 5e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7049726247787476, + "num_tokens": 200708468.0, + "step": 7755 + }, + { + "epoch": 0.8517461014715572, + "grad_norm": 2.055891752243042, + "learning_rate": 5e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7256650924682617, + "num_tokens": 200726417.0, + "step": 7756 + }, + { + "epoch": 0.8518559191741709, + "grad_norm": 1.7418391704559326, + "learning_rate": 5e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6955482363700867, + "num_tokens": 200753495.0, + "step": 7757 + }, + { + "epoch": 0.8519657368767846, + "grad_norm": 1.5493232011795044, + "learning_rate": 5e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7066495418548584, + "num_tokens": 200786153.0, + "step": 7758 + }, + { + "epoch": 0.8520755545793982, + "grad_norm": 1.9501748085021973, + "learning_rate": 5e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6872661113739014, + "num_tokens": 200808682.0, + "step": 7759 + }, + { + "epoch": 0.8521853722820119, + "grad_norm": 1.70016348361969, + "learning_rate": 5e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7207043170928955, + "num_tokens": 200833864.0, + "step": 7760 + }, + { + "epoch": 0.8522951899846255, + "grad_norm": 1.8600993156433105, + "learning_rate": 5e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7203034162521362, + "num_tokens": 200857252.0, + "step": 7761 + }, + { + "epoch": 0.8524050076872391, + "grad_norm": 1.758640170097351, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.718691349029541, + "num_tokens": 200881793.0, + "step": 7762 + }, + { + "epoch": 0.8525148253898528, + "grad_norm": 1.7828925848007202, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7111078500747681, + "num_tokens": 200907415.0, + "step": 7763 + }, + { + "epoch": 0.8526246430924665, + "grad_norm": 1.8369131088256836, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7217775583267212, + "num_tokens": 200932071.0, + "step": 7764 + }, + { + "epoch": 0.8527344607950802, + "grad_norm": 1.7187758684158325, + "learning_rate": 5e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7057788372039795, + "num_tokens": 200957594.0, + "step": 7765 + }, + { + "epoch": 0.8528442784976938, + "grad_norm": 1.67428719997406, + "learning_rate": 5e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6979029774665833, + "num_tokens": 200985970.0, + "step": 7766 + }, + { + "epoch": 0.8529540962003075, + "grad_norm": 1.7298065423965454, + "learning_rate": 5e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6982290148735046, + "num_tokens": 201014832.0, + "step": 7767 + }, + { + "epoch": 0.8530639139029211, + "grad_norm": 1.8054183721542358, + "learning_rate": 5e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7124347686767578, + "num_tokens": 201040436.0, + "step": 7768 + }, + { + "epoch": 0.8531737316055348, + "grad_norm": 1.786509394645691, + "learning_rate": 5e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6924138069152832, + "num_tokens": 201068090.0, + "step": 7769 + }, + { + "epoch": 0.8532835493081484, + "grad_norm": 1.7874407768249512, + "learning_rate": 5e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7056686878204346, + "num_tokens": 201094214.0, + "step": 7770 + }, + { + "epoch": 0.8533933670107622, + "grad_norm": 1.7673137187957764, + "learning_rate": 5e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7462223172187805, + "num_tokens": 201116165.0, + "step": 7771 + }, + { + "epoch": 0.8535031847133758, + "grad_norm": 1.8890154361724854, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7177965641021729, + "num_tokens": 201137560.0, + "step": 7772 + }, + { + "epoch": 0.8536130024159895, + "grad_norm": 1.6691023111343384, + "learning_rate": 5e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7071183919906616, + "num_tokens": 201169042.0, + "step": 7773 + }, + { + "epoch": 0.8537228201186031, + "grad_norm": 1.7293522357940674, + "learning_rate": 5e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6957665085792542, + "num_tokens": 201194454.0, + "step": 7774 + }, + { + "epoch": 0.8538326378212168, + "grad_norm": 1.9329979419708252, + "learning_rate": 5e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7092509269714355, + "num_tokens": 201215461.0, + "step": 7775 + }, + { + "epoch": 0.8539424555238304, + "grad_norm": 1.873308539390564, + "learning_rate": 5e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6871798038482666, + "num_tokens": 201241869.0, + "step": 7776 + }, + { + "epoch": 0.8540522732264441, + "grad_norm": 1.8353614807128906, + "learning_rate": 5e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7244335412979126, + "num_tokens": 201266885.0, + "step": 7777 + }, + { + "epoch": 0.8541620909290578, + "grad_norm": 1.8303591012954712, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7086958885192871, + "num_tokens": 201288316.0, + "step": 7778 + }, + { + "epoch": 0.8542719086316715, + "grad_norm": 1.7542767524719238, + "learning_rate": 5e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7143675088882446, + "num_tokens": 201314711.0, + "step": 7779 + }, + { + "epoch": 0.8543817263342851, + "grad_norm": 2.0038130283355713, + "learning_rate": 5e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6994783878326416, + "num_tokens": 201335100.0, + "step": 7780 + }, + { + "epoch": 0.8544915440368988, + "grad_norm": 1.5567924976348877, + "learning_rate": 5e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7141942381858826, + "num_tokens": 201364225.0, + "step": 7781 + }, + { + "epoch": 0.8546013617395124, + "grad_norm": 1.7858612537384033, + "learning_rate": 5e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7080404758453369, + "num_tokens": 201390733.0, + "step": 7782 + }, + { + "epoch": 0.854711179442126, + "grad_norm": 1.7269763946533203, + "learning_rate": 5e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7278521060943604, + "num_tokens": 201417677.0, + "step": 7783 + }, + { + "epoch": 0.8548209971447397, + "grad_norm": 1.7247095108032227, + "learning_rate": 5e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7324310541152954, + "num_tokens": 201443809.0, + "step": 7784 + }, + { + "epoch": 0.8549308148473534, + "grad_norm": 1.6742849349975586, + "learning_rate": 5e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7043924331665039, + "num_tokens": 201473680.0, + "step": 7785 + }, + { + "epoch": 0.8550406325499671, + "grad_norm": 1.7577098608016968, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7371605634689331, + "num_tokens": 201496478.0, + "step": 7786 + }, + { + "epoch": 0.8551504502525807, + "grad_norm": 1.725791335105896, + "learning_rate": 5e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6988421082496643, + "num_tokens": 201525624.0, + "step": 7787 + }, + { + "epoch": 0.8552602679551944, + "grad_norm": 1.6488206386566162, + "learning_rate": 5e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7085492610931396, + "num_tokens": 201555049.0, + "step": 7788 + }, + { + "epoch": 0.855370085657808, + "grad_norm": 1.7189542055130005, + "learning_rate": 5e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6880201101303101, + "num_tokens": 201582116.0, + "step": 7789 + }, + { + "epoch": 0.8554799033604217, + "grad_norm": 1.6738200187683105, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.714623212814331, + "num_tokens": 201609575.0, + "step": 7790 + }, + { + "epoch": 0.8555897210630353, + "grad_norm": 1.8996729850769043, + "learning_rate": 5e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.7054133415222168, + "num_tokens": 201634517.0, + "step": 7791 + }, + { + "epoch": 0.855699538765649, + "grad_norm": 1.8526723384857178, + "learning_rate": 5e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6921353340148926, + "num_tokens": 201658653.0, + "step": 7792 + }, + { + "epoch": 0.8558093564682627, + "grad_norm": 1.9580177068710327, + "learning_rate": 5e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7137247323989868, + "num_tokens": 201680742.0, + "step": 7793 + }, + { + "epoch": 0.8559191741708764, + "grad_norm": 2.003314256668091, + "learning_rate": 5e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7123432159423828, + "num_tokens": 201706206.0, + "step": 7794 + }, + { + "epoch": 0.85602899187349, + "grad_norm": 1.6679017543792725, + "learning_rate": 5e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7219858169555664, + "num_tokens": 201735772.0, + "step": 7795 + }, + { + "epoch": 0.8561388095761037, + "grad_norm": 1.5244629383087158, + "learning_rate": 5e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6848578453063965, + "num_tokens": 201770909.0, + "step": 7796 + }, + { + "epoch": 0.8562486272787173, + "grad_norm": 1.6508201360702515, + "learning_rate": 5e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7047281265258789, + "num_tokens": 201799119.0, + "step": 7797 + }, + { + "epoch": 0.856358444981331, + "grad_norm": 1.801320195198059, + "learning_rate": 5e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6936807632446289, + "num_tokens": 201824559.0, + "step": 7798 + }, + { + "epoch": 0.8564682626839446, + "grad_norm": 1.6949553489685059, + "learning_rate": 5e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7239165902137756, + "num_tokens": 201849626.0, + "step": 7799 + }, + { + "epoch": 0.8565780803865584, + "grad_norm": 1.7030904293060303, + "learning_rate": 5e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7001161575317383, + "num_tokens": 201878589.0, + "step": 7800 + }, + { + "epoch": 0.856687898089172, + "grad_norm": 1.7517555952072144, + "learning_rate": 5e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7044906616210938, + "num_tokens": 201904915.0, + "step": 7801 + }, + { + "epoch": 0.8567977157917857, + "grad_norm": 1.7923026084899902, + "learning_rate": 5e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7088136672973633, + "num_tokens": 201930541.0, + "step": 7802 + }, + { + "epoch": 0.8569075334943993, + "grad_norm": 1.7594903707504272, + "learning_rate": 5e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7123559713363647, + "num_tokens": 201957308.0, + "step": 7803 + }, + { + "epoch": 0.8570173511970129, + "grad_norm": 1.7360979318618774, + "learning_rate": 5e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7152740955352783, + "num_tokens": 201983376.0, + "step": 7804 + }, + { + "epoch": 0.8571271688996266, + "grad_norm": 1.9057941436767578, + "learning_rate": 5e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6973564624786377, + "num_tokens": 202009110.0, + "step": 7805 + }, + { + "epoch": 0.8572369866022402, + "grad_norm": 1.972821831703186, + "learning_rate": 5e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7056361436843872, + "num_tokens": 202032291.0, + "step": 7806 + }, + { + "epoch": 0.857346804304854, + "grad_norm": 1.7833590507507324, + "learning_rate": 5e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7133033275604248, + "num_tokens": 202057835.0, + "step": 7807 + }, + { + "epoch": 0.8574566220074676, + "grad_norm": 1.7770905494689941, + "learning_rate": 5e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6838685274124146, + "num_tokens": 202086503.0, + "step": 7808 + }, + { + "epoch": 0.8575664397100813, + "grad_norm": 1.855088472366333, + "learning_rate": 5e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.708348274230957, + "num_tokens": 202109561.0, + "step": 7809 + }, + { + "epoch": 0.8576762574126949, + "grad_norm": 1.799567461013794, + "learning_rate": 5e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7305285930633545, + "num_tokens": 202132123.0, + "step": 7810 + }, + { + "epoch": 0.8577860751153086, + "grad_norm": 1.9102782011032104, + "learning_rate": 5e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7080466747283936, + "num_tokens": 202155519.0, + "step": 7811 + }, + { + "epoch": 0.8578958928179222, + "grad_norm": 1.623589038848877, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7192674875259399, + "num_tokens": 202185905.0, + "step": 7812 + }, + { + "epoch": 0.8580057105205359, + "grad_norm": 1.6365787982940674, + "learning_rate": 5e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7105405330657959, + "num_tokens": 202217256.0, + "step": 7813 + }, + { + "epoch": 0.8581155282231496, + "grad_norm": 1.9465175867080688, + "learning_rate": 5e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.731113076210022, + "num_tokens": 202238083.0, + "step": 7814 + }, + { + "epoch": 0.8582253459257633, + "grad_norm": 1.7570445537567139, + "learning_rate": 5e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7166156768798828, + "num_tokens": 202263881.0, + "step": 7815 + }, + { + "epoch": 0.8583351636283769, + "grad_norm": 1.55216646194458, + "learning_rate": 5e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6935396194458008, + "num_tokens": 202296184.0, + "step": 7816 + }, + { + "epoch": 0.8584449813309906, + "grad_norm": 1.7864052057266235, + "learning_rate": 5e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7064284086227417, + "num_tokens": 202321503.0, + "step": 7817 + }, + { + "epoch": 0.8585547990336042, + "grad_norm": 1.6208781003952026, + "learning_rate": 5e-06, + "loss": 1.0872, + "mean_token_accuracy": 0.6740789413452148, + "num_tokens": 202352804.0, + "step": 7818 + }, + { + "epoch": 0.8586646167362179, + "grad_norm": 1.7853686809539795, + "learning_rate": 5e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7050924897193909, + "num_tokens": 202377668.0, + "step": 7819 + }, + { + "epoch": 0.8587744344388315, + "grad_norm": 1.985796570777893, + "learning_rate": 5e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.709729790687561, + "num_tokens": 202399865.0, + "step": 7820 + }, + { + "epoch": 0.8588842521414451, + "grad_norm": 1.6736154556274414, + "learning_rate": 5e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7069107294082642, + "num_tokens": 202428722.0, + "step": 7821 + }, + { + "epoch": 0.8589940698440589, + "grad_norm": 1.6590913534164429, + "learning_rate": 5e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7074386477470398, + "num_tokens": 202457862.0, + "step": 7822 + }, + { + "epoch": 0.8591038875466726, + "grad_norm": 1.8242560625076294, + "learning_rate": 5e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.691441535949707, + "num_tokens": 202481717.0, + "step": 7823 + }, + { + "epoch": 0.8592137052492862, + "grad_norm": 1.7045097351074219, + "learning_rate": 5e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6946784853935242, + "num_tokens": 202509306.0, + "step": 7824 + }, + { + "epoch": 0.8593235229518998, + "grad_norm": 1.937052845954895, + "learning_rate": 5e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7061765193939209, + "num_tokens": 202531967.0, + "step": 7825 + }, + { + "epoch": 0.8594333406545135, + "grad_norm": 1.7376973628997803, + "learning_rate": 5e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7062731981277466, + "num_tokens": 202559161.0, + "step": 7826 + }, + { + "epoch": 0.8595431583571271, + "grad_norm": 1.8889485597610474, + "learning_rate": 5e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7135646343231201, + "num_tokens": 202582993.0, + "step": 7827 + }, + { + "epoch": 0.8596529760597408, + "grad_norm": 1.7359734773635864, + "learning_rate": 5e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6940611004829407, + "num_tokens": 202609193.0, + "step": 7828 + }, + { + "epoch": 0.8597627937623545, + "grad_norm": 1.8367563486099243, + "learning_rate": 5e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7221933603286743, + "num_tokens": 202635227.0, + "step": 7829 + }, + { + "epoch": 0.8598726114649682, + "grad_norm": 1.753130555152893, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7116588354110718, + "num_tokens": 202662110.0, + "step": 7830 + }, + { + "epoch": 0.8599824291675818, + "grad_norm": 1.917980670928955, + "learning_rate": 5e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7155774831771851, + "num_tokens": 202683831.0, + "step": 7831 + }, + { + "epoch": 0.8600922468701955, + "grad_norm": 1.6707391738891602, + "learning_rate": 5e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7286891937255859, + "num_tokens": 202709989.0, + "step": 7832 + }, + { + "epoch": 0.8602020645728091, + "grad_norm": 1.6940240859985352, + "learning_rate": 5e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7078273296356201, + "num_tokens": 202737324.0, + "step": 7833 + }, + { + "epoch": 0.8603118822754228, + "grad_norm": 1.796189785003662, + "learning_rate": 5e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6960462331771851, + "num_tokens": 202763013.0, + "step": 7834 + }, + { + "epoch": 0.8604216999780364, + "grad_norm": 1.61565363407135, + "learning_rate": 5e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7162064909934998, + "num_tokens": 202789836.0, + "step": 7835 + }, + { + "epoch": 0.8605315176806502, + "grad_norm": 1.8781179189682007, + "learning_rate": 5e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7162915468215942, + "num_tokens": 202812571.0, + "step": 7836 + }, + { + "epoch": 0.8606413353832638, + "grad_norm": 1.9256683588027954, + "learning_rate": 5e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.70023512840271, + "num_tokens": 202836773.0, + "step": 7837 + }, + { + "epoch": 0.8607511530858775, + "grad_norm": 1.7106772661209106, + "learning_rate": 5e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.694914698600769, + "num_tokens": 202863459.0, + "step": 7838 + }, + { + "epoch": 0.8608609707884911, + "grad_norm": 1.8513567447662354, + "learning_rate": 5e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7518602013587952, + "num_tokens": 202885718.0, + "step": 7839 + }, + { + "epoch": 0.8609707884911048, + "grad_norm": 1.9020122289657593, + "learning_rate": 5e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7089025974273682, + "num_tokens": 202909087.0, + "step": 7840 + }, + { + "epoch": 0.8610806061937184, + "grad_norm": 1.7076207399368286, + "learning_rate": 5e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6894579529762268, + "num_tokens": 202937448.0, + "step": 7841 + }, + { + "epoch": 0.861190423896332, + "grad_norm": 1.7082710266113281, + "learning_rate": 5e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7047770619392395, + "num_tokens": 202967465.0, + "step": 7842 + }, + { + "epoch": 0.8613002415989458, + "grad_norm": 1.7464485168457031, + "learning_rate": 5e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.710739254951477, + "num_tokens": 202994142.0, + "step": 7843 + }, + { + "epoch": 0.8614100593015594, + "grad_norm": 1.7798644304275513, + "learning_rate": 5e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6981635093688965, + "num_tokens": 203020286.0, + "step": 7844 + }, + { + "epoch": 0.8615198770041731, + "grad_norm": 1.690909743309021, + "learning_rate": 5e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.692610502243042, + "num_tokens": 203049960.0, + "step": 7845 + }, + { + "epoch": 0.8616296947067867, + "grad_norm": 1.666178584098816, + "learning_rate": 5e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.709595799446106, + "num_tokens": 203081704.0, + "step": 7846 + }, + { + "epoch": 0.8617395124094004, + "grad_norm": 2.18095064163208, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7216782569885254, + "num_tokens": 203100561.0, + "step": 7847 + }, + { + "epoch": 0.861849330112014, + "grad_norm": 1.738966464996338, + "learning_rate": 5e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7108758687973022, + "num_tokens": 203125924.0, + "step": 7848 + }, + { + "epoch": 0.8619591478146277, + "grad_norm": 1.643872857093811, + "learning_rate": 5e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7061325311660767, + "num_tokens": 203153725.0, + "step": 7849 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 1.7930792570114136, + "learning_rate": 5e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7037095427513123, + "num_tokens": 203179137.0, + "step": 7850 + }, + { + "epoch": 0.8621787832198551, + "grad_norm": 1.6583656072616577, + "learning_rate": 5e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6905989050865173, + "num_tokens": 203208654.0, + "step": 7851 + }, + { + "epoch": 0.8622886009224687, + "grad_norm": 1.729790210723877, + "learning_rate": 5e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7347575426101685, + "num_tokens": 203234375.0, + "step": 7852 + }, + { + "epoch": 0.8623984186250824, + "grad_norm": 1.7613136768341064, + "learning_rate": 5e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6963420510292053, + "num_tokens": 203261259.0, + "step": 7853 + }, + { + "epoch": 0.862508236327696, + "grad_norm": 1.6598767042160034, + "learning_rate": 5e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7074965238571167, + "num_tokens": 203288550.0, + "step": 7854 + }, + { + "epoch": 0.8626180540303097, + "grad_norm": 1.762611746788025, + "learning_rate": 5e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7233752608299255, + "num_tokens": 203312404.0, + "step": 7855 + }, + { + "epoch": 0.8627278717329233, + "grad_norm": 1.8074647188186646, + "learning_rate": 5e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7075306177139282, + "num_tokens": 203337615.0, + "step": 7856 + }, + { + "epoch": 0.862837689435537, + "grad_norm": 1.7321139574050903, + "learning_rate": 5e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.716295599937439, + "num_tokens": 203362271.0, + "step": 7857 + }, + { + "epoch": 0.8629475071381507, + "grad_norm": 1.5972368717193604, + "learning_rate": 5e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7138198018074036, + "num_tokens": 203393577.0, + "step": 7858 + }, + { + "epoch": 0.8630573248407644, + "grad_norm": 1.7467530965805054, + "learning_rate": 5e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7246524095535278, + "num_tokens": 203418196.0, + "step": 7859 + }, + { + "epoch": 0.863167142543378, + "grad_norm": 1.7051312923431396, + "learning_rate": 5e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7121894955635071, + "num_tokens": 203445964.0, + "step": 7860 + }, + { + "epoch": 0.8632769602459917, + "grad_norm": 1.8913321495056152, + "learning_rate": 5e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7131319046020508, + "num_tokens": 203468963.0, + "step": 7861 + }, + { + "epoch": 0.8633867779486053, + "grad_norm": 2.005596399307251, + "learning_rate": 5e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7146098017692566, + "num_tokens": 203489280.0, + "step": 7862 + }, + { + "epoch": 0.863496595651219, + "grad_norm": 1.578010082244873, + "learning_rate": 5e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6976747512817383, + "num_tokens": 203522362.0, + "step": 7863 + }, + { + "epoch": 0.8636064133538326, + "grad_norm": 1.801892876625061, + "learning_rate": 5e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6959697604179382, + "num_tokens": 203549713.0, + "step": 7864 + }, + { + "epoch": 0.8637162310564463, + "grad_norm": 1.8155601024627686, + "learning_rate": 5e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7033358812332153, + "num_tokens": 203575950.0, + "step": 7865 + }, + { + "epoch": 0.86382604875906, + "grad_norm": 1.569250464439392, + "learning_rate": 5e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6848456859588623, + "num_tokens": 203610738.0, + "step": 7866 + }, + { + "epoch": 0.8639358664616736, + "grad_norm": 1.8284798860549927, + "learning_rate": 5e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.716010570526123, + "num_tokens": 203634308.0, + "step": 7867 + }, + { + "epoch": 0.8640456841642873, + "grad_norm": 2.2753658294677734, + "learning_rate": 5e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7281529307365417, + "num_tokens": 203651083.0, + "step": 7868 + }, + { + "epoch": 0.8641555018669009, + "grad_norm": 1.6416383981704712, + "learning_rate": 5e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7033717036247253, + "num_tokens": 203682313.0, + "step": 7869 + }, + { + "epoch": 0.8642653195695146, + "grad_norm": 1.7962095737457275, + "learning_rate": 5e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7129613161087036, + "num_tokens": 203705617.0, + "step": 7870 + }, + { + "epoch": 0.8643751372721282, + "grad_norm": 1.8455091714859009, + "learning_rate": 5e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.687073826789856, + "num_tokens": 203732090.0, + "step": 7871 + }, + { + "epoch": 0.864484954974742, + "grad_norm": 1.8067550659179688, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7431562542915344, + "num_tokens": 203754527.0, + "step": 7872 + }, + { + "epoch": 0.8645947726773556, + "grad_norm": 1.77234947681427, + "learning_rate": 5e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7070016264915466, + "num_tokens": 203781027.0, + "step": 7873 + }, + { + "epoch": 0.8647045903799693, + "grad_norm": 1.8189786672592163, + "learning_rate": 5e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7273848056793213, + "num_tokens": 203808043.0, + "step": 7874 + }, + { + "epoch": 0.8648144080825829, + "grad_norm": 1.8818472623825073, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7416661977767944, + "num_tokens": 203829475.0, + "step": 7875 + }, + { + "epoch": 0.8649242257851966, + "grad_norm": 1.7417293787002563, + "learning_rate": 5e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7066000699996948, + "num_tokens": 203857575.0, + "step": 7876 + }, + { + "epoch": 0.8650340434878102, + "grad_norm": 1.7603415250778198, + "learning_rate": 5e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6964318752288818, + "num_tokens": 203884629.0, + "step": 7877 + }, + { + "epoch": 0.8651438611904239, + "grad_norm": 1.87527596950531, + "learning_rate": 5e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7095310688018799, + "num_tokens": 203910042.0, + "step": 7878 + }, + { + "epoch": 0.8652536788930375, + "grad_norm": 1.7609426975250244, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7189550995826721, + "num_tokens": 203938081.0, + "step": 7879 + }, + { + "epoch": 0.8653634965956513, + "grad_norm": 1.8790391683578491, + "learning_rate": 5e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7045773863792419, + "num_tokens": 203962956.0, + "step": 7880 + }, + { + "epoch": 0.8654733142982649, + "grad_norm": 1.7260135412216187, + "learning_rate": 5e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7044989466667175, + "num_tokens": 203991460.0, + "step": 7881 + }, + { + "epoch": 0.8655831320008786, + "grad_norm": 2.0198447704315186, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7224128246307373, + "num_tokens": 204011662.0, + "step": 7882 + }, + { + "epoch": 0.8656929497034922, + "grad_norm": 1.7631515264511108, + "learning_rate": 5e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7145541906356812, + "num_tokens": 204038481.0, + "step": 7883 + }, + { + "epoch": 0.8658027674061058, + "grad_norm": 1.9130797386169434, + "learning_rate": 5e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7186707258224487, + "num_tokens": 204062040.0, + "step": 7884 + }, + { + "epoch": 0.8659125851087195, + "grad_norm": 1.6112231016159058, + "learning_rate": 5e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7112743258476257, + "num_tokens": 204093358.0, + "step": 7885 + }, + { + "epoch": 0.8660224028113331, + "grad_norm": 1.9910871982574463, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7284599542617798, + "num_tokens": 204113560.0, + "step": 7886 + }, + { + "epoch": 0.8661322205139469, + "grad_norm": 1.7215129137039185, + "learning_rate": 5e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6826391816139221, + "num_tokens": 204138583.0, + "step": 7887 + }, + { + "epoch": 0.8662420382165605, + "grad_norm": 1.6644680500030518, + "learning_rate": 5e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7173745632171631, + "num_tokens": 204166768.0, + "step": 7888 + }, + { + "epoch": 0.8663518559191742, + "grad_norm": 1.6371707916259766, + "learning_rate": 5e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.71226567029953, + "num_tokens": 204194522.0, + "step": 7889 + }, + { + "epoch": 0.8664616736217878, + "grad_norm": 1.850206971168518, + "learning_rate": 5e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7227740287780762, + "num_tokens": 204218634.0, + "step": 7890 + }, + { + "epoch": 0.8665714913244015, + "grad_norm": 1.901784062385559, + "learning_rate": 5e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7027859687805176, + "num_tokens": 204240809.0, + "step": 7891 + }, + { + "epoch": 0.8666813090270151, + "grad_norm": 2.14170503616333, + "learning_rate": 5e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7136670351028442, + "num_tokens": 204260713.0, + "step": 7892 + }, + { + "epoch": 0.8667911267296288, + "grad_norm": 2.0106427669525146, + "learning_rate": 5e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7020919322967529, + "num_tokens": 204282464.0, + "step": 7893 + }, + { + "epoch": 0.8669009444322425, + "grad_norm": 1.727247714996338, + "learning_rate": 5e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6695047616958618, + "num_tokens": 204311318.0, + "step": 7894 + }, + { + "epoch": 0.8670107621348562, + "grad_norm": 1.8734010457992554, + "learning_rate": 5e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7048230171203613, + "num_tokens": 204336898.0, + "step": 7895 + }, + { + "epoch": 0.8671205798374698, + "grad_norm": 2.0213887691497803, + "learning_rate": 5e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7251856923103333, + "num_tokens": 204356605.0, + "step": 7896 + }, + { + "epoch": 0.8672303975400835, + "grad_norm": 1.789239525794983, + "learning_rate": 5e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7213339805603027, + "num_tokens": 204381348.0, + "step": 7897 + }, + { + "epoch": 0.8673402152426971, + "grad_norm": 1.5555542707443237, + "learning_rate": 5e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7129188776016235, + "num_tokens": 204411480.0, + "step": 7898 + }, + { + "epoch": 0.8674500329453108, + "grad_norm": 1.9205459356307983, + "learning_rate": 5e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7311908006668091, + "num_tokens": 204431997.0, + "step": 7899 + }, + { + "epoch": 0.8675598506479244, + "grad_norm": 1.9178344011306763, + "learning_rate": 5e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.705292284488678, + "num_tokens": 204454799.0, + "step": 7900 + }, + { + "epoch": 0.8676696683505382, + "grad_norm": 1.9090406894683838, + "learning_rate": 5e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.714516282081604, + "num_tokens": 204476041.0, + "step": 7901 + }, + { + "epoch": 0.8677794860531518, + "grad_norm": 1.7079046964645386, + "learning_rate": 5e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7051985859870911, + "num_tokens": 204504173.0, + "step": 7902 + }, + { + "epoch": 0.8678893037557655, + "grad_norm": 1.698075294494629, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.720507025718689, + "num_tokens": 204529260.0, + "step": 7903 + }, + { + "epoch": 0.8679991214583791, + "grad_norm": 1.8659336566925049, + "learning_rate": 5e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6833102107048035, + "num_tokens": 204554076.0, + "step": 7904 + }, + { + "epoch": 0.8681089391609927, + "grad_norm": 1.6431019306182861, + "learning_rate": 5e-06, + "loss": 1.1059, + "mean_token_accuracy": 0.6699619293212891, + "num_tokens": 204587118.0, + "step": 7905 + }, + { + "epoch": 0.8682187568636064, + "grad_norm": 1.7911574840545654, + "learning_rate": 5e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7050924301147461, + "num_tokens": 204611869.0, + "step": 7906 + }, + { + "epoch": 0.86832857456622, + "grad_norm": 1.7105947732925415, + "learning_rate": 5e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6892233490943909, + "num_tokens": 204640211.0, + "step": 7907 + }, + { + "epoch": 0.8684383922688337, + "grad_norm": 1.983276128768921, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7336072325706482, + "num_tokens": 204662099.0, + "step": 7908 + }, + { + "epoch": 0.8685482099714474, + "grad_norm": 1.6888718605041504, + "learning_rate": 5e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.708034873008728, + "num_tokens": 204691423.0, + "step": 7909 + }, + { + "epoch": 0.8686580276740611, + "grad_norm": 1.7474241256713867, + "learning_rate": 5e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7088779211044312, + "num_tokens": 204720413.0, + "step": 7910 + }, + { + "epoch": 0.8687678453766747, + "grad_norm": 1.7870928049087524, + "learning_rate": 5e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6910922527313232, + "num_tokens": 204745904.0, + "step": 7911 + }, + { + "epoch": 0.8688776630792884, + "grad_norm": 1.848129153251648, + "learning_rate": 5e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7163471579551697, + "num_tokens": 204768941.0, + "step": 7912 + }, + { + "epoch": 0.868987480781902, + "grad_norm": 1.7577468156814575, + "learning_rate": 5e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7031545639038086, + "num_tokens": 204793821.0, + "step": 7913 + }, + { + "epoch": 0.8690972984845157, + "grad_norm": 2.031934976577759, + "learning_rate": 5e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7506306171417236, + "num_tokens": 204811848.0, + "step": 7914 + }, + { + "epoch": 0.8692071161871293, + "grad_norm": 1.9577054977416992, + "learning_rate": 5e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7100496888160706, + "num_tokens": 204833036.0, + "step": 7915 + }, + { + "epoch": 0.8693169338897431, + "grad_norm": 1.7941648960113525, + "learning_rate": 5e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6894142031669617, + "num_tokens": 204859841.0, + "step": 7916 + }, + { + "epoch": 0.8694267515923567, + "grad_norm": 1.7379478216171265, + "learning_rate": 5e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7037074565887451, + "num_tokens": 204886353.0, + "step": 7917 + }, + { + "epoch": 0.8695365692949704, + "grad_norm": 1.6929066181182861, + "learning_rate": 5e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.714113712310791, + "num_tokens": 204913369.0, + "step": 7918 + }, + { + "epoch": 0.869646386997584, + "grad_norm": 1.7939461469650269, + "learning_rate": 5e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7246198654174805, + "num_tokens": 204937695.0, + "step": 7919 + }, + { + "epoch": 0.8697562047001977, + "grad_norm": 1.8642998933792114, + "learning_rate": 5e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6877483129501343, + "num_tokens": 204962578.0, + "step": 7920 + }, + { + "epoch": 0.8698660224028113, + "grad_norm": 1.7364555597305298, + "learning_rate": 5e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.736535906791687, + "num_tokens": 204989544.0, + "step": 7921 + }, + { + "epoch": 0.869975840105425, + "grad_norm": 1.6744805574417114, + "learning_rate": 5e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7052887678146362, + "num_tokens": 205015634.0, + "step": 7922 + }, + { + "epoch": 0.8700856578080387, + "grad_norm": 1.828595757484436, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.726281464099884, + "num_tokens": 205041672.0, + "step": 7923 + }, + { + "epoch": 0.8701954755106523, + "grad_norm": 1.832388997077942, + "learning_rate": 5e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.710430383682251, + "num_tokens": 205065189.0, + "step": 7924 + }, + { + "epoch": 0.870305293213266, + "grad_norm": 2.104789972305298, + "learning_rate": 5e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.72142493724823, + "num_tokens": 205084629.0, + "step": 7925 + }, + { + "epoch": 0.8704151109158796, + "grad_norm": 1.6742281913757324, + "learning_rate": 5e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7091566324234009, + "num_tokens": 205112527.0, + "step": 7926 + }, + { + "epoch": 0.8705249286184933, + "grad_norm": 1.630359411239624, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7199206948280334, + "num_tokens": 205142616.0, + "step": 7927 + }, + { + "epoch": 0.8706347463211069, + "grad_norm": 1.7031140327453613, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7387499809265137, + "num_tokens": 205170880.0, + "step": 7928 + }, + { + "epoch": 0.8707445640237206, + "grad_norm": 1.8122082948684692, + "learning_rate": 5e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6901968717575073, + "num_tokens": 205199124.0, + "step": 7929 + }, + { + "epoch": 0.8708543817263343, + "grad_norm": 1.7980843782424927, + "learning_rate": 5e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7105878591537476, + "num_tokens": 205224367.0, + "step": 7930 + }, + { + "epoch": 0.870964199428948, + "grad_norm": 1.986035704612732, + "learning_rate": 5e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7353092432022095, + "num_tokens": 205244686.0, + "step": 7931 + }, + { + "epoch": 0.8710740171315616, + "grad_norm": 1.9599573612213135, + "learning_rate": 5e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7250925302505493, + "num_tokens": 205265798.0, + "step": 7932 + }, + { + "epoch": 0.8711838348341753, + "grad_norm": 1.9619160890579224, + "learning_rate": 5e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7399275302886963, + "num_tokens": 205285959.0, + "step": 7933 + }, + { + "epoch": 0.8712936525367889, + "grad_norm": 1.9751487970352173, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7260861992835999, + "num_tokens": 205306729.0, + "step": 7934 + }, + { + "epoch": 0.8714034702394026, + "grad_norm": 1.574242353439331, + "learning_rate": 5e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6877583265304565, + "num_tokens": 205338632.0, + "step": 7935 + }, + { + "epoch": 0.8715132879420162, + "grad_norm": 2.013294219970703, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.739724338054657, + "num_tokens": 205358271.0, + "step": 7936 + }, + { + "epoch": 0.87162310564463, + "grad_norm": 1.689604640007019, + "learning_rate": 5e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.6894478797912598, + "num_tokens": 205387424.0, + "step": 7937 + }, + { + "epoch": 0.8717329233472436, + "grad_norm": 1.6284582614898682, + "learning_rate": 5e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7007029056549072, + "num_tokens": 205419205.0, + "step": 7938 + }, + { + "epoch": 0.8718427410498573, + "grad_norm": 1.8556290864944458, + "learning_rate": 5e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7171227931976318, + "num_tokens": 205442978.0, + "step": 7939 + }, + { + "epoch": 0.8719525587524709, + "grad_norm": 1.8516689538955688, + "learning_rate": 5e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6907331943511963, + "num_tokens": 205467628.0, + "step": 7940 + }, + { + "epoch": 0.8720623764550846, + "grad_norm": 1.7230758666992188, + "learning_rate": 5e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6953909397125244, + "num_tokens": 205496444.0, + "step": 7941 + }, + { + "epoch": 0.8721721941576982, + "grad_norm": 1.9481821060180664, + "learning_rate": 5e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7006335258483887, + "num_tokens": 205519700.0, + "step": 7942 + }, + { + "epoch": 0.8722820118603118, + "grad_norm": 1.5856369733810425, + "learning_rate": 5e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.726045548915863, + "num_tokens": 205550076.0, + "step": 7943 + }, + { + "epoch": 0.8723918295629255, + "grad_norm": 1.7123609781265259, + "learning_rate": 5e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.711835503578186, + "num_tokens": 205576793.0, + "step": 7944 + }, + { + "epoch": 0.8725016472655392, + "grad_norm": 1.7436622381210327, + "learning_rate": 5e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7003370523452759, + "num_tokens": 205601354.0, + "step": 7945 + }, + { + "epoch": 0.8726114649681529, + "grad_norm": 1.7865245342254639, + "learning_rate": 5e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7102159261703491, + "num_tokens": 205626787.0, + "step": 7946 + }, + { + "epoch": 0.8727212826707665, + "grad_norm": 1.8491770029067993, + "learning_rate": 5e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7153870463371277, + "num_tokens": 205650875.0, + "step": 7947 + }, + { + "epoch": 0.8728311003733802, + "grad_norm": 1.7642934322357178, + "learning_rate": 5e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.700701892375946, + "num_tokens": 205676005.0, + "step": 7948 + }, + { + "epoch": 0.8729409180759938, + "grad_norm": 1.6200323104858398, + "learning_rate": 5e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.6982083320617676, + "num_tokens": 205706217.0, + "step": 7949 + }, + { + "epoch": 0.8730507357786075, + "grad_norm": 1.8201415538787842, + "learning_rate": 5e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7202072143554688, + "num_tokens": 205729484.0, + "step": 7950 + }, + { + "epoch": 0.8731605534812211, + "grad_norm": 1.5078176259994507, + "learning_rate": 5e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6838122606277466, + "num_tokens": 205764210.0, + "step": 7951 + }, + { + "epoch": 0.8732703711838349, + "grad_norm": 1.8874108791351318, + "learning_rate": 5e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7140480875968933, + "num_tokens": 205786890.0, + "step": 7952 + }, + { + "epoch": 0.8733801888864485, + "grad_norm": 1.834810495376587, + "learning_rate": 5e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6977660655975342, + "num_tokens": 205811969.0, + "step": 7953 + }, + { + "epoch": 0.8734900065890622, + "grad_norm": 1.7189550399780273, + "learning_rate": 5e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7007111310958862, + "num_tokens": 205839548.0, + "step": 7954 + }, + { + "epoch": 0.8735998242916758, + "grad_norm": 1.6233943700790405, + "learning_rate": 5e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7115520238876343, + "num_tokens": 205869749.0, + "step": 7955 + }, + { + "epoch": 0.8737096419942895, + "grad_norm": 1.8050148487091064, + "learning_rate": 5e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7072482109069824, + "num_tokens": 205895586.0, + "step": 7956 + }, + { + "epoch": 0.8738194596969031, + "grad_norm": 1.9197804927825928, + "learning_rate": 5e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6901335716247559, + "num_tokens": 205922674.0, + "step": 7957 + }, + { + "epoch": 0.8739292773995168, + "grad_norm": 1.9593186378479004, + "learning_rate": 5e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7172609567642212, + "num_tokens": 205944397.0, + "step": 7958 + }, + { + "epoch": 0.8740390951021305, + "grad_norm": 1.9549970626831055, + "learning_rate": 5e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7042727470397949, + "num_tokens": 205967557.0, + "step": 7959 + }, + { + "epoch": 0.8741489128047442, + "grad_norm": 1.7745541334152222, + "learning_rate": 5e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7105445861816406, + "num_tokens": 205992266.0, + "step": 7960 + }, + { + "epoch": 0.8742587305073578, + "grad_norm": 1.7524354457855225, + "learning_rate": 5e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7163877487182617, + "num_tokens": 206017009.0, + "step": 7961 + }, + { + "epoch": 0.8743685482099715, + "grad_norm": 1.9095207452774048, + "learning_rate": 5e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7203428149223328, + "num_tokens": 206039485.0, + "step": 7962 + }, + { + "epoch": 0.8744783659125851, + "grad_norm": 1.7687290906906128, + "learning_rate": 5e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.721846342086792, + "num_tokens": 206063686.0, + "step": 7963 + }, + { + "epoch": 0.8745881836151987, + "grad_norm": 1.6091744899749756, + "learning_rate": 5e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7055143713951111, + "num_tokens": 206092923.0, + "step": 7964 + }, + { + "epoch": 0.8746980013178124, + "grad_norm": 2.0063087940216064, + "learning_rate": 5e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7269915342330933, + "num_tokens": 206115113.0, + "step": 7965 + }, + { + "epoch": 0.8748078190204261, + "grad_norm": 1.9582414627075195, + "learning_rate": 5e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7022969722747803, + "num_tokens": 206135872.0, + "step": 7966 + }, + { + "epoch": 0.8749176367230398, + "grad_norm": 2.017895460128784, + "learning_rate": 5e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7102959156036377, + "num_tokens": 206160674.0, + "step": 7967 + }, + { + "epoch": 0.8750274544256534, + "grad_norm": 1.5634442567825317, + "learning_rate": 5e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.710936427116394, + "num_tokens": 206193572.0, + "step": 7968 + }, + { + "epoch": 0.8751372721282671, + "grad_norm": 1.7128417491912842, + "learning_rate": 5e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7154934406280518, + "num_tokens": 206220915.0, + "step": 7969 + }, + { + "epoch": 0.8752470898308807, + "grad_norm": 1.6538480520248413, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7144162058830261, + "num_tokens": 206247294.0, + "step": 7970 + }, + { + "epoch": 0.8753569075334944, + "grad_norm": 1.7739126682281494, + "learning_rate": 5e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7079262733459473, + "num_tokens": 206275242.0, + "step": 7971 + }, + { + "epoch": 0.875466725236108, + "grad_norm": 1.7203580141067505, + "learning_rate": 5e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7199088931083679, + "num_tokens": 206302009.0, + "step": 7972 + }, + { + "epoch": 0.8755765429387217, + "grad_norm": 1.6355513334274292, + "learning_rate": 5e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7008994817733765, + "num_tokens": 206330575.0, + "step": 7973 + }, + { + "epoch": 0.8756863606413354, + "grad_norm": 1.747109055519104, + "learning_rate": 5e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7174835205078125, + "num_tokens": 206358170.0, + "step": 7974 + }, + { + "epoch": 0.8757961783439491, + "grad_norm": 1.9329596757888794, + "learning_rate": 5e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7199041843414307, + "num_tokens": 206381653.0, + "step": 7975 + }, + { + "epoch": 0.8759059960465627, + "grad_norm": 1.7286994457244873, + "learning_rate": 5e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7023658752441406, + "num_tokens": 206408099.0, + "step": 7976 + }, + { + "epoch": 0.8760158137491764, + "grad_norm": 1.8363510370254517, + "learning_rate": 5e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7070561051368713, + "num_tokens": 206434825.0, + "step": 7977 + }, + { + "epoch": 0.87612563145179, + "grad_norm": 2.1614699363708496, + "learning_rate": 5e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7200894355773926, + "num_tokens": 206453319.0, + "step": 7978 + }, + { + "epoch": 0.8762354491544037, + "grad_norm": 1.8300765752792358, + "learning_rate": 5e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7151020169258118, + "num_tokens": 206475513.0, + "step": 7979 + }, + { + "epoch": 0.8763452668570173, + "grad_norm": 1.5892024040222168, + "learning_rate": 5e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.688127875328064, + "num_tokens": 206506623.0, + "step": 7980 + }, + { + "epoch": 0.8764550845596311, + "grad_norm": 1.8656947612762451, + "learning_rate": 5e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7260315418243408, + "num_tokens": 206531096.0, + "step": 7981 + }, + { + "epoch": 0.8765649022622447, + "grad_norm": 1.6600189208984375, + "learning_rate": 5e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7093673944473267, + "num_tokens": 206561204.0, + "step": 7982 + }, + { + "epoch": 0.8766747199648584, + "grad_norm": 1.8591723442077637, + "learning_rate": 5e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7340984344482422, + "num_tokens": 206583555.0, + "step": 7983 + }, + { + "epoch": 0.876784537667472, + "grad_norm": 1.888129711151123, + "learning_rate": 5e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6997877359390259, + "num_tokens": 206607450.0, + "step": 7984 + }, + { + "epoch": 0.8768943553700856, + "grad_norm": 1.5779478549957275, + "learning_rate": 5e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7109071612358093, + "num_tokens": 206639646.0, + "step": 7985 + }, + { + "epoch": 0.8770041730726993, + "grad_norm": 1.7689688205718994, + "learning_rate": 5e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7156256437301636, + "num_tokens": 206663373.0, + "step": 7986 + }, + { + "epoch": 0.8771139907753129, + "grad_norm": 1.9649252891540527, + "learning_rate": 5e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.7018009424209595, + "num_tokens": 206688539.0, + "step": 7987 + }, + { + "epoch": 0.8772238084779267, + "grad_norm": 1.6697251796722412, + "learning_rate": 5e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.6942419409751892, + "num_tokens": 206720009.0, + "step": 7988 + }, + { + "epoch": 0.8773336261805403, + "grad_norm": 2.095102071762085, + "learning_rate": 5e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7138553857803345, + "num_tokens": 206739200.0, + "step": 7989 + }, + { + "epoch": 0.877443443883154, + "grad_norm": 1.942592740058899, + "learning_rate": 5e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6948752403259277, + "num_tokens": 206765461.0, + "step": 7990 + }, + { + "epoch": 0.8775532615857676, + "grad_norm": 1.821779727935791, + "learning_rate": 5e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7137187719345093, + "num_tokens": 206789656.0, + "step": 7991 + }, + { + "epoch": 0.8776630792883813, + "grad_norm": 2.0766215324401855, + "learning_rate": 5e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.712455689907074, + "num_tokens": 206809850.0, + "step": 7992 + }, + { + "epoch": 0.8777728969909949, + "grad_norm": 1.8134920597076416, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.734525203704834, + "num_tokens": 206831827.0, + "step": 7993 + }, + { + "epoch": 0.8778827146936086, + "grad_norm": 1.6900304555892944, + "learning_rate": 5e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6950114965438843, + "num_tokens": 206858764.0, + "step": 7994 + }, + { + "epoch": 0.8779925323962223, + "grad_norm": 1.8594051599502563, + "learning_rate": 5e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7206584215164185, + "num_tokens": 206882224.0, + "step": 7995 + }, + { + "epoch": 0.878102350098836, + "grad_norm": 1.6347028017044067, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7237975597381592, + "num_tokens": 206910187.0, + "step": 7996 + }, + { + "epoch": 0.8782121678014496, + "grad_norm": 1.8103477954864502, + "learning_rate": 5e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7063368558883667, + "num_tokens": 206934065.0, + "step": 7997 + }, + { + "epoch": 0.8783219855040633, + "grad_norm": 1.8990814685821533, + "learning_rate": 5e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.715442419052124, + "num_tokens": 206956288.0, + "step": 7998 + }, + { + "epoch": 0.8784318032066769, + "grad_norm": 1.8537366390228271, + "learning_rate": 5e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7213861346244812, + "num_tokens": 206978689.0, + "step": 7999 + }, + { + "epoch": 0.8785416209092906, + "grad_norm": 1.910616159439087, + "learning_rate": 5e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.6980987787246704, + "num_tokens": 207005126.0, + "step": 8000 + }, + { + "epoch": 0.8786514386119042, + "grad_norm": 1.8301600217819214, + "learning_rate": 5e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7222046256065369, + "num_tokens": 207031127.0, + "step": 8001 + }, + { + "epoch": 0.8787612563145178, + "grad_norm": 1.9476876258850098, + "learning_rate": 5e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7313001751899719, + "num_tokens": 207051892.0, + "step": 8002 + }, + { + "epoch": 0.8788710740171316, + "grad_norm": 2.01180100440979, + "learning_rate": 5e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7330259084701538, + "num_tokens": 207071146.0, + "step": 8003 + }, + { + "epoch": 0.8789808917197452, + "grad_norm": 1.9452790021896362, + "learning_rate": 5e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7107132077217102, + "num_tokens": 207094039.0, + "step": 8004 + }, + { + "epoch": 0.8790907094223589, + "grad_norm": 1.829396367073059, + "learning_rate": 5e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7003028988838196, + "num_tokens": 207118716.0, + "step": 8005 + }, + { + "epoch": 0.8792005271249725, + "grad_norm": 1.9205151796340942, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7317902445793152, + "num_tokens": 207139572.0, + "step": 8006 + }, + { + "epoch": 0.8793103448275862, + "grad_norm": 1.85976243019104, + "learning_rate": 5e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7199029922485352, + "num_tokens": 207163380.0, + "step": 8007 + }, + { + "epoch": 0.8794201625301998, + "grad_norm": 1.7104485034942627, + "learning_rate": 5e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7042163610458374, + "num_tokens": 207190074.0, + "step": 8008 + }, + { + "epoch": 0.8795299802328135, + "grad_norm": 1.6990981101989746, + "learning_rate": 5e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.706116259098053, + "num_tokens": 207215437.0, + "step": 8009 + }, + { + "epoch": 0.8796397979354272, + "grad_norm": 1.8207639455795288, + "learning_rate": 5e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7141872048377991, + "num_tokens": 207238958.0, + "step": 8010 + }, + { + "epoch": 0.8797496156380409, + "grad_norm": 1.7341885566711426, + "learning_rate": 5e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7115544080734253, + "num_tokens": 207265203.0, + "step": 8011 + }, + { + "epoch": 0.8798594333406545, + "grad_norm": 1.7233127355575562, + "learning_rate": 5e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6966218948364258, + "num_tokens": 207294601.0, + "step": 8012 + }, + { + "epoch": 0.8799692510432682, + "grad_norm": 1.9708120822906494, + "learning_rate": 5e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7001771926879883, + "num_tokens": 207315509.0, + "step": 8013 + }, + { + "epoch": 0.8800790687458818, + "grad_norm": 1.651489019393921, + "learning_rate": 5e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6959127187728882, + "num_tokens": 207345249.0, + "step": 8014 + }, + { + "epoch": 0.8801888864484955, + "grad_norm": 1.7114388942718506, + "learning_rate": 5e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7209647297859192, + "num_tokens": 207370741.0, + "step": 8015 + }, + { + "epoch": 0.8802987041511091, + "grad_norm": 1.7644342184066772, + "learning_rate": 5e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7110099792480469, + "num_tokens": 207394794.0, + "step": 8016 + }, + { + "epoch": 0.8804085218537229, + "grad_norm": 1.810028076171875, + "learning_rate": 5e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7083498239517212, + "num_tokens": 207418031.0, + "step": 8017 + }, + { + "epoch": 0.8805183395563365, + "grad_norm": 1.7463839054107666, + "learning_rate": 5e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6955853700637817, + "num_tokens": 207445888.0, + "step": 8018 + }, + { + "epoch": 0.8806281572589502, + "grad_norm": 1.9014207124710083, + "learning_rate": 5e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6886644959449768, + "num_tokens": 207469385.0, + "step": 8019 + }, + { + "epoch": 0.8807379749615638, + "grad_norm": 1.7051328420639038, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7187178134918213, + "num_tokens": 207495365.0, + "step": 8020 + }, + { + "epoch": 0.8808477926641775, + "grad_norm": 1.5692169666290283, + "learning_rate": 5e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7015725374221802, + "num_tokens": 207525025.0, + "step": 8021 + }, + { + "epoch": 0.8809576103667911, + "grad_norm": 1.7138780355453491, + "learning_rate": 5e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6842983365058899, + "num_tokens": 207553187.0, + "step": 8022 + }, + { + "epoch": 0.8810674280694047, + "grad_norm": 2.013298988342285, + "learning_rate": 5e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7026684284210205, + "num_tokens": 207573417.0, + "step": 8023 + }, + { + "epoch": 0.8811772457720185, + "grad_norm": 2.0111582279205322, + "learning_rate": 5e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.695223331451416, + "num_tokens": 207594913.0, + "step": 8024 + }, + { + "epoch": 0.8812870634746321, + "grad_norm": 1.8365061283111572, + "learning_rate": 5e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7065167427062988, + "num_tokens": 207617641.0, + "step": 8025 + }, + { + "epoch": 0.8813968811772458, + "grad_norm": 1.8528046607971191, + "learning_rate": 5e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7075858116149902, + "num_tokens": 207641657.0, + "step": 8026 + }, + { + "epoch": 0.8815066988798594, + "grad_norm": 1.6842724084854126, + "learning_rate": 5e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7108469009399414, + "num_tokens": 207669572.0, + "step": 8027 + }, + { + "epoch": 0.8816165165824731, + "grad_norm": 1.8444774150848389, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7145337462425232, + "num_tokens": 207693702.0, + "step": 8028 + }, + { + "epoch": 0.8817263342850867, + "grad_norm": 2.1061410903930664, + "learning_rate": 5e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7166931629180908, + "num_tokens": 207713312.0, + "step": 8029 + }, + { + "epoch": 0.8818361519877004, + "grad_norm": 1.625396490097046, + "learning_rate": 5e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7214740514755249, + "num_tokens": 207741918.0, + "step": 8030 + }, + { + "epoch": 0.881945969690314, + "grad_norm": 1.854303002357483, + "learning_rate": 5e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7089753150939941, + "num_tokens": 207763074.0, + "step": 8031 + }, + { + "epoch": 0.8820557873929278, + "grad_norm": 2.015430450439453, + "learning_rate": 5e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7051458358764648, + "num_tokens": 207783539.0, + "step": 8032 + }, + { + "epoch": 0.8821656050955414, + "grad_norm": 1.948325514793396, + "learning_rate": 5e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6988205909729004, + "num_tokens": 207807493.0, + "step": 8033 + }, + { + "epoch": 0.8822754227981551, + "grad_norm": 1.9602965116500854, + "learning_rate": 5e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.706089437007904, + "num_tokens": 207831078.0, + "step": 8034 + }, + { + "epoch": 0.8823852405007687, + "grad_norm": 1.7602386474609375, + "learning_rate": 5e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7158860564231873, + "num_tokens": 207857562.0, + "step": 8035 + }, + { + "epoch": 0.8824950582033824, + "grad_norm": 1.5021491050720215, + "learning_rate": 5e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7222256660461426, + "num_tokens": 207888993.0, + "step": 8036 + }, + { + "epoch": 0.882604875905996, + "grad_norm": 1.8070825338363647, + "learning_rate": 5e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7032999992370605, + "num_tokens": 207912460.0, + "step": 8037 + }, + { + "epoch": 0.8827146936086097, + "grad_norm": 1.7446869611740112, + "learning_rate": 5e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7170883417129517, + "num_tokens": 207936878.0, + "step": 8038 + }, + { + "epoch": 0.8828245113112234, + "grad_norm": 1.8367304801940918, + "learning_rate": 5e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7114087343215942, + "num_tokens": 207961185.0, + "step": 8039 + }, + { + "epoch": 0.8829343290138371, + "grad_norm": 2.041364908218384, + "learning_rate": 5e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6996326446533203, + "num_tokens": 207984484.0, + "step": 8040 + }, + { + "epoch": 0.8830441467164507, + "grad_norm": 2.0844340324401855, + "learning_rate": 5e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7065214514732361, + "num_tokens": 208006172.0, + "step": 8041 + }, + { + "epoch": 0.8831539644190644, + "grad_norm": 1.6625783443450928, + "learning_rate": 5e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.704252302646637, + "num_tokens": 208035435.0, + "step": 8042 + }, + { + "epoch": 0.883263782121678, + "grad_norm": 1.7562330961227417, + "learning_rate": 5e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7274040579795837, + "num_tokens": 208057949.0, + "step": 8043 + }, + { + "epoch": 0.8833735998242916, + "grad_norm": 1.879550576210022, + "learning_rate": 5e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6788182258605957, + "num_tokens": 208081506.0, + "step": 8044 + }, + { + "epoch": 0.8834834175269053, + "grad_norm": 1.7321605682373047, + "learning_rate": 5e-06, + "loss": 0.979, + "mean_token_accuracy": 0.705898642539978, + "num_tokens": 208111511.0, + "step": 8045 + }, + { + "epoch": 0.883593235229519, + "grad_norm": 1.8290613889694214, + "learning_rate": 5e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7177245616912842, + "num_tokens": 208136980.0, + "step": 8046 + }, + { + "epoch": 0.8837030529321327, + "grad_norm": 1.5674442052841187, + "learning_rate": 5e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6962777376174927, + "num_tokens": 208169456.0, + "step": 8047 + }, + { + "epoch": 0.8838128706347463, + "grad_norm": 1.791601300239563, + "learning_rate": 5e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.718446671962738, + "num_tokens": 208196198.0, + "step": 8048 + }, + { + "epoch": 0.88392268833736, + "grad_norm": 1.6952075958251953, + "learning_rate": 5e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.709445059299469, + "num_tokens": 208224573.0, + "step": 8049 + }, + { + "epoch": 0.8840325060399736, + "grad_norm": 1.7239313125610352, + "learning_rate": 5e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.6969977617263794, + "num_tokens": 208254468.0, + "step": 8050 + }, + { + "epoch": 0.8841423237425873, + "grad_norm": 1.8498250246047974, + "learning_rate": 5e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7054075002670288, + "num_tokens": 208280864.0, + "step": 8051 + }, + { + "epoch": 0.8842521414452009, + "grad_norm": 1.923271894454956, + "learning_rate": 5e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.712360143661499, + "num_tokens": 208303400.0, + "step": 8052 + }, + { + "epoch": 0.8843619591478147, + "grad_norm": 1.5838792324066162, + "learning_rate": 5e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7021785974502563, + "num_tokens": 208332043.0, + "step": 8053 + }, + { + "epoch": 0.8844717768504283, + "grad_norm": 1.8119280338287354, + "learning_rate": 5e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7084509134292603, + "num_tokens": 208357272.0, + "step": 8054 + }, + { + "epoch": 0.884581594553042, + "grad_norm": 1.742154836654663, + "learning_rate": 5e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6918702125549316, + "num_tokens": 208385720.0, + "step": 8055 + }, + { + "epoch": 0.8846914122556556, + "grad_norm": 1.8688421249389648, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7229727506637573, + "num_tokens": 208409345.0, + "step": 8056 + }, + { + "epoch": 0.8848012299582693, + "grad_norm": 2.0463645458221436, + "learning_rate": 5e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7187613248825073, + "num_tokens": 208428856.0, + "step": 8057 + }, + { + "epoch": 0.8849110476608829, + "grad_norm": 1.7454928159713745, + "learning_rate": 5e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.715834379196167, + "num_tokens": 208456215.0, + "step": 8058 + }, + { + "epoch": 0.8850208653634966, + "grad_norm": 1.8700354099273682, + "learning_rate": 5e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7146205902099609, + "num_tokens": 208480805.0, + "step": 8059 + }, + { + "epoch": 0.8851306830661102, + "grad_norm": 2.019155263900757, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7272399067878723, + "num_tokens": 208500226.0, + "step": 8060 + }, + { + "epoch": 0.885240500768724, + "grad_norm": 1.8020119667053223, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7185564637184143, + "num_tokens": 208522457.0, + "step": 8061 + }, + { + "epoch": 0.8853503184713376, + "grad_norm": 1.9149322509765625, + "learning_rate": 5e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7140464782714844, + "num_tokens": 208544842.0, + "step": 8062 + }, + { + "epoch": 0.8854601361739513, + "grad_norm": 1.7850276231765747, + "learning_rate": 5e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7024708986282349, + "num_tokens": 208572279.0, + "step": 8063 + }, + { + "epoch": 0.8855699538765649, + "grad_norm": 1.6242254972457886, + "learning_rate": 5e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7167035341262817, + "num_tokens": 208602598.0, + "step": 8064 + }, + { + "epoch": 0.8856797715791785, + "grad_norm": 1.6876795291900635, + "learning_rate": 5e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.6950454115867615, + "num_tokens": 208630362.0, + "step": 8065 + }, + { + "epoch": 0.8857895892817922, + "grad_norm": 1.8455791473388672, + "learning_rate": 5e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.714666485786438, + "num_tokens": 208653484.0, + "step": 8066 + }, + { + "epoch": 0.8858994069844058, + "grad_norm": 1.5220600366592407, + "learning_rate": 5e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7118327617645264, + "num_tokens": 208689895.0, + "step": 8067 + }, + { + "epoch": 0.8860092246870196, + "grad_norm": 1.9019440412521362, + "learning_rate": 5e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7205747961997986, + "num_tokens": 208712801.0, + "step": 8068 + }, + { + "epoch": 0.8861190423896332, + "grad_norm": 1.8013060092926025, + "learning_rate": 5e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7041471004486084, + "num_tokens": 208742859.0, + "step": 8069 + }, + { + "epoch": 0.8862288600922469, + "grad_norm": 1.9708672761917114, + "learning_rate": 5e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6998909711837769, + "num_tokens": 208766665.0, + "step": 8070 + }, + { + "epoch": 0.8863386777948605, + "grad_norm": 1.711288571357727, + "learning_rate": 5e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6923984885215759, + "num_tokens": 208796250.0, + "step": 8071 + }, + { + "epoch": 0.8864484954974742, + "grad_norm": 1.600067377090454, + "learning_rate": 5e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6886984705924988, + "num_tokens": 208828008.0, + "step": 8072 + }, + { + "epoch": 0.8865583132000878, + "grad_norm": 1.813883900642395, + "learning_rate": 5e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6969321966171265, + "num_tokens": 208853919.0, + "step": 8073 + }, + { + "epoch": 0.8866681309027015, + "grad_norm": 1.8534455299377441, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7288843989372253, + "num_tokens": 208874749.0, + "step": 8074 + }, + { + "epoch": 0.8867779486053152, + "grad_norm": 1.6870851516723633, + "learning_rate": 5e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7327462434768677, + "num_tokens": 208902890.0, + "step": 8075 + }, + { + "epoch": 0.8868877663079289, + "grad_norm": 1.7430639266967773, + "learning_rate": 5e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7199357748031616, + "num_tokens": 208927709.0, + "step": 8076 + }, + { + "epoch": 0.8869975840105425, + "grad_norm": 1.7112281322479248, + "learning_rate": 5e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6945638060569763, + "num_tokens": 208956084.0, + "step": 8077 + }, + { + "epoch": 0.8871074017131562, + "grad_norm": 1.7051317691802979, + "learning_rate": 5e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7043771743774414, + "num_tokens": 208983875.0, + "step": 8078 + }, + { + "epoch": 0.8872172194157698, + "grad_norm": 1.824203610420227, + "learning_rate": 5e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7075790166854858, + "num_tokens": 209006614.0, + "step": 8079 + }, + { + "epoch": 0.8873270371183835, + "grad_norm": 1.5362920761108398, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7387797832489014, + "num_tokens": 209039139.0, + "step": 8080 + }, + { + "epoch": 0.8874368548209971, + "grad_norm": 1.7542338371276855, + "learning_rate": 5e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7169722318649292, + "num_tokens": 209064872.0, + "step": 8081 + }, + { + "epoch": 0.8875466725236109, + "grad_norm": 1.7356144189834595, + "learning_rate": 5e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6960086822509766, + "num_tokens": 209090691.0, + "step": 8082 + }, + { + "epoch": 0.8876564902262245, + "grad_norm": 1.6746797561645508, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7184947729110718, + "num_tokens": 209118975.0, + "step": 8083 + }, + { + "epoch": 0.8877663079288381, + "grad_norm": 1.8077757358551025, + "learning_rate": 5e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.6994614601135254, + "num_tokens": 209147472.0, + "step": 8084 + }, + { + "epoch": 0.8878761256314518, + "grad_norm": 1.6251145601272583, + "learning_rate": 5e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7152107357978821, + "num_tokens": 209176134.0, + "step": 8085 + }, + { + "epoch": 0.8879859433340654, + "grad_norm": 1.8420615196228027, + "learning_rate": 5e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.6993576288223267, + "num_tokens": 209199701.0, + "step": 8086 + }, + { + "epoch": 0.8880957610366791, + "grad_norm": 1.8227686882019043, + "learning_rate": 5e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7290269136428833, + "num_tokens": 209221617.0, + "step": 8087 + }, + { + "epoch": 0.8882055787392927, + "grad_norm": 1.960972547531128, + "learning_rate": 5e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7079123258590698, + "num_tokens": 209243794.0, + "step": 8088 + }, + { + "epoch": 0.8883153964419065, + "grad_norm": 1.7854459285736084, + "learning_rate": 5e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7103808522224426, + "num_tokens": 209268235.0, + "step": 8089 + }, + { + "epoch": 0.8884252141445201, + "grad_norm": 1.7712314128875732, + "learning_rate": 5e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.693950891494751, + "num_tokens": 209293832.0, + "step": 8090 + }, + { + "epoch": 0.8885350318471338, + "grad_norm": 1.8882473707199097, + "learning_rate": 5e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6886622309684753, + "num_tokens": 209319139.0, + "step": 8091 + }, + { + "epoch": 0.8886448495497474, + "grad_norm": 1.8881157636642456, + "learning_rate": 5e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7074129581451416, + "num_tokens": 209342644.0, + "step": 8092 + }, + { + "epoch": 0.8887546672523611, + "grad_norm": 1.7400457859039307, + "learning_rate": 5e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.6939033269882202, + "num_tokens": 209369251.0, + "step": 8093 + }, + { + "epoch": 0.8888644849549747, + "grad_norm": 2.1227800846099854, + "learning_rate": 5e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7145941853523254, + "num_tokens": 209390298.0, + "step": 8094 + }, + { + "epoch": 0.8889743026575884, + "grad_norm": 1.848823070526123, + "learning_rate": 5e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7005425095558167, + "num_tokens": 209414265.0, + "step": 8095 + }, + { + "epoch": 0.889084120360202, + "grad_norm": 1.8705313205718994, + "learning_rate": 5e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7046558856964111, + "num_tokens": 209437950.0, + "step": 8096 + }, + { + "epoch": 0.8891939380628158, + "grad_norm": 1.728835940361023, + "learning_rate": 5e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7067335844039917, + "num_tokens": 209465219.0, + "step": 8097 + }, + { + "epoch": 0.8893037557654294, + "grad_norm": 1.7696541547775269, + "learning_rate": 5e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7118580341339111, + "num_tokens": 209491379.0, + "step": 8098 + }, + { + "epoch": 0.8894135734680431, + "grad_norm": 1.7609175443649292, + "learning_rate": 5e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6859981417655945, + "num_tokens": 209518179.0, + "step": 8099 + }, + { + "epoch": 0.8895233911706567, + "grad_norm": 1.7851877212524414, + "learning_rate": 5e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7017426490783691, + "num_tokens": 209544214.0, + "step": 8100 + }, + { + "epoch": 0.8896332088732704, + "grad_norm": 1.8431529998779297, + "learning_rate": 5e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7014378905296326, + "num_tokens": 209569905.0, + "step": 8101 + }, + { + "epoch": 0.889743026575884, + "grad_norm": 1.927240014076233, + "learning_rate": 5e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7109829783439636, + "num_tokens": 209592468.0, + "step": 8102 + }, + { + "epoch": 0.8898528442784976, + "grad_norm": 1.5816607475280762, + "learning_rate": 5e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7142921090126038, + "num_tokens": 209621788.0, + "step": 8103 + }, + { + "epoch": 0.8899626619811114, + "grad_norm": 1.7147307395935059, + "learning_rate": 5e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7161918878555298, + "num_tokens": 209652478.0, + "step": 8104 + }, + { + "epoch": 0.890072479683725, + "grad_norm": 1.9271897077560425, + "learning_rate": 5e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.6981872320175171, + "num_tokens": 209675416.0, + "step": 8105 + }, + { + "epoch": 0.8901822973863387, + "grad_norm": 1.8827377557754517, + "learning_rate": 5e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.704875111579895, + "num_tokens": 209699693.0, + "step": 8106 + }, + { + "epoch": 0.8902921150889523, + "grad_norm": 1.7320548295974731, + "learning_rate": 5e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7135390043258667, + "num_tokens": 209724453.0, + "step": 8107 + }, + { + "epoch": 0.890401932791566, + "grad_norm": 1.696460247039795, + "learning_rate": 5e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7079194784164429, + "num_tokens": 209753753.0, + "step": 8108 + }, + { + "epoch": 0.8905117504941796, + "grad_norm": 1.5795344114303589, + "learning_rate": 5e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7061347365379333, + "num_tokens": 209785224.0, + "step": 8109 + }, + { + "epoch": 0.8906215681967933, + "grad_norm": 1.8449734449386597, + "learning_rate": 5e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6866039037704468, + "num_tokens": 209809951.0, + "step": 8110 + }, + { + "epoch": 0.890731385899407, + "grad_norm": 1.8846039772033691, + "learning_rate": 5e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6905670762062073, + "num_tokens": 209836098.0, + "step": 8111 + }, + { + "epoch": 0.8908412036020207, + "grad_norm": 1.6881853342056274, + "learning_rate": 5e-06, + "loss": 0.897, + "mean_token_accuracy": 0.727685272693634, + "num_tokens": 209862559.0, + "step": 8112 + }, + { + "epoch": 0.8909510213046343, + "grad_norm": 1.6836854219436646, + "learning_rate": 5e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7004224061965942, + "num_tokens": 209889548.0, + "step": 8113 + }, + { + "epoch": 0.891060839007248, + "grad_norm": 1.5971297025680542, + "learning_rate": 5e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7199169993400574, + "num_tokens": 209917486.0, + "step": 8114 + }, + { + "epoch": 0.8911706567098616, + "grad_norm": 1.6834431886672974, + "learning_rate": 5e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6918734312057495, + "num_tokens": 209944980.0, + "step": 8115 + }, + { + "epoch": 0.8912804744124753, + "grad_norm": 1.708850383758545, + "learning_rate": 5e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7009140849113464, + "num_tokens": 209973470.0, + "step": 8116 + }, + { + "epoch": 0.8913902921150889, + "grad_norm": 1.994096040725708, + "learning_rate": 5e-06, + "loss": 0.967, + "mean_token_accuracy": 0.712705135345459, + "num_tokens": 209993794.0, + "step": 8117 + }, + { + "epoch": 0.8915001098177027, + "grad_norm": 1.535586953163147, + "learning_rate": 5e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6942949295043945, + "num_tokens": 210029142.0, + "step": 8118 + }, + { + "epoch": 0.8916099275203163, + "grad_norm": 2.0610971450805664, + "learning_rate": 5e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7227683663368225, + "num_tokens": 210049228.0, + "step": 8119 + }, + { + "epoch": 0.89171974522293, + "grad_norm": 1.8042478561401367, + "learning_rate": 5e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6926711201667786, + "num_tokens": 210074040.0, + "step": 8120 + }, + { + "epoch": 0.8918295629255436, + "grad_norm": 1.7832578420639038, + "learning_rate": 5e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6915013790130615, + "num_tokens": 210101211.0, + "step": 8121 + }, + { + "epoch": 0.8919393806281573, + "grad_norm": 1.737966775894165, + "learning_rate": 5e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6869091987609863, + "num_tokens": 210129865.0, + "step": 8122 + }, + { + "epoch": 0.8920491983307709, + "grad_norm": 1.6917731761932373, + "learning_rate": 5e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6911941766738892, + "num_tokens": 210159300.0, + "step": 8123 + }, + { + "epoch": 0.8921590160333845, + "grad_norm": 1.6671243906021118, + "learning_rate": 5e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.702302873134613, + "num_tokens": 210185916.0, + "step": 8124 + }, + { + "epoch": 0.8922688337359982, + "grad_norm": 1.6968401670455933, + "learning_rate": 5e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6951076984405518, + "num_tokens": 210216275.0, + "step": 8125 + }, + { + "epoch": 0.892378651438612, + "grad_norm": 1.941314458847046, + "learning_rate": 5e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7344084978103638, + "num_tokens": 210236402.0, + "step": 8126 + }, + { + "epoch": 0.8924884691412256, + "grad_norm": 1.801069736480713, + "learning_rate": 5e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7174473404884338, + "num_tokens": 210260420.0, + "step": 8127 + }, + { + "epoch": 0.8925982868438392, + "grad_norm": 1.7401841878890991, + "learning_rate": 5e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.715499758720398, + "num_tokens": 210284769.0, + "step": 8128 + }, + { + "epoch": 0.8927081045464529, + "grad_norm": 1.8378512859344482, + "learning_rate": 5e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7408134937286377, + "num_tokens": 210307500.0, + "step": 8129 + }, + { + "epoch": 0.8928179222490665, + "grad_norm": 1.6338595151901245, + "learning_rate": 5e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.698195219039917, + "num_tokens": 210339583.0, + "step": 8130 + }, + { + "epoch": 0.8929277399516802, + "grad_norm": 1.9342247247695923, + "learning_rate": 5e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6987506747245789, + "num_tokens": 210363837.0, + "step": 8131 + }, + { + "epoch": 0.8930375576542938, + "grad_norm": 1.7913423776626587, + "learning_rate": 5e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6819374561309814, + "num_tokens": 210389582.0, + "step": 8132 + }, + { + "epoch": 0.8931473753569076, + "grad_norm": 1.7433676719665527, + "learning_rate": 5e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7215894460678101, + "num_tokens": 210414943.0, + "step": 8133 + }, + { + "epoch": 0.8932571930595212, + "grad_norm": 1.7911328077316284, + "learning_rate": 5e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.713487982749939, + "num_tokens": 210440155.0, + "step": 8134 + }, + { + "epoch": 0.8933670107621349, + "grad_norm": 1.6684316396713257, + "learning_rate": 5e-06, + "loss": 0.924, + "mean_token_accuracy": 0.720133364200592, + "num_tokens": 210467899.0, + "step": 8135 + }, + { + "epoch": 0.8934768284647485, + "grad_norm": 1.7067785263061523, + "learning_rate": 5e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7055611610412598, + "num_tokens": 210498752.0, + "step": 8136 + }, + { + "epoch": 0.8935866461673622, + "grad_norm": 1.85904860496521, + "learning_rate": 5e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7178975343704224, + "num_tokens": 210520414.0, + "step": 8137 + }, + { + "epoch": 0.8936964638699758, + "grad_norm": 1.6524019241333008, + "learning_rate": 5e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.710885763168335, + "num_tokens": 210549927.0, + "step": 8138 + }, + { + "epoch": 0.8938062815725895, + "grad_norm": 1.747942566871643, + "learning_rate": 5e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6957264542579651, + "num_tokens": 210577653.0, + "step": 8139 + }, + { + "epoch": 0.8939160992752032, + "grad_norm": 1.623929738998413, + "learning_rate": 5e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7004398107528687, + "num_tokens": 210611249.0, + "step": 8140 + }, + { + "epoch": 0.8940259169778169, + "grad_norm": 1.6757723093032837, + "learning_rate": 5e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6936968564987183, + "num_tokens": 210638357.0, + "step": 8141 + }, + { + "epoch": 0.8941357346804305, + "grad_norm": 1.8312891721725464, + "learning_rate": 5e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7208251357078552, + "num_tokens": 210660954.0, + "step": 8142 + }, + { + "epoch": 0.8942455523830442, + "grad_norm": 2.0499653816223145, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7171311378479004, + "num_tokens": 210681970.0, + "step": 8143 + }, + { + "epoch": 0.8943553700856578, + "grad_norm": 1.7957911491394043, + "learning_rate": 5e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.699565052986145, + "num_tokens": 210708859.0, + "step": 8144 + }, + { + "epoch": 0.8944651877882714, + "grad_norm": 1.6942585706710815, + "learning_rate": 5e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7134870290756226, + "num_tokens": 210736510.0, + "step": 8145 + }, + { + "epoch": 0.8945750054908851, + "grad_norm": 1.8004194498062134, + "learning_rate": 5e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.6998603343963623, + "num_tokens": 210761327.0, + "step": 8146 + }, + { + "epoch": 0.8946848231934988, + "grad_norm": 1.8288062810897827, + "learning_rate": 5e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7194651365280151, + "num_tokens": 210786526.0, + "step": 8147 + }, + { + "epoch": 0.8947946408961125, + "grad_norm": 1.9920275211334229, + "learning_rate": 5e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7470114231109619, + "num_tokens": 210805160.0, + "step": 8148 + }, + { + "epoch": 0.8949044585987261, + "grad_norm": 1.6179537773132324, + "learning_rate": 5e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7061848640441895, + "num_tokens": 210836158.0, + "step": 8149 + }, + { + "epoch": 0.8950142763013398, + "grad_norm": 1.868932843208313, + "learning_rate": 5e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7199763655662537, + "num_tokens": 210858274.0, + "step": 8150 + }, + { + "epoch": 0.8951240940039534, + "grad_norm": 1.7102556228637695, + "learning_rate": 5e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.679822564125061, + "num_tokens": 210888443.0, + "step": 8151 + }, + { + "epoch": 0.8952339117065671, + "grad_norm": 1.792020559310913, + "learning_rate": 5e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6972048282623291, + "num_tokens": 210914847.0, + "step": 8152 + }, + { + "epoch": 0.8953437294091807, + "grad_norm": 1.57767915725708, + "learning_rate": 5e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6882652640342712, + "num_tokens": 210948091.0, + "step": 8153 + }, + { + "epoch": 0.8954535471117944, + "grad_norm": 1.7710164785385132, + "learning_rate": 5e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.711519181728363, + "num_tokens": 210973869.0, + "step": 8154 + }, + { + "epoch": 0.8955633648144081, + "grad_norm": 1.9471096992492676, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7307617664337158, + "num_tokens": 210994636.0, + "step": 8155 + }, + { + "epoch": 0.8956731825170218, + "grad_norm": 1.6329481601715088, + "learning_rate": 5e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6938439607620239, + "num_tokens": 211025442.0, + "step": 8156 + }, + { + "epoch": 0.8957830002196354, + "grad_norm": 1.7912018299102783, + "learning_rate": 5e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7093133926391602, + "num_tokens": 211050230.0, + "step": 8157 + }, + { + "epoch": 0.8958928179222491, + "grad_norm": 1.7420896291732788, + "learning_rate": 5e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6922703981399536, + "num_tokens": 211076216.0, + "step": 8158 + }, + { + "epoch": 0.8960026356248627, + "grad_norm": 1.7049717903137207, + "learning_rate": 5e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6974647045135498, + "num_tokens": 211103781.0, + "step": 8159 + }, + { + "epoch": 0.8961124533274764, + "grad_norm": 1.6957588195800781, + "learning_rate": 5e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7130342125892639, + "num_tokens": 211130168.0, + "step": 8160 + }, + { + "epoch": 0.89622227103009, + "grad_norm": 1.8082846403121948, + "learning_rate": 5e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.717836856842041, + "num_tokens": 211155087.0, + "step": 8161 + }, + { + "epoch": 0.8963320887327038, + "grad_norm": 1.7130041122436523, + "learning_rate": 5e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7131316661834717, + "num_tokens": 211178524.0, + "step": 8162 + }, + { + "epoch": 0.8964419064353174, + "grad_norm": 1.9407140016555786, + "learning_rate": 5e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.730126142501831, + "num_tokens": 211198868.0, + "step": 8163 + }, + { + "epoch": 0.896551724137931, + "grad_norm": 1.6935197114944458, + "learning_rate": 5e-06, + "loss": 1.1167, + "mean_token_accuracy": 0.6792056560516357, + "num_tokens": 211231111.0, + "step": 8164 + }, + { + "epoch": 0.8966615418405447, + "grad_norm": 2.2436437606811523, + "learning_rate": 5e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6988818645477295, + "num_tokens": 211251272.0, + "step": 8165 + }, + { + "epoch": 0.8967713595431583, + "grad_norm": 1.6390759944915771, + "learning_rate": 5e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.6948474049568176, + "num_tokens": 211278778.0, + "step": 8166 + }, + { + "epoch": 0.896881177245772, + "grad_norm": 1.7696456909179688, + "learning_rate": 5e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7002619504928589, + "num_tokens": 211303516.0, + "step": 8167 + }, + { + "epoch": 0.8969909949483856, + "grad_norm": 1.767264723777771, + "learning_rate": 5e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.69614577293396, + "num_tokens": 211329864.0, + "step": 8168 + }, + { + "epoch": 0.8971008126509994, + "grad_norm": 1.801488995552063, + "learning_rate": 5e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7139634490013123, + "num_tokens": 211356275.0, + "step": 8169 + }, + { + "epoch": 0.897210630353613, + "grad_norm": 1.9655951261520386, + "learning_rate": 5e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7258040904998779, + "num_tokens": 211376773.0, + "step": 8170 + }, + { + "epoch": 0.8973204480562267, + "grad_norm": 1.6231775283813477, + "learning_rate": 5e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7073764801025391, + "num_tokens": 211407926.0, + "step": 8171 + }, + { + "epoch": 0.8974302657588403, + "grad_norm": 1.7115577459335327, + "learning_rate": 5e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.6940691471099854, + "num_tokens": 211435857.0, + "step": 8172 + }, + { + "epoch": 0.897540083461454, + "grad_norm": 2.0749900341033936, + "learning_rate": 5e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.6977394819259644, + "num_tokens": 211455782.0, + "step": 8173 + }, + { + "epoch": 0.8976499011640676, + "grad_norm": 1.7872282266616821, + "learning_rate": 5e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.706247091293335, + "num_tokens": 211480035.0, + "step": 8174 + }, + { + "epoch": 0.8977597188666813, + "grad_norm": 1.8209834098815918, + "learning_rate": 5e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.718010425567627, + "num_tokens": 211502921.0, + "step": 8175 + }, + { + "epoch": 0.897869536569295, + "grad_norm": 1.7323217391967773, + "learning_rate": 5e-06, + "loss": 0.944, + "mean_token_accuracy": 0.710739016532898, + "num_tokens": 211529753.0, + "step": 8176 + }, + { + "epoch": 0.8979793542719087, + "grad_norm": 1.9377297163009644, + "learning_rate": 5e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.704873263835907, + "num_tokens": 211552134.0, + "step": 8177 + }, + { + "epoch": 0.8980891719745223, + "grad_norm": 1.6890945434570312, + "learning_rate": 5e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7328238487243652, + "num_tokens": 211579667.0, + "step": 8178 + }, + { + "epoch": 0.898198989677136, + "grad_norm": 1.7396844625473022, + "learning_rate": 5e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6948826909065247, + "num_tokens": 211606743.0, + "step": 8179 + }, + { + "epoch": 0.8983088073797496, + "grad_norm": 1.5434529781341553, + "learning_rate": 5e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7063945531845093, + "num_tokens": 211638940.0, + "step": 8180 + }, + { + "epoch": 0.8984186250823633, + "grad_norm": 1.80789053440094, + "learning_rate": 5e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6953961849212646, + "num_tokens": 211664680.0, + "step": 8181 + }, + { + "epoch": 0.8985284427849769, + "grad_norm": 1.8074488639831543, + "learning_rate": 5e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6962385177612305, + "num_tokens": 211689418.0, + "step": 8182 + }, + { + "epoch": 0.8986382604875905, + "grad_norm": 1.6843479871749878, + "learning_rate": 5e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7021690607070923, + "num_tokens": 211720523.0, + "step": 8183 + }, + { + "epoch": 0.8987480781902043, + "grad_norm": 1.6147677898406982, + "learning_rate": 5e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6948572993278503, + "num_tokens": 211750413.0, + "step": 8184 + }, + { + "epoch": 0.898857895892818, + "grad_norm": 1.8090156316757202, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7163382768630981, + "num_tokens": 211773857.0, + "step": 8185 + }, + { + "epoch": 0.8989677135954316, + "grad_norm": 1.664764642715454, + "learning_rate": 5e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6837694644927979, + "num_tokens": 211804510.0, + "step": 8186 + }, + { + "epoch": 0.8990775312980452, + "grad_norm": 1.8139851093292236, + "learning_rate": 5e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7191848754882812, + "num_tokens": 211827684.0, + "step": 8187 + }, + { + "epoch": 0.8991873490006589, + "grad_norm": 1.6430318355560303, + "learning_rate": 5e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7048367857933044, + "num_tokens": 211855827.0, + "step": 8188 + }, + { + "epoch": 0.8992971667032725, + "grad_norm": 1.7686322927474976, + "learning_rate": 5e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6935893297195435, + "num_tokens": 211884618.0, + "step": 8189 + }, + { + "epoch": 0.8994069844058862, + "grad_norm": 1.7405667304992676, + "learning_rate": 5e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7073583602905273, + "num_tokens": 211910186.0, + "step": 8190 + }, + { + "epoch": 0.8995168021084999, + "grad_norm": 1.7458372116088867, + "learning_rate": 5e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.7012656927108765, + "num_tokens": 211936426.0, + "step": 8191 + }, + { + "epoch": 0.8996266198111136, + "grad_norm": 1.6491268873214722, + "learning_rate": 5e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7041240930557251, + "num_tokens": 211963430.0, + "step": 8192 + }, + { + "epoch": 0.8997364375137272, + "grad_norm": 1.9492348432540894, + "learning_rate": 5e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.706784188747406, + "num_tokens": 211985138.0, + "step": 8193 + }, + { + "epoch": 0.8998462552163409, + "grad_norm": 1.6097478866577148, + "learning_rate": 5e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7285517454147339, + "num_tokens": 212011882.0, + "step": 8194 + }, + { + "epoch": 0.8999560729189545, + "grad_norm": 1.764210820198059, + "learning_rate": 5e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7049882411956787, + "num_tokens": 212038479.0, + "step": 8195 + }, + { + "epoch": 0.9000658906215682, + "grad_norm": 2.171661615371704, + "learning_rate": 5e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7154291272163391, + "num_tokens": 212055211.0, + "step": 8196 + }, + { + "epoch": 0.9001757083241818, + "grad_norm": 1.7634198665618896, + "learning_rate": 5e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7172744274139404, + "num_tokens": 212080791.0, + "step": 8197 + }, + { + "epoch": 0.9002855260267956, + "grad_norm": 1.6582459211349487, + "learning_rate": 5e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7056183815002441, + "num_tokens": 212112840.0, + "step": 8198 + }, + { + "epoch": 0.9003953437294092, + "grad_norm": 1.7276098728179932, + "learning_rate": 5e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6934572458267212, + "num_tokens": 212143149.0, + "step": 8199 + }, + { + "epoch": 0.9005051614320229, + "grad_norm": 1.6056435108184814, + "learning_rate": 5e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7013829946517944, + "num_tokens": 212174086.0, + "step": 8200 + }, + { + "epoch": 0.9006149791346365, + "grad_norm": 1.6466995477676392, + "learning_rate": 5e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7178965210914612, + "num_tokens": 212202355.0, + "step": 8201 + }, + { + "epoch": 0.9007247968372502, + "grad_norm": 1.8384788036346436, + "learning_rate": 5e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7130497097969055, + "num_tokens": 212226698.0, + "step": 8202 + }, + { + "epoch": 0.9008346145398638, + "grad_norm": 1.7735559940338135, + "learning_rate": 5e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7138913869857788, + "num_tokens": 212253508.0, + "step": 8203 + }, + { + "epoch": 0.9009444322424774, + "grad_norm": 1.7827491760253906, + "learning_rate": 5e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6889729499816895, + "num_tokens": 212281354.0, + "step": 8204 + }, + { + "epoch": 0.9010542499450912, + "grad_norm": 1.8970009088516235, + "learning_rate": 5e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.6967628002166748, + "num_tokens": 212307297.0, + "step": 8205 + }, + { + "epoch": 0.9011640676477048, + "grad_norm": 1.779869556427002, + "learning_rate": 5e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.703831136226654, + "num_tokens": 212333687.0, + "step": 8206 + }, + { + "epoch": 0.9012738853503185, + "grad_norm": 1.576744556427002, + "learning_rate": 5e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6856383681297302, + "num_tokens": 212364291.0, + "step": 8207 + }, + { + "epoch": 0.9013837030529321, + "grad_norm": 1.872663140296936, + "learning_rate": 5e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6879000663757324, + "num_tokens": 212387404.0, + "step": 8208 + }, + { + "epoch": 0.9014935207555458, + "grad_norm": 1.7066980600357056, + "learning_rate": 5e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7222310304641724, + "num_tokens": 212415466.0, + "step": 8209 + }, + { + "epoch": 0.9016033384581594, + "grad_norm": 1.7646862268447876, + "learning_rate": 5e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7168213129043579, + "num_tokens": 212441793.0, + "step": 8210 + }, + { + "epoch": 0.9017131561607731, + "grad_norm": 1.79099702835083, + "learning_rate": 5e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7192561030387878, + "num_tokens": 212465440.0, + "step": 8211 + }, + { + "epoch": 0.9018229738633867, + "grad_norm": 1.710404396057129, + "learning_rate": 5e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7224278450012207, + "num_tokens": 212492887.0, + "step": 8212 + }, + { + "epoch": 0.9019327915660005, + "grad_norm": 1.8478447198867798, + "learning_rate": 5e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7035291194915771, + "num_tokens": 212516884.0, + "step": 8213 + }, + { + "epoch": 0.9020426092686141, + "grad_norm": 1.9049665927886963, + "learning_rate": 5e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7256335616111755, + "num_tokens": 212540064.0, + "step": 8214 + }, + { + "epoch": 0.9021524269712278, + "grad_norm": 1.8057059049606323, + "learning_rate": 5e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7051736116409302, + "num_tokens": 212566886.0, + "step": 8215 + }, + { + "epoch": 0.9022622446738414, + "grad_norm": 1.6606532335281372, + "learning_rate": 5e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7108682990074158, + "num_tokens": 212596268.0, + "step": 8216 + }, + { + "epoch": 0.9023720623764551, + "grad_norm": 1.6592309474945068, + "learning_rate": 5e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7077713012695312, + "num_tokens": 212624556.0, + "step": 8217 + }, + { + "epoch": 0.9024818800790687, + "grad_norm": 1.6599735021591187, + "learning_rate": 5e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.694449782371521, + "num_tokens": 212653086.0, + "step": 8218 + }, + { + "epoch": 0.9025916977816824, + "grad_norm": 1.774643063545227, + "learning_rate": 5e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7072123289108276, + "num_tokens": 212681040.0, + "step": 8219 + }, + { + "epoch": 0.9027015154842961, + "grad_norm": 1.8351014852523804, + "learning_rate": 5e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7020742297172546, + "num_tokens": 212705080.0, + "step": 8220 + }, + { + "epoch": 0.9028113331869098, + "grad_norm": 1.8431177139282227, + "learning_rate": 5e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.734169065952301, + "num_tokens": 212727356.0, + "step": 8221 + }, + { + "epoch": 0.9029211508895234, + "grad_norm": 1.8059138059616089, + "learning_rate": 5e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7082035541534424, + "num_tokens": 212752252.0, + "step": 8222 + }, + { + "epoch": 0.903030968592137, + "grad_norm": 1.7028911113739014, + "learning_rate": 5e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.6913223266601562, + "num_tokens": 212779790.0, + "step": 8223 + }, + { + "epoch": 0.9031407862947507, + "grad_norm": 1.8236901760101318, + "learning_rate": 5e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.722084641456604, + "num_tokens": 212802763.0, + "step": 8224 + }, + { + "epoch": 0.9032506039973643, + "grad_norm": 1.5726585388183594, + "learning_rate": 5e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7142184376716614, + "num_tokens": 212833265.0, + "step": 8225 + }, + { + "epoch": 0.903360421699978, + "grad_norm": 1.5836881399154663, + "learning_rate": 5e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7153317928314209, + "num_tokens": 212861765.0, + "step": 8226 + }, + { + "epoch": 0.9034702394025917, + "grad_norm": 1.7794744968414307, + "learning_rate": 5e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7071470022201538, + "num_tokens": 212887694.0, + "step": 8227 + }, + { + "epoch": 0.9035800571052054, + "grad_norm": 1.8061412572860718, + "learning_rate": 5e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7178384065628052, + "num_tokens": 212913175.0, + "step": 8228 + }, + { + "epoch": 0.903689874807819, + "grad_norm": 1.7123186588287354, + "learning_rate": 5e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6880747079849243, + "num_tokens": 212943419.0, + "step": 8229 + }, + { + "epoch": 0.9037996925104327, + "grad_norm": 1.8673473596572876, + "learning_rate": 5e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7370726466178894, + "num_tokens": 212965798.0, + "step": 8230 + }, + { + "epoch": 0.9039095102130463, + "grad_norm": 1.7379018068313599, + "learning_rate": 5e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6893740296363831, + "num_tokens": 212993949.0, + "step": 8231 + }, + { + "epoch": 0.90401932791566, + "grad_norm": 1.6377378702163696, + "learning_rate": 5e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7175676822662354, + "num_tokens": 213020945.0, + "step": 8232 + }, + { + "epoch": 0.9041291456182736, + "grad_norm": 1.6487245559692383, + "learning_rate": 5e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7185604572296143, + "num_tokens": 213049859.0, + "step": 8233 + }, + { + "epoch": 0.9042389633208874, + "grad_norm": 1.9170376062393188, + "learning_rate": 5e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7117707133293152, + "num_tokens": 213071573.0, + "step": 8234 + }, + { + "epoch": 0.904348781023501, + "grad_norm": 1.6346982717514038, + "learning_rate": 5e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7398006916046143, + "num_tokens": 213097916.0, + "step": 8235 + }, + { + "epoch": 0.9044585987261147, + "grad_norm": 1.7362480163574219, + "learning_rate": 5e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7180789709091187, + "num_tokens": 213121787.0, + "step": 8236 + }, + { + "epoch": 0.9045684164287283, + "grad_norm": 1.5929512977600098, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7339332103729248, + "num_tokens": 213148428.0, + "step": 8237 + }, + { + "epoch": 0.904678234131342, + "grad_norm": 1.728944182395935, + "learning_rate": 5e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7037996053695679, + "num_tokens": 213174547.0, + "step": 8238 + }, + { + "epoch": 0.9047880518339556, + "grad_norm": 1.8306996822357178, + "learning_rate": 5e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7425862550735474, + "num_tokens": 213195433.0, + "step": 8239 + }, + { + "epoch": 0.9048978695365693, + "grad_norm": 1.7715898752212524, + "learning_rate": 5e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7196168899536133, + "num_tokens": 213219406.0, + "step": 8240 + }, + { + "epoch": 0.9050076872391829, + "grad_norm": 1.5895949602127075, + "learning_rate": 5e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.705649733543396, + "num_tokens": 213248500.0, + "step": 8241 + }, + { + "epoch": 0.9051175049417967, + "grad_norm": 1.7661606073379517, + "learning_rate": 5e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7196109294891357, + "num_tokens": 213272185.0, + "step": 8242 + }, + { + "epoch": 0.9052273226444103, + "grad_norm": 1.96924889087677, + "learning_rate": 5e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7172797918319702, + "num_tokens": 213291705.0, + "step": 8243 + }, + { + "epoch": 0.905337140347024, + "grad_norm": 1.9041489362716675, + "learning_rate": 5e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7362788915634155, + "num_tokens": 213313311.0, + "step": 8244 + }, + { + "epoch": 0.9054469580496376, + "grad_norm": 1.8393234014511108, + "learning_rate": 5e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7168660163879395, + "num_tokens": 213337586.0, + "step": 8245 + }, + { + "epoch": 0.9055567757522512, + "grad_norm": 1.6916780471801758, + "learning_rate": 5e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.6965972185134888, + "num_tokens": 213368492.0, + "step": 8246 + }, + { + "epoch": 0.9056665934548649, + "grad_norm": 1.911595344543457, + "learning_rate": 5e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7211657166481018, + "num_tokens": 213390286.0, + "step": 8247 + }, + { + "epoch": 0.9057764111574785, + "grad_norm": 1.8807038068771362, + "learning_rate": 5e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7294013500213623, + "num_tokens": 213411697.0, + "step": 8248 + }, + { + "epoch": 0.9058862288600923, + "grad_norm": 1.6326555013656616, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7115983963012695, + "num_tokens": 213441066.0, + "step": 8249 + }, + { + "epoch": 0.9059960465627059, + "grad_norm": 1.6687984466552734, + "learning_rate": 5e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.715456485748291, + "num_tokens": 213468557.0, + "step": 8250 + }, + { + "epoch": 0.9061058642653196, + "grad_norm": 1.7749483585357666, + "learning_rate": 5e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7075594067573547, + "num_tokens": 213494466.0, + "step": 8251 + }, + { + "epoch": 0.9062156819679332, + "grad_norm": 1.7067968845367432, + "learning_rate": 5e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6955019235610962, + "num_tokens": 213521621.0, + "step": 8252 + }, + { + "epoch": 0.9063254996705469, + "grad_norm": 1.8484177589416504, + "learning_rate": 5e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6907413005828857, + "num_tokens": 213547720.0, + "step": 8253 + }, + { + "epoch": 0.9064353173731605, + "grad_norm": 1.8236629962921143, + "learning_rate": 5e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6859920024871826, + "num_tokens": 213574636.0, + "step": 8254 + }, + { + "epoch": 0.9065451350757742, + "grad_norm": 1.9339865446090698, + "learning_rate": 5e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.709755539894104, + "num_tokens": 213600926.0, + "step": 8255 + }, + { + "epoch": 0.9066549527783879, + "grad_norm": 2.070984125137329, + "learning_rate": 5e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6958600282669067, + "num_tokens": 213623710.0, + "step": 8256 + }, + { + "epoch": 0.9067647704810016, + "grad_norm": 1.6522068977355957, + "learning_rate": 5e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6834900379180908, + "num_tokens": 213655201.0, + "step": 8257 + }, + { + "epoch": 0.9068745881836152, + "grad_norm": 1.6101678609848022, + "learning_rate": 5e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6927546262741089, + "num_tokens": 213686669.0, + "step": 8258 + }, + { + "epoch": 0.9069844058862289, + "grad_norm": 1.7294119596481323, + "learning_rate": 5e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7332600355148315, + "num_tokens": 213711551.0, + "step": 8259 + }, + { + "epoch": 0.9070942235888425, + "grad_norm": 1.6509284973144531, + "learning_rate": 5e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7047396898269653, + "num_tokens": 213739155.0, + "step": 8260 + }, + { + "epoch": 0.9072040412914562, + "grad_norm": 1.5740774869918823, + "learning_rate": 5e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.724795937538147, + "num_tokens": 213773708.0, + "step": 8261 + }, + { + "epoch": 0.9073138589940698, + "grad_norm": 1.9159903526306152, + "learning_rate": 5e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7501819729804993, + "num_tokens": 213794960.0, + "step": 8262 + }, + { + "epoch": 0.9074236766966836, + "grad_norm": 1.7107919454574585, + "learning_rate": 5e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7341009974479675, + "num_tokens": 213822426.0, + "step": 8263 + }, + { + "epoch": 0.9075334943992972, + "grad_norm": 1.9567933082580566, + "learning_rate": 5e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.732489824295044, + "num_tokens": 213843223.0, + "step": 8264 + }, + { + "epoch": 0.9076433121019108, + "grad_norm": 1.674368143081665, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7140759229660034, + "num_tokens": 213870152.0, + "step": 8265 + }, + { + "epoch": 0.9077531298045245, + "grad_norm": 1.6718487739562988, + "learning_rate": 5e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7073642015457153, + "num_tokens": 213900322.0, + "step": 8266 + }, + { + "epoch": 0.9078629475071381, + "grad_norm": 1.8491497039794922, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7117987871170044, + "num_tokens": 213923049.0, + "step": 8267 + }, + { + "epoch": 0.9079727652097518, + "grad_norm": 1.7871249914169312, + "learning_rate": 5e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7022302150726318, + "num_tokens": 213947881.0, + "step": 8268 + }, + { + "epoch": 0.9080825829123654, + "grad_norm": 1.6490700244903564, + "learning_rate": 5e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7298036813735962, + "num_tokens": 213974173.0, + "step": 8269 + }, + { + "epoch": 0.9081924006149792, + "grad_norm": 1.8931739330291748, + "learning_rate": 5e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7194241881370544, + "num_tokens": 213996643.0, + "step": 8270 + }, + { + "epoch": 0.9083022183175928, + "grad_norm": 1.6656019687652588, + "learning_rate": 5e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7020683884620667, + "num_tokens": 214027015.0, + "step": 8271 + }, + { + "epoch": 0.9084120360202065, + "grad_norm": 1.7864094972610474, + "learning_rate": 5e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7032512426376343, + "num_tokens": 214053806.0, + "step": 8272 + }, + { + "epoch": 0.9085218537228201, + "grad_norm": 1.8497201204299927, + "learning_rate": 5e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7022848129272461, + "num_tokens": 214077612.0, + "step": 8273 + }, + { + "epoch": 0.9086316714254338, + "grad_norm": 1.7441315650939941, + "learning_rate": 5e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7165176868438721, + "num_tokens": 214103005.0, + "step": 8274 + }, + { + "epoch": 0.9087414891280474, + "grad_norm": 1.9049478769302368, + "learning_rate": 5e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6984947919845581, + "num_tokens": 214125840.0, + "step": 8275 + }, + { + "epoch": 0.9088513068306611, + "grad_norm": 1.854026198387146, + "learning_rate": 5e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6847147941589355, + "num_tokens": 214150596.0, + "step": 8276 + }, + { + "epoch": 0.9089611245332747, + "grad_norm": 1.7717058658599854, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7233964204788208, + "num_tokens": 214174255.0, + "step": 8277 + }, + { + "epoch": 0.9090709422358885, + "grad_norm": 1.8522727489471436, + "learning_rate": 5e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.710638701915741, + "num_tokens": 214197144.0, + "step": 8278 + }, + { + "epoch": 0.9091807599385021, + "grad_norm": 1.8185474872589111, + "learning_rate": 5e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7063119411468506, + "num_tokens": 214222955.0, + "step": 8279 + }, + { + "epoch": 0.9092905776411158, + "grad_norm": 1.7231404781341553, + "learning_rate": 5e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.7011377215385437, + "num_tokens": 214247133.0, + "step": 8280 + }, + { + "epoch": 0.9094003953437294, + "grad_norm": 1.8033573627471924, + "learning_rate": 5e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7277436256408691, + "num_tokens": 214270830.0, + "step": 8281 + }, + { + "epoch": 0.909510213046343, + "grad_norm": 1.7608678340911865, + "learning_rate": 5e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7192086577415466, + "num_tokens": 214296842.0, + "step": 8282 + }, + { + "epoch": 0.9096200307489567, + "grad_norm": 1.7595878839492798, + "learning_rate": 5e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6818822622299194, + "num_tokens": 214324567.0, + "step": 8283 + }, + { + "epoch": 0.9097298484515703, + "grad_norm": 1.6465176343917847, + "learning_rate": 5e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7220022082328796, + "num_tokens": 214350621.0, + "step": 8284 + }, + { + "epoch": 0.9098396661541841, + "grad_norm": 1.9865447282791138, + "learning_rate": 5e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.692182183265686, + "num_tokens": 214373281.0, + "step": 8285 + }, + { + "epoch": 0.9099494838567977, + "grad_norm": 1.8124138116836548, + "learning_rate": 5e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.713503897190094, + "num_tokens": 214399358.0, + "step": 8286 + }, + { + "epoch": 0.9100593015594114, + "grad_norm": 1.651962399482727, + "learning_rate": 5e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6847193837165833, + "num_tokens": 214428617.0, + "step": 8287 + }, + { + "epoch": 0.910169119262025, + "grad_norm": 1.714533805847168, + "learning_rate": 5e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7036494612693787, + "num_tokens": 214457394.0, + "step": 8288 + }, + { + "epoch": 0.9102789369646387, + "grad_norm": 1.7087037563323975, + "learning_rate": 5e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7083296775817871, + "num_tokens": 214483758.0, + "step": 8289 + }, + { + "epoch": 0.9103887546672523, + "grad_norm": 2.2368903160095215, + "learning_rate": 5e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7265544533729553, + "num_tokens": 214502912.0, + "step": 8290 + }, + { + "epoch": 0.910498572369866, + "grad_norm": 1.9559122323989868, + "learning_rate": 5e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7180715203285217, + "num_tokens": 214523248.0, + "step": 8291 + }, + { + "epoch": 0.9106083900724797, + "grad_norm": 1.8281898498535156, + "learning_rate": 5e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7189199924468994, + "num_tokens": 214547326.0, + "step": 8292 + }, + { + "epoch": 0.9107182077750934, + "grad_norm": 1.7515535354614258, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.6988235712051392, + "num_tokens": 214573025.0, + "step": 8293 + }, + { + "epoch": 0.910828025477707, + "grad_norm": 1.921578288078308, + "learning_rate": 5e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7192964553833008, + "num_tokens": 214593872.0, + "step": 8294 + }, + { + "epoch": 0.9109378431803207, + "grad_norm": 1.804793357849121, + "learning_rate": 5e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7357653975486755, + "num_tokens": 214616955.0, + "step": 8295 + }, + { + "epoch": 0.9110476608829343, + "grad_norm": 1.8137911558151245, + "learning_rate": 5e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6963135004043579, + "num_tokens": 214642603.0, + "step": 8296 + }, + { + "epoch": 0.911157478585548, + "grad_norm": 1.69373619556427, + "learning_rate": 5e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7280356884002686, + "num_tokens": 214670301.0, + "step": 8297 + }, + { + "epoch": 0.9112672962881616, + "grad_norm": 2.125162124633789, + "learning_rate": 5e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7234692573547363, + "num_tokens": 214689772.0, + "step": 8298 + }, + { + "epoch": 0.9113771139907754, + "grad_norm": 1.9816256761550903, + "learning_rate": 5e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7122231721878052, + "num_tokens": 214711357.0, + "step": 8299 + }, + { + "epoch": 0.911486931693389, + "grad_norm": 1.7267214059829712, + "learning_rate": 5e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7255575060844421, + "num_tokens": 214738283.0, + "step": 8300 + }, + { + "epoch": 0.9115967493960027, + "grad_norm": 1.746084213256836, + "learning_rate": 5e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6895493268966675, + "num_tokens": 214766238.0, + "step": 8301 + }, + { + "epoch": 0.9117065670986163, + "grad_norm": 1.7625869512557983, + "learning_rate": 5e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7059890031814575, + "num_tokens": 214790430.0, + "step": 8302 + }, + { + "epoch": 0.91181638480123, + "grad_norm": 1.706638216972351, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.738172173500061, + "num_tokens": 214814402.0, + "step": 8303 + }, + { + "epoch": 0.9119262025038436, + "grad_norm": 1.7142200469970703, + "learning_rate": 5e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7049673199653625, + "num_tokens": 214841472.0, + "step": 8304 + }, + { + "epoch": 0.9120360202064572, + "grad_norm": 1.7960304021835327, + "learning_rate": 5e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6927438378334045, + "num_tokens": 214867931.0, + "step": 8305 + }, + { + "epoch": 0.9121458379090709, + "grad_norm": 1.7563984394073486, + "learning_rate": 5e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6961237788200378, + "num_tokens": 214893570.0, + "step": 8306 + }, + { + "epoch": 0.9122556556116846, + "grad_norm": 1.9141367673873901, + "learning_rate": 5e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.7008150219917297, + "num_tokens": 214917607.0, + "step": 8307 + }, + { + "epoch": 0.9123654733142983, + "grad_norm": 1.6759469509124756, + "learning_rate": 5e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6949595808982849, + "num_tokens": 214946302.0, + "step": 8308 + }, + { + "epoch": 0.9124752910169119, + "grad_norm": 1.6257495880126953, + "learning_rate": 5e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6945687532424927, + "num_tokens": 214977184.0, + "step": 8309 + }, + { + "epoch": 0.9125851087195256, + "grad_norm": 1.7485803365707397, + "learning_rate": 5e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7109643220901489, + "num_tokens": 215002943.0, + "step": 8310 + }, + { + "epoch": 0.9126949264221392, + "grad_norm": 1.798999547958374, + "learning_rate": 5e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7212320566177368, + "num_tokens": 215027092.0, + "step": 8311 + }, + { + "epoch": 0.9128047441247529, + "grad_norm": 1.8544138669967651, + "learning_rate": 5e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7232743501663208, + "num_tokens": 215050150.0, + "step": 8312 + }, + { + "epoch": 0.9129145618273665, + "grad_norm": 1.7126920223236084, + "learning_rate": 5e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7016193270683289, + "num_tokens": 215078453.0, + "step": 8313 + }, + { + "epoch": 0.9130243795299803, + "grad_norm": 1.7677721977233887, + "learning_rate": 5e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6894431114196777, + "num_tokens": 215107570.0, + "step": 8314 + }, + { + "epoch": 0.9131341972325939, + "grad_norm": 2.0921411514282227, + "learning_rate": 5e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7265574932098389, + "num_tokens": 215125812.0, + "step": 8315 + }, + { + "epoch": 0.9132440149352076, + "grad_norm": 1.761319637298584, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7254980802536011, + "num_tokens": 215150700.0, + "step": 8316 + }, + { + "epoch": 0.9133538326378212, + "grad_norm": 1.7882049083709717, + "learning_rate": 5e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6903800964355469, + "num_tokens": 215177241.0, + "step": 8317 + }, + { + "epoch": 0.9134636503404349, + "grad_norm": 1.5988365411758423, + "learning_rate": 5e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6958292126655579, + "num_tokens": 215209029.0, + "step": 8318 + }, + { + "epoch": 0.9135734680430485, + "grad_norm": 1.5598419904708862, + "learning_rate": 5e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.6994256377220154, + "num_tokens": 215240711.0, + "step": 8319 + }, + { + "epoch": 0.9136832857456622, + "grad_norm": 1.867437720298767, + "learning_rate": 5e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7260158061981201, + "num_tokens": 215261046.0, + "step": 8320 + }, + { + "epoch": 0.9137931034482759, + "grad_norm": 1.793953776359558, + "learning_rate": 5e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7296781539916992, + "num_tokens": 215283269.0, + "step": 8321 + }, + { + "epoch": 0.9139029211508896, + "grad_norm": 1.7700697183609009, + "learning_rate": 5e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6887781620025635, + "num_tokens": 215308933.0, + "step": 8322 + }, + { + "epoch": 0.9140127388535032, + "grad_norm": 1.9367014169692993, + "learning_rate": 5e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7238085865974426, + "num_tokens": 215326991.0, + "step": 8323 + }, + { + "epoch": 0.9141225565561168, + "grad_norm": 1.8759268522262573, + "learning_rate": 5e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7141684293746948, + "num_tokens": 215350136.0, + "step": 8324 + }, + { + "epoch": 0.9142323742587305, + "grad_norm": 1.8028424978256226, + "learning_rate": 5e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7127373814582825, + "num_tokens": 215374184.0, + "step": 8325 + }, + { + "epoch": 0.9143421919613441, + "grad_norm": 1.919153094291687, + "learning_rate": 5e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7354589700698853, + "num_tokens": 215394248.0, + "step": 8326 + }, + { + "epoch": 0.9144520096639578, + "grad_norm": 1.7798832654953003, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7170511484146118, + "num_tokens": 215418326.0, + "step": 8327 + }, + { + "epoch": 0.9145618273665715, + "grad_norm": 1.709519863128662, + "learning_rate": 5e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7121331691741943, + "num_tokens": 215445944.0, + "step": 8328 + }, + { + "epoch": 0.9146716450691852, + "grad_norm": 1.7888356447219849, + "learning_rate": 5e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7127363085746765, + "num_tokens": 215470506.0, + "step": 8329 + }, + { + "epoch": 0.9147814627717988, + "grad_norm": 1.6584348678588867, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7244517207145691, + "num_tokens": 215499991.0, + "step": 8330 + }, + { + "epoch": 0.9148912804744125, + "grad_norm": 1.8800219297409058, + "learning_rate": 5e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.6981285810470581, + "num_tokens": 215523969.0, + "step": 8331 + }, + { + "epoch": 0.9150010981770261, + "grad_norm": 1.9749701023101807, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7326158285140991, + "num_tokens": 215543825.0, + "step": 8332 + }, + { + "epoch": 0.9151109158796398, + "grad_norm": 1.6598289012908936, + "learning_rate": 5e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7000553607940674, + "num_tokens": 215574939.0, + "step": 8333 + }, + { + "epoch": 0.9152207335822534, + "grad_norm": 1.6278162002563477, + "learning_rate": 5e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.695457935333252, + "num_tokens": 215606217.0, + "step": 8334 + }, + { + "epoch": 0.9153305512848671, + "grad_norm": 1.5800776481628418, + "learning_rate": 5e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6913532614707947, + "num_tokens": 215639224.0, + "step": 8335 + }, + { + "epoch": 0.9154403689874808, + "grad_norm": 1.7297929525375366, + "learning_rate": 5e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7108680605888367, + "num_tokens": 215664423.0, + "step": 8336 + }, + { + "epoch": 0.9155501866900945, + "grad_norm": 2.074117422103882, + "learning_rate": 5e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7215033769607544, + "num_tokens": 215684774.0, + "step": 8337 + }, + { + "epoch": 0.9156600043927081, + "grad_norm": 1.8478959798812866, + "learning_rate": 5e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7082430124282837, + "num_tokens": 215708009.0, + "step": 8338 + }, + { + "epoch": 0.9157698220953218, + "grad_norm": 1.717211127281189, + "learning_rate": 5e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.714571475982666, + "num_tokens": 215735621.0, + "step": 8339 + }, + { + "epoch": 0.9158796397979354, + "grad_norm": 1.9422438144683838, + "learning_rate": 5e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.696729838848114, + "num_tokens": 215757432.0, + "step": 8340 + }, + { + "epoch": 0.915989457500549, + "grad_norm": 1.6470404863357544, + "learning_rate": 5e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6921005249023438, + "num_tokens": 215790195.0, + "step": 8341 + }, + { + "epoch": 0.9160992752031627, + "grad_norm": 1.8176740407943726, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7167086005210876, + "num_tokens": 215815537.0, + "step": 8342 + }, + { + "epoch": 0.9162090929057765, + "grad_norm": 1.6165050268173218, + "learning_rate": 5e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7180713415145874, + "num_tokens": 215845925.0, + "step": 8343 + }, + { + "epoch": 0.9163189106083901, + "grad_norm": 1.7779160737991333, + "learning_rate": 5e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.712626576423645, + "num_tokens": 215870966.0, + "step": 8344 + }, + { + "epoch": 0.9164287283110037, + "grad_norm": 1.911941647529602, + "learning_rate": 5e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7024631500244141, + "num_tokens": 215895270.0, + "step": 8345 + }, + { + "epoch": 0.9165385460136174, + "grad_norm": 1.6505534648895264, + "learning_rate": 5e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7095486521720886, + "num_tokens": 215922817.0, + "step": 8346 + }, + { + "epoch": 0.916648363716231, + "grad_norm": 1.8340939283370972, + "learning_rate": 5e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7233240008354187, + "num_tokens": 215946593.0, + "step": 8347 + }, + { + "epoch": 0.9167581814188447, + "grad_norm": 1.7750747203826904, + "learning_rate": 5e-06, + "loss": 0.988, + "mean_token_accuracy": 0.6997768878936768, + "num_tokens": 215973421.0, + "step": 8348 + }, + { + "epoch": 0.9168679991214583, + "grad_norm": 2.077510118484497, + "learning_rate": 5e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7230468392372131, + "num_tokens": 215991493.0, + "step": 8349 + }, + { + "epoch": 0.9169778168240721, + "grad_norm": 1.7772449254989624, + "learning_rate": 5e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.722318172454834, + "num_tokens": 216015838.0, + "step": 8350 + }, + { + "epoch": 0.9170876345266857, + "grad_norm": 1.671936273574829, + "learning_rate": 5e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7056671380996704, + "num_tokens": 216045089.0, + "step": 8351 + }, + { + "epoch": 0.9171974522292994, + "grad_norm": 1.7016865015029907, + "learning_rate": 5e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7204422354698181, + "num_tokens": 216072733.0, + "step": 8352 + }, + { + "epoch": 0.917307269931913, + "grad_norm": 1.8364449739456177, + "learning_rate": 5e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7135499715805054, + "num_tokens": 216095455.0, + "step": 8353 + }, + { + "epoch": 0.9174170876345267, + "grad_norm": 1.844112753868103, + "learning_rate": 5e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6812232732772827, + "num_tokens": 216119928.0, + "step": 8354 + }, + { + "epoch": 0.9175269053371403, + "grad_norm": 1.8293272256851196, + "learning_rate": 5e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7069471478462219, + "num_tokens": 216143518.0, + "step": 8355 + }, + { + "epoch": 0.917636723039754, + "grad_norm": 1.6743286848068237, + "learning_rate": 5e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.6969975233078003, + "num_tokens": 216173286.0, + "step": 8356 + }, + { + "epoch": 0.9177465407423677, + "grad_norm": 1.8633978366851807, + "learning_rate": 5e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6979660391807556, + "num_tokens": 216197584.0, + "step": 8357 + }, + { + "epoch": 0.9178563584449814, + "grad_norm": 1.7577557563781738, + "learning_rate": 5e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7227569222450256, + "num_tokens": 216224278.0, + "step": 8358 + }, + { + "epoch": 0.917966176147595, + "grad_norm": 1.587025761604309, + "learning_rate": 5e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7153655290603638, + "num_tokens": 216253415.0, + "step": 8359 + }, + { + "epoch": 0.9180759938502087, + "grad_norm": 1.6061557531356812, + "learning_rate": 5e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7161227464675903, + "num_tokens": 216282307.0, + "step": 8360 + }, + { + "epoch": 0.9181858115528223, + "grad_norm": 1.7401093244552612, + "learning_rate": 5e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7023414969444275, + "num_tokens": 216307201.0, + "step": 8361 + }, + { + "epoch": 0.918295629255436, + "grad_norm": 1.6066759824752808, + "learning_rate": 5e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7108411192893982, + "num_tokens": 216335467.0, + "step": 8362 + }, + { + "epoch": 0.9184054469580496, + "grad_norm": 1.827956199645996, + "learning_rate": 5e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7200741767883301, + "num_tokens": 216358769.0, + "step": 8363 + }, + { + "epoch": 0.9185152646606632, + "grad_norm": 1.7574824094772339, + "learning_rate": 5e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6983845829963684, + "num_tokens": 216387293.0, + "step": 8364 + }, + { + "epoch": 0.918625082363277, + "grad_norm": 1.8649085760116577, + "learning_rate": 5e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7135521769523621, + "num_tokens": 216410194.0, + "step": 8365 + }, + { + "epoch": 0.9187349000658906, + "grad_norm": 1.655189871788025, + "learning_rate": 5e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7249796390533447, + "num_tokens": 216437428.0, + "step": 8366 + }, + { + "epoch": 0.9188447177685043, + "grad_norm": 1.6724231243133545, + "learning_rate": 5e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7147583961486816, + "num_tokens": 216464203.0, + "step": 8367 + }, + { + "epoch": 0.9189545354711179, + "grad_norm": 1.6617662906646729, + "learning_rate": 5e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7060540914535522, + "num_tokens": 216490157.0, + "step": 8368 + }, + { + "epoch": 0.9190643531737316, + "grad_norm": 1.6298925876617432, + "learning_rate": 5e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6956691741943359, + "num_tokens": 216522651.0, + "step": 8369 + }, + { + "epoch": 0.9191741708763452, + "grad_norm": 1.7640323638916016, + "learning_rate": 5e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6892303228378296, + "num_tokens": 216547249.0, + "step": 8370 + }, + { + "epoch": 0.9192839885789589, + "grad_norm": 2.0333054065704346, + "learning_rate": 5e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7147300243377686, + "num_tokens": 216567434.0, + "step": 8371 + }, + { + "epoch": 0.9193938062815726, + "grad_norm": 1.6880520582199097, + "learning_rate": 5e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7013226747512817, + "num_tokens": 216595980.0, + "step": 8372 + }, + { + "epoch": 0.9195036239841863, + "grad_norm": 1.5454726219177246, + "learning_rate": 5e-06, + "loss": 0.947, + "mean_token_accuracy": 0.708166241645813, + "num_tokens": 216630416.0, + "step": 8373 + }, + { + "epoch": 0.9196134416867999, + "grad_norm": 1.771730661392212, + "learning_rate": 5e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.6983778476715088, + "num_tokens": 216655551.0, + "step": 8374 + }, + { + "epoch": 0.9197232593894136, + "grad_norm": 1.839721918106079, + "learning_rate": 5e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.694450855255127, + "num_tokens": 216679991.0, + "step": 8375 + }, + { + "epoch": 0.9198330770920272, + "grad_norm": 1.6598572731018066, + "learning_rate": 5e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.711732029914856, + "num_tokens": 216708017.0, + "step": 8376 + }, + { + "epoch": 0.9199428947946409, + "grad_norm": 1.6427326202392578, + "learning_rate": 5e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7189788222312927, + "num_tokens": 216734151.0, + "step": 8377 + }, + { + "epoch": 0.9200527124972545, + "grad_norm": 1.7911157608032227, + "learning_rate": 5e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7110693454742432, + "num_tokens": 216758526.0, + "step": 8378 + }, + { + "epoch": 0.9201625301998683, + "grad_norm": 1.7111765146255493, + "learning_rate": 5e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.707693338394165, + "num_tokens": 216788821.0, + "step": 8379 + }, + { + "epoch": 0.9202723479024819, + "grad_norm": 1.5444165468215942, + "learning_rate": 5e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.708892822265625, + "num_tokens": 216821460.0, + "step": 8380 + }, + { + "epoch": 0.9203821656050956, + "grad_norm": 1.5633399486541748, + "learning_rate": 5e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7067931890487671, + "num_tokens": 216854778.0, + "step": 8381 + }, + { + "epoch": 0.9204919833077092, + "grad_norm": 1.8147755861282349, + "learning_rate": 5e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6887316703796387, + "num_tokens": 216880203.0, + "step": 8382 + }, + { + "epoch": 0.9206018010103229, + "grad_norm": 1.6281335353851318, + "learning_rate": 5e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6898213624954224, + "num_tokens": 216911383.0, + "step": 8383 + }, + { + "epoch": 0.9207116187129365, + "grad_norm": 1.8328039646148682, + "learning_rate": 5e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7066709399223328, + "num_tokens": 216933714.0, + "step": 8384 + }, + { + "epoch": 0.9208214364155501, + "grad_norm": 1.6253488063812256, + "learning_rate": 5e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7002761363983154, + "num_tokens": 216960812.0, + "step": 8385 + }, + { + "epoch": 0.9209312541181639, + "grad_norm": 1.668264627456665, + "learning_rate": 5e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7144898176193237, + "num_tokens": 216990334.0, + "step": 8386 + }, + { + "epoch": 0.9210410718207775, + "grad_norm": 1.6119040250778198, + "learning_rate": 5e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7012069225311279, + "num_tokens": 217020076.0, + "step": 8387 + }, + { + "epoch": 0.9211508895233912, + "grad_norm": 1.6285731792449951, + "learning_rate": 5e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.694164514541626, + "num_tokens": 217049474.0, + "step": 8388 + }, + { + "epoch": 0.9212607072260048, + "grad_norm": 1.702900767326355, + "learning_rate": 5e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6866699457168579, + "num_tokens": 217076817.0, + "step": 8389 + }, + { + "epoch": 0.9213705249286185, + "grad_norm": 1.7155687808990479, + "learning_rate": 5e-06, + "loss": 0.947, + "mean_token_accuracy": 0.717499852180481, + "num_tokens": 217106171.0, + "step": 8390 + }, + { + "epoch": 0.9214803426312321, + "grad_norm": 1.6225041151046753, + "learning_rate": 5e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7052265405654907, + "num_tokens": 217135066.0, + "step": 8391 + }, + { + "epoch": 0.9215901603338458, + "grad_norm": 1.9382940530776978, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7315763235092163, + "num_tokens": 217155524.0, + "step": 8392 + }, + { + "epoch": 0.9216999780364594, + "grad_norm": 1.7095873355865479, + "learning_rate": 5e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7076122760772705, + "num_tokens": 217184618.0, + "step": 8393 + }, + { + "epoch": 0.9218097957390732, + "grad_norm": 1.8355708122253418, + "learning_rate": 5e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7199712991714478, + "num_tokens": 217205280.0, + "step": 8394 + }, + { + "epoch": 0.9219196134416868, + "grad_norm": 1.798102617263794, + "learning_rate": 5e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6947652101516724, + "num_tokens": 217230661.0, + "step": 8395 + }, + { + "epoch": 0.9220294311443005, + "grad_norm": 1.6705230474472046, + "learning_rate": 5e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7035017013549805, + "num_tokens": 217258165.0, + "step": 8396 + }, + { + "epoch": 0.9221392488469141, + "grad_norm": 1.6424411535263062, + "learning_rate": 5e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7030242681503296, + "num_tokens": 217289023.0, + "step": 8397 + }, + { + "epoch": 0.9222490665495278, + "grad_norm": 1.8327151536941528, + "learning_rate": 5e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7268533110618591, + "num_tokens": 217312584.0, + "step": 8398 + }, + { + "epoch": 0.9223588842521414, + "grad_norm": 1.78568434715271, + "learning_rate": 5e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7069469690322876, + "num_tokens": 217336843.0, + "step": 8399 + }, + { + "epoch": 0.922468701954755, + "grad_norm": 1.911036729812622, + "learning_rate": 5e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7052125930786133, + "num_tokens": 217360826.0, + "step": 8400 + }, + { + "epoch": 0.9225785196573688, + "grad_norm": 1.7933170795440674, + "learning_rate": 5e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7055352926254272, + "num_tokens": 217384978.0, + "step": 8401 + }, + { + "epoch": 0.9226883373599825, + "grad_norm": 1.7685037851333618, + "learning_rate": 5e-06, + "loss": 0.977, + "mean_token_accuracy": 0.6975441575050354, + "num_tokens": 217410415.0, + "step": 8402 + }, + { + "epoch": 0.9227981550625961, + "grad_norm": 1.79388427734375, + "learning_rate": 5e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7295251488685608, + "num_tokens": 217434104.0, + "step": 8403 + }, + { + "epoch": 0.9229079727652097, + "grad_norm": 1.8734747171401978, + "learning_rate": 5e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7247183918952942, + "num_tokens": 217458425.0, + "step": 8404 + }, + { + "epoch": 0.9230177904678234, + "grad_norm": 1.6604870557785034, + "learning_rate": 5e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6917507648468018, + "num_tokens": 217489218.0, + "step": 8405 + }, + { + "epoch": 0.923127608170437, + "grad_norm": 1.8416907787322998, + "learning_rate": 5e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.704855740070343, + "num_tokens": 217513414.0, + "step": 8406 + }, + { + "epoch": 0.9232374258730507, + "grad_norm": 2.024829387664795, + "learning_rate": 5e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6886471509933472, + "num_tokens": 217537452.0, + "step": 8407 + }, + { + "epoch": 0.9233472435756644, + "grad_norm": 1.7482496500015259, + "learning_rate": 5e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7086318731307983, + "num_tokens": 217565099.0, + "step": 8408 + }, + { + "epoch": 0.9234570612782781, + "grad_norm": 1.758976697921753, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7183541059494019, + "num_tokens": 217589426.0, + "step": 8409 + }, + { + "epoch": 0.9235668789808917, + "grad_norm": 1.7544054985046387, + "learning_rate": 5e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7025830745697021, + "num_tokens": 217615548.0, + "step": 8410 + }, + { + "epoch": 0.9236766966835054, + "grad_norm": 1.785079836845398, + "learning_rate": 5e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6934852600097656, + "num_tokens": 217640435.0, + "step": 8411 + }, + { + "epoch": 0.923786514386119, + "grad_norm": 1.9464561939239502, + "learning_rate": 5e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7280375957489014, + "num_tokens": 217659839.0, + "step": 8412 + }, + { + "epoch": 0.9238963320887327, + "grad_norm": 1.7490787506103516, + "learning_rate": 5e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6933521032333374, + "num_tokens": 217686783.0, + "step": 8413 + }, + { + "epoch": 0.9240061497913463, + "grad_norm": 1.6440693140029907, + "learning_rate": 5e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7261805534362793, + "num_tokens": 217714249.0, + "step": 8414 + }, + { + "epoch": 0.9241159674939601, + "grad_norm": 1.8927358388900757, + "learning_rate": 5e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7130392789840698, + "num_tokens": 217738393.0, + "step": 8415 + }, + { + "epoch": 0.9242257851965737, + "grad_norm": 1.5875627994537354, + "learning_rate": 5e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7078652381896973, + "num_tokens": 217767964.0, + "step": 8416 + }, + { + "epoch": 0.9243356028991874, + "grad_norm": 1.8979160785675049, + "learning_rate": 5e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7025983333587646, + "num_tokens": 217792021.0, + "step": 8417 + }, + { + "epoch": 0.924445420601801, + "grad_norm": 1.6293705701828003, + "learning_rate": 5e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7169684767723083, + "num_tokens": 217822977.0, + "step": 8418 + }, + { + "epoch": 0.9245552383044147, + "grad_norm": 1.607214331626892, + "learning_rate": 5e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.6976916790008545, + "num_tokens": 217852099.0, + "step": 8419 + }, + { + "epoch": 0.9246650560070283, + "grad_norm": 2.319855213165283, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7290787100791931, + "num_tokens": 217870817.0, + "step": 8420 + }, + { + "epoch": 0.924774873709642, + "grad_norm": 2.012371778488159, + "learning_rate": 5e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6860199570655823, + "num_tokens": 217892975.0, + "step": 8421 + }, + { + "epoch": 0.9248846914122557, + "grad_norm": 1.9783705472946167, + "learning_rate": 5e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7324252724647522, + "num_tokens": 217913835.0, + "step": 8422 + }, + { + "epoch": 0.9249945091148694, + "grad_norm": 1.7575256824493408, + "learning_rate": 5e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7382943630218506, + "num_tokens": 217938915.0, + "step": 8423 + }, + { + "epoch": 0.925104326817483, + "grad_norm": 1.5480657815933228, + "learning_rate": 5e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.715430498123169, + "num_tokens": 217970763.0, + "step": 8424 + }, + { + "epoch": 0.9252141445200966, + "grad_norm": 1.733459234237671, + "learning_rate": 5e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6909348368644714, + "num_tokens": 217999379.0, + "step": 8425 + }, + { + "epoch": 0.9253239622227103, + "grad_norm": 1.8541405200958252, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7277858257293701, + "num_tokens": 218022902.0, + "step": 8426 + }, + { + "epoch": 0.9254337799253239, + "grad_norm": 1.6960002183914185, + "learning_rate": 5e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6918457746505737, + "num_tokens": 218053212.0, + "step": 8427 + }, + { + "epoch": 0.9255435976279376, + "grad_norm": 1.8634878396987915, + "learning_rate": 5e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6983596086502075, + "num_tokens": 218078132.0, + "step": 8428 + }, + { + "epoch": 0.9256534153305512, + "grad_norm": 1.6286931037902832, + "learning_rate": 5e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7026282548904419, + "num_tokens": 218106787.0, + "step": 8429 + }, + { + "epoch": 0.925763233033165, + "grad_norm": 1.6258091926574707, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.722221851348877, + "num_tokens": 218138312.0, + "step": 8430 + }, + { + "epoch": 0.9258730507357786, + "grad_norm": 1.9783707857131958, + "learning_rate": 5e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7408334016799927, + "num_tokens": 218157174.0, + "step": 8431 + }, + { + "epoch": 0.9259828684383923, + "grad_norm": 1.7707735300064087, + "learning_rate": 5e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7100712656974792, + "num_tokens": 218182502.0, + "step": 8432 + }, + { + "epoch": 0.9260926861410059, + "grad_norm": 1.8170944452285767, + "learning_rate": 5e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7068789601325989, + "num_tokens": 218207639.0, + "step": 8433 + }, + { + "epoch": 0.9262025038436196, + "grad_norm": 1.657975196838379, + "learning_rate": 5e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7195737361907959, + "num_tokens": 218234713.0, + "step": 8434 + }, + { + "epoch": 0.9263123215462332, + "grad_norm": 1.8065224885940552, + "learning_rate": 5e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7297601699829102, + "num_tokens": 218258279.0, + "step": 8435 + }, + { + "epoch": 0.9264221392488469, + "grad_norm": 1.9013373851776123, + "learning_rate": 5e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6907424330711365, + "num_tokens": 218283339.0, + "step": 8436 + }, + { + "epoch": 0.9265319569514606, + "grad_norm": 1.7113629579544067, + "learning_rate": 5e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.6994179487228394, + "num_tokens": 218310960.0, + "step": 8437 + }, + { + "epoch": 0.9266417746540743, + "grad_norm": 1.6310065984725952, + "learning_rate": 5e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7269542813301086, + "num_tokens": 218340260.0, + "step": 8438 + }, + { + "epoch": 0.9267515923566879, + "grad_norm": 1.9300318956375122, + "learning_rate": 5e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7460142374038696, + "num_tokens": 218359441.0, + "step": 8439 + }, + { + "epoch": 0.9268614100593016, + "grad_norm": 1.8029495477676392, + "learning_rate": 5e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7160094380378723, + "num_tokens": 218382829.0, + "step": 8440 + }, + { + "epoch": 0.9269712277619152, + "grad_norm": 1.8858579397201538, + "learning_rate": 5e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7087326049804688, + "num_tokens": 218406091.0, + "step": 8441 + }, + { + "epoch": 0.9270810454645289, + "grad_norm": 1.5824882984161377, + "learning_rate": 5e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7010878324508667, + "num_tokens": 218435991.0, + "step": 8442 + }, + { + "epoch": 0.9271908631671425, + "grad_norm": 1.6915541887283325, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7205748558044434, + "num_tokens": 218463144.0, + "step": 8443 + }, + { + "epoch": 0.9273006808697563, + "grad_norm": 1.887481927871704, + "learning_rate": 5e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.699824333190918, + "num_tokens": 218485835.0, + "step": 8444 + }, + { + "epoch": 0.9274104985723699, + "grad_norm": 1.8197513818740845, + "learning_rate": 5e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7142312526702881, + "num_tokens": 218511074.0, + "step": 8445 + }, + { + "epoch": 0.9275203162749835, + "grad_norm": 1.509965419769287, + "learning_rate": 5e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6978633403778076, + "num_tokens": 218544727.0, + "step": 8446 + }, + { + "epoch": 0.9276301339775972, + "grad_norm": 1.8328648805618286, + "learning_rate": 5e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6860551238059998, + "num_tokens": 218570082.0, + "step": 8447 + }, + { + "epoch": 0.9277399516802108, + "grad_norm": 1.5978831052780151, + "learning_rate": 5e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.697461724281311, + "num_tokens": 218599164.0, + "step": 8448 + }, + { + "epoch": 0.9278497693828245, + "grad_norm": 1.5378973484039307, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7279676795005798, + "num_tokens": 218631627.0, + "step": 8449 + }, + { + "epoch": 0.9279595870854381, + "grad_norm": 1.6933244466781616, + "learning_rate": 5e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7267104387283325, + "num_tokens": 218657819.0, + "step": 8450 + }, + { + "epoch": 0.9280694047880519, + "grad_norm": 1.614575982093811, + "learning_rate": 5e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6858402490615845, + "num_tokens": 218690576.0, + "step": 8451 + }, + { + "epoch": 0.9281792224906655, + "grad_norm": 1.740397572517395, + "learning_rate": 5e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6979355812072754, + "num_tokens": 218714744.0, + "step": 8452 + }, + { + "epoch": 0.9282890401932792, + "grad_norm": 1.7848098278045654, + "learning_rate": 5e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7217427492141724, + "num_tokens": 218736747.0, + "step": 8453 + }, + { + "epoch": 0.9283988578958928, + "grad_norm": 1.8115568161010742, + "learning_rate": 5e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7421449422836304, + "num_tokens": 218759674.0, + "step": 8454 + }, + { + "epoch": 0.9285086755985065, + "grad_norm": 1.757947325706482, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7214692831039429, + "num_tokens": 218783969.0, + "step": 8455 + }, + { + "epoch": 0.9286184933011201, + "grad_norm": 1.6894190311431885, + "learning_rate": 5e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.708849310874939, + "num_tokens": 218812955.0, + "step": 8456 + }, + { + "epoch": 0.9287283110037338, + "grad_norm": 1.8377234935760498, + "learning_rate": 5e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7069355249404907, + "num_tokens": 218838635.0, + "step": 8457 + }, + { + "epoch": 0.9288381287063474, + "grad_norm": 1.5720210075378418, + "learning_rate": 5e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7203922867774963, + "num_tokens": 218869449.0, + "step": 8458 + }, + { + "epoch": 0.9289479464089612, + "grad_norm": 1.5712887048721313, + "learning_rate": 5e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7075831294059753, + "num_tokens": 218899262.0, + "step": 8459 + }, + { + "epoch": 0.9290577641115748, + "grad_norm": 1.6585373878479004, + "learning_rate": 5e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6875044107437134, + "num_tokens": 218926608.0, + "step": 8460 + }, + { + "epoch": 0.9291675818141885, + "grad_norm": 1.5015276670455933, + "learning_rate": 5e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6780679225921631, + "num_tokens": 218964627.0, + "step": 8461 + }, + { + "epoch": 0.9292773995168021, + "grad_norm": 1.7431013584136963, + "learning_rate": 5e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7237424850463867, + "num_tokens": 218989694.0, + "step": 8462 + }, + { + "epoch": 0.9293872172194158, + "grad_norm": 1.743600845336914, + "learning_rate": 5e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7240511178970337, + "num_tokens": 219015067.0, + "step": 8463 + }, + { + "epoch": 0.9294970349220294, + "grad_norm": 1.8909351825714111, + "learning_rate": 5e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7202274799346924, + "num_tokens": 219038853.0, + "step": 8464 + }, + { + "epoch": 0.929606852624643, + "grad_norm": 1.5659123659133911, + "learning_rate": 5e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7097350358963013, + "num_tokens": 219067383.0, + "step": 8465 + }, + { + "epoch": 0.9297166703272568, + "grad_norm": 1.8396356105804443, + "learning_rate": 5e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7216585874557495, + "num_tokens": 219089826.0, + "step": 8466 + }, + { + "epoch": 0.9298264880298704, + "grad_norm": 1.7100675106048584, + "learning_rate": 5e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.710158109664917, + "num_tokens": 219117513.0, + "step": 8467 + }, + { + "epoch": 0.9299363057324841, + "grad_norm": 1.7275302410125732, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.721587598323822, + "num_tokens": 219140977.0, + "step": 8468 + }, + { + "epoch": 0.9300461234350977, + "grad_norm": 1.9334620237350464, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7197144031524658, + "num_tokens": 219161385.0, + "step": 8469 + }, + { + "epoch": 0.9301559411377114, + "grad_norm": 1.7866796255111694, + "learning_rate": 5e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7315405011177063, + "num_tokens": 219184282.0, + "step": 8470 + }, + { + "epoch": 0.930265758840325, + "grad_norm": 1.9443445205688477, + "learning_rate": 5e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7062641382217407, + "num_tokens": 219204802.0, + "step": 8471 + }, + { + "epoch": 0.9303755765429387, + "grad_norm": 1.7829947471618652, + "learning_rate": 5e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.716850221157074, + "num_tokens": 219230636.0, + "step": 8472 + }, + { + "epoch": 0.9304853942455524, + "grad_norm": 1.5816537141799927, + "learning_rate": 5e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.701334536075592, + "num_tokens": 219263394.0, + "step": 8473 + }, + { + "epoch": 0.9305952119481661, + "grad_norm": 1.8138986825942993, + "learning_rate": 5e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7116391658782959, + "num_tokens": 219289046.0, + "step": 8474 + }, + { + "epoch": 0.9307050296507797, + "grad_norm": 1.8205891847610474, + "learning_rate": 5e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7034046649932861, + "num_tokens": 219313569.0, + "step": 8475 + }, + { + "epoch": 0.9308148473533934, + "grad_norm": 1.6265910863876343, + "learning_rate": 5e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7051945924758911, + "num_tokens": 219343912.0, + "step": 8476 + }, + { + "epoch": 0.930924665056007, + "grad_norm": 1.705621600151062, + "learning_rate": 5e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.715803325176239, + "num_tokens": 219369525.0, + "step": 8477 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 1.6397273540496826, + "learning_rate": 5e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7037016153335571, + "num_tokens": 219397702.0, + "step": 8478 + }, + { + "epoch": 0.9311443004612343, + "grad_norm": 1.6206587553024292, + "learning_rate": 5e-06, + "loss": 1.084, + "mean_token_accuracy": 0.674206554889679, + "num_tokens": 219431213.0, + "step": 8479 + }, + { + "epoch": 0.9312541181638481, + "grad_norm": 1.794143557548523, + "learning_rate": 5e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7202529907226562, + "num_tokens": 219455763.0, + "step": 8480 + }, + { + "epoch": 0.9313639358664617, + "grad_norm": 1.7722722291946411, + "learning_rate": 5e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.6968883872032166, + "num_tokens": 219481125.0, + "step": 8481 + }, + { + "epoch": 0.9314737535690754, + "grad_norm": 1.7739512920379639, + "learning_rate": 5e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6962999105453491, + "num_tokens": 219507754.0, + "step": 8482 + }, + { + "epoch": 0.931583571271689, + "grad_norm": 1.673525333404541, + "learning_rate": 5e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7131058573722839, + "num_tokens": 219535221.0, + "step": 8483 + }, + { + "epoch": 0.9316933889743026, + "grad_norm": 1.6259167194366455, + "learning_rate": 5e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7155953645706177, + "num_tokens": 219564952.0, + "step": 8484 + }, + { + "epoch": 0.9318032066769163, + "grad_norm": 1.6756387948989868, + "learning_rate": 5e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.711728036403656, + "num_tokens": 219591003.0, + "step": 8485 + }, + { + "epoch": 0.9319130243795299, + "grad_norm": 1.7178884744644165, + "learning_rate": 5e-06, + "loss": 0.874, + "mean_token_accuracy": 0.729548454284668, + "num_tokens": 219617481.0, + "step": 8486 + }, + { + "epoch": 0.9320228420821436, + "grad_norm": 1.7030755281448364, + "learning_rate": 5e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6844989061355591, + "num_tokens": 219645823.0, + "step": 8487 + }, + { + "epoch": 0.9321326597847573, + "grad_norm": 1.7979388236999512, + "learning_rate": 5e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6961127519607544, + "num_tokens": 219672654.0, + "step": 8488 + }, + { + "epoch": 0.932242477487371, + "grad_norm": 1.8496392965316772, + "learning_rate": 5e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7208694219589233, + "num_tokens": 219696278.0, + "step": 8489 + }, + { + "epoch": 0.9323522951899846, + "grad_norm": 2.0001065731048584, + "learning_rate": 5e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6964074373245239, + "num_tokens": 219716050.0, + "step": 8490 + }, + { + "epoch": 0.9324621128925983, + "grad_norm": 2.055835008621216, + "learning_rate": 5e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7137168645858765, + "num_tokens": 219735856.0, + "step": 8491 + }, + { + "epoch": 0.9325719305952119, + "grad_norm": 1.7568132877349854, + "learning_rate": 5e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7001328468322754, + "num_tokens": 219761789.0, + "step": 8492 + }, + { + "epoch": 0.9326817482978256, + "grad_norm": 1.9040676355361938, + "learning_rate": 5e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6820888519287109, + "num_tokens": 219787363.0, + "step": 8493 + }, + { + "epoch": 0.9327915660004392, + "grad_norm": 2.2827093601226807, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7292579412460327, + "num_tokens": 219804384.0, + "step": 8494 + }, + { + "epoch": 0.932901383703053, + "grad_norm": 1.8729137182235718, + "learning_rate": 5e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6877400875091553, + "num_tokens": 219829783.0, + "step": 8495 + }, + { + "epoch": 0.9330112014056666, + "grad_norm": 1.6981196403503418, + "learning_rate": 5e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7273684740066528, + "num_tokens": 219857096.0, + "step": 8496 + }, + { + "epoch": 0.9331210191082803, + "grad_norm": 1.8875172138214111, + "learning_rate": 5e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7166093587875366, + "num_tokens": 219881195.0, + "step": 8497 + }, + { + "epoch": 0.9332308368108939, + "grad_norm": 1.8114334344863892, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7030402421951294, + "num_tokens": 219907947.0, + "step": 8498 + }, + { + "epoch": 0.9333406545135076, + "grad_norm": 1.6995446681976318, + "learning_rate": 5e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7063214778900146, + "num_tokens": 219936432.0, + "step": 8499 + }, + { + "epoch": 0.9334504722161212, + "grad_norm": 1.7931655645370483, + "learning_rate": 5e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6949925422668457, + "num_tokens": 219960607.0, + "step": 8500 + }, + { + "epoch": 0.9335602899187349, + "grad_norm": 1.9190484285354614, + "learning_rate": 5e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7361823320388794, + "num_tokens": 219981702.0, + "step": 8501 + }, + { + "epoch": 0.9336701076213486, + "grad_norm": 1.7107130289077759, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7157348394393921, + "num_tokens": 220010127.0, + "step": 8502 + }, + { + "epoch": 0.9337799253239623, + "grad_norm": 1.7979073524475098, + "learning_rate": 5e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7183219194412231, + "num_tokens": 220033832.0, + "step": 8503 + }, + { + "epoch": 0.9338897430265759, + "grad_norm": 1.6481181383132935, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7134837508201599, + "num_tokens": 220062452.0, + "step": 8504 + }, + { + "epoch": 0.9339995607291895, + "grad_norm": 1.7033926248550415, + "learning_rate": 5e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7159966230392456, + "num_tokens": 220090771.0, + "step": 8505 + }, + { + "epoch": 0.9341093784318032, + "grad_norm": 1.74380362033844, + "learning_rate": 5e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.701585590839386, + "num_tokens": 220118996.0, + "step": 8506 + }, + { + "epoch": 0.9342191961344168, + "grad_norm": 2.0108776092529297, + "learning_rate": 5e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.733615517616272, + "num_tokens": 220139425.0, + "step": 8507 + }, + { + "epoch": 0.9343290138370305, + "grad_norm": 1.6554155349731445, + "learning_rate": 5e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6912630796432495, + "num_tokens": 220167762.0, + "step": 8508 + }, + { + "epoch": 0.9344388315396442, + "grad_norm": 1.8661359548568726, + "learning_rate": 5e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.729313850402832, + "num_tokens": 220189805.0, + "step": 8509 + }, + { + "epoch": 0.9345486492422579, + "grad_norm": 1.6884148120880127, + "learning_rate": 5e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7048351168632507, + "num_tokens": 220219425.0, + "step": 8510 + }, + { + "epoch": 0.9346584669448715, + "grad_norm": 1.9574824571609497, + "learning_rate": 5e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7092658281326294, + "num_tokens": 220241190.0, + "step": 8511 + }, + { + "epoch": 0.9347682846474852, + "grad_norm": 1.7605235576629639, + "learning_rate": 5e-06, + "loss": 0.968, + "mean_token_accuracy": 0.6977713108062744, + "num_tokens": 220269043.0, + "step": 8512 + }, + { + "epoch": 0.9348781023500988, + "grad_norm": 1.68820321559906, + "learning_rate": 5e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7120262384414673, + "num_tokens": 220295631.0, + "step": 8513 + }, + { + "epoch": 0.9349879200527125, + "grad_norm": 1.815468668937683, + "learning_rate": 5e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6853310465812683, + "num_tokens": 220320235.0, + "step": 8514 + }, + { + "epoch": 0.9350977377553261, + "grad_norm": 2.033980369567871, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7359094619750977, + "num_tokens": 220339006.0, + "step": 8515 + }, + { + "epoch": 0.9352075554579398, + "grad_norm": 1.9422816038131714, + "learning_rate": 5e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6995607614517212, + "num_tokens": 220363054.0, + "step": 8516 + }, + { + "epoch": 0.9353173731605535, + "grad_norm": 1.829667329788208, + "learning_rate": 5e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7171928286552429, + "num_tokens": 220388044.0, + "step": 8517 + }, + { + "epoch": 0.9354271908631672, + "grad_norm": 2.1061482429504395, + "learning_rate": 5e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7239967584609985, + "num_tokens": 220405255.0, + "step": 8518 + }, + { + "epoch": 0.9355370085657808, + "grad_norm": 1.6581767797470093, + "learning_rate": 5e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7042807340621948, + "num_tokens": 220434443.0, + "step": 8519 + }, + { + "epoch": 0.9356468262683945, + "grad_norm": 2.094487428665161, + "learning_rate": 5e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.732514500617981, + "num_tokens": 220452675.0, + "step": 8520 + }, + { + "epoch": 0.9357566439710081, + "grad_norm": 1.7255812883377075, + "learning_rate": 5e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6965380311012268, + "num_tokens": 220480173.0, + "step": 8521 + }, + { + "epoch": 0.9358664616736218, + "grad_norm": 1.8375643491744995, + "learning_rate": 5e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7111749053001404, + "num_tokens": 220504211.0, + "step": 8522 + }, + { + "epoch": 0.9359762793762354, + "grad_norm": 1.941503643989563, + "learning_rate": 5e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7099491357803345, + "num_tokens": 220526949.0, + "step": 8523 + }, + { + "epoch": 0.9360860970788492, + "grad_norm": 1.9504480361938477, + "learning_rate": 5e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6904585361480713, + "num_tokens": 220552253.0, + "step": 8524 + }, + { + "epoch": 0.9361959147814628, + "grad_norm": 1.8710734844207764, + "learning_rate": 5e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7136409282684326, + "num_tokens": 220575037.0, + "step": 8525 + }, + { + "epoch": 0.9363057324840764, + "grad_norm": 1.9444881677627563, + "learning_rate": 5e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6864370107650757, + "num_tokens": 220599202.0, + "step": 8526 + }, + { + "epoch": 0.9364155501866901, + "grad_norm": 1.7738385200500488, + "learning_rate": 5e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7104104161262512, + "num_tokens": 220624645.0, + "step": 8527 + }, + { + "epoch": 0.9365253678893037, + "grad_norm": 1.8596147298812866, + "learning_rate": 5e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.681883692741394, + "num_tokens": 220650778.0, + "step": 8528 + }, + { + "epoch": 0.9366351855919174, + "grad_norm": 1.7916961908340454, + "learning_rate": 5e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7032292485237122, + "num_tokens": 220676509.0, + "step": 8529 + }, + { + "epoch": 0.936745003294531, + "grad_norm": 1.9431297779083252, + "learning_rate": 5e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6950374841690063, + "num_tokens": 220699487.0, + "step": 8530 + }, + { + "epoch": 0.9368548209971448, + "grad_norm": 1.8168705701828003, + "learning_rate": 5e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7138217687606812, + "num_tokens": 220725028.0, + "step": 8531 + }, + { + "epoch": 0.9369646386997584, + "grad_norm": 1.5971770286560059, + "learning_rate": 5e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6913865804672241, + "num_tokens": 220757855.0, + "step": 8532 + }, + { + "epoch": 0.9370744564023721, + "grad_norm": 1.951627254486084, + "learning_rate": 5e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7180267572402954, + "num_tokens": 220778976.0, + "step": 8533 + }, + { + "epoch": 0.9371842741049857, + "grad_norm": 1.6339292526245117, + "learning_rate": 5e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6918246746063232, + "num_tokens": 220808969.0, + "step": 8534 + }, + { + "epoch": 0.9372940918075994, + "grad_norm": 1.891151785850525, + "learning_rate": 5e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7102341651916504, + "num_tokens": 220831001.0, + "step": 8535 + }, + { + "epoch": 0.937403909510213, + "grad_norm": 1.690282940864563, + "learning_rate": 5e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7224342823028564, + "num_tokens": 220858703.0, + "step": 8536 + }, + { + "epoch": 0.9375137272128267, + "grad_norm": 1.9081573486328125, + "learning_rate": 5e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.6976807117462158, + "num_tokens": 220880412.0, + "step": 8537 + }, + { + "epoch": 0.9376235449154404, + "grad_norm": 1.9501670598983765, + "learning_rate": 5e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7171533107757568, + "num_tokens": 220903365.0, + "step": 8538 + }, + { + "epoch": 0.9377333626180541, + "grad_norm": 1.7523236274719238, + "learning_rate": 5e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6943613290786743, + "num_tokens": 220929545.0, + "step": 8539 + }, + { + "epoch": 0.9378431803206677, + "grad_norm": 1.873108148574829, + "learning_rate": 5e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7116789817810059, + "num_tokens": 220952988.0, + "step": 8540 + }, + { + "epoch": 0.9379529980232814, + "grad_norm": 1.6959259510040283, + "learning_rate": 5e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7240880727767944, + "num_tokens": 220979276.0, + "step": 8541 + }, + { + "epoch": 0.938062815725895, + "grad_norm": 1.7446589469909668, + "learning_rate": 5e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7073740363121033, + "num_tokens": 221006968.0, + "step": 8542 + }, + { + "epoch": 0.9381726334285087, + "grad_norm": 1.8758094310760498, + "learning_rate": 5e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6907506585121155, + "num_tokens": 221032201.0, + "step": 8543 + }, + { + "epoch": 0.9382824511311223, + "grad_norm": 1.7576993703842163, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7211962342262268, + "num_tokens": 221055825.0, + "step": 8544 + }, + { + "epoch": 0.9383922688337359, + "grad_norm": 1.7715603113174438, + "learning_rate": 5e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.683617115020752, + "num_tokens": 221083411.0, + "step": 8545 + }, + { + "epoch": 0.9385020865363497, + "grad_norm": 1.7855340242385864, + "learning_rate": 5e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6940969824790955, + "num_tokens": 221109148.0, + "step": 8546 + }, + { + "epoch": 0.9386119042389633, + "grad_norm": 1.7606821060180664, + "learning_rate": 5e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7171059846878052, + "num_tokens": 221134175.0, + "step": 8547 + }, + { + "epoch": 0.938721721941577, + "grad_norm": 1.696513056755066, + "learning_rate": 5e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7144935131072998, + "num_tokens": 221162849.0, + "step": 8548 + }, + { + "epoch": 0.9388315396441906, + "grad_norm": 1.627119541168213, + "learning_rate": 5e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7307713627815247, + "num_tokens": 221189415.0, + "step": 8549 + }, + { + "epoch": 0.9389413573468043, + "grad_norm": 1.8649842739105225, + "learning_rate": 5e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6867595911026001, + "num_tokens": 221213847.0, + "step": 8550 + }, + { + "epoch": 0.9390511750494179, + "grad_norm": 1.9130102396011353, + "learning_rate": 5e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6878811120986938, + "num_tokens": 221236892.0, + "step": 8551 + }, + { + "epoch": 0.9391609927520316, + "grad_norm": 1.5733236074447632, + "learning_rate": 5e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7129582166671753, + "num_tokens": 221267294.0, + "step": 8552 + }, + { + "epoch": 0.9392708104546453, + "grad_norm": 1.9508885145187378, + "learning_rate": 5e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7251980304718018, + "num_tokens": 221286666.0, + "step": 8553 + }, + { + "epoch": 0.939380628157259, + "grad_norm": 2.009340286254883, + "learning_rate": 5e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7133702039718628, + "num_tokens": 221308034.0, + "step": 8554 + }, + { + "epoch": 0.9394904458598726, + "grad_norm": 2.1311163902282715, + "learning_rate": 5e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7201575040817261, + "num_tokens": 221327480.0, + "step": 8555 + }, + { + "epoch": 0.9396002635624863, + "grad_norm": 1.6957119703292847, + "learning_rate": 5e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.6756526231765747, + "num_tokens": 221358862.0, + "step": 8556 + }, + { + "epoch": 0.9397100812650999, + "grad_norm": 1.5285166501998901, + "learning_rate": 5e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6980675458908081, + "num_tokens": 221389946.0, + "step": 8557 + }, + { + "epoch": 0.9398198989677136, + "grad_norm": 2.0439398288726807, + "learning_rate": 5e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.6975606679916382, + "num_tokens": 221411794.0, + "step": 8558 + }, + { + "epoch": 0.9399297166703272, + "grad_norm": 1.990220069885254, + "learning_rate": 5e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7068719863891602, + "num_tokens": 221434071.0, + "step": 8559 + }, + { + "epoch": 0.940039534372941, + "grad_norm": 1.7364444732666016, + "learning_rate": 5e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7258268594741821, + "num_tokens": 221459868.0, + "step": 8560 + }, + { + "epoch": 0.9401493520755546, + "grad_norm": 1.8442883491516113, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7153303623199463, + "num_tokens": 221486136.0, + "step": 8561 + }, + { + "epoch": 0.9402591697781683, + "grad_norm": 1.848855972290039, + "learning_rate": 5e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7037745118141174, + "num_tokens": 221510465.0, + "step": 8562 + }, + { + "epoch": 0.9403689874807819, + "grad_norm": 1.7916454076766968, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7213438749313354, + "num_tokens": 221534890.0, + "step": 8563 + }, + { + "epoch": 0.9404788051833955, + "grad_norm": 1.724035382270813, + "learning_rate": 5e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7152112722396851, + "num_tokens": 221563979.0, + "step": 8564 + }, + { + "epoch": 0.9405886228860092, + "grad_norm": 1.8158437013626099, + "learning_rate": 5e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7088374495506287, + "num_tokens": 221588097.0, + "step": 8565 + }, + { + "epoch": 0.9406984405886228, + "grad_norm": 1.86776602268219, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.717991828918457, + "num_tokens": 221611275.0, + "step": 8566 + }, + { + "epoch": 0.9408082582912366, + "grad_norm": 1.9434351921081543, + "learning_rate": 5e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7478143572807312, + "num_tokens": 221632047.0, + "step": 8567 + }, + { + "epoch": 0.9409180759938502, + "grad_norm": 1.7241934537887573, + "learning_rate": 5e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7045605182647705, + "num_tokens": 221657193.0, + "step": 8568 + }, + { + "epoch": 0.9410278936964639, + "grad_norm": 1.9442768096923828, + "learning_rate": 5e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7107488512992859, + "num_tokens": 221678735.0, + "step": 8569 + }, + { + "epoch": 0.9411377113990775, + "grad_norm": 1.8584593534469604, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7197511196136475, + "num_tokens": 221700742.0, + "step": 8570 + }, + { + "epoch": 0.9412475291016912, + "grad_norm": 1.7013800144195557, + "learning_rate": 5e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7089495062828064, + "num_tokens": 221728609.0, + "step": 8571 + }, + { + "epoch": 0.9413573468043048, + "grad_norm": 1.689448356628418, + "learning_rate": 5e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6915481686592102, + "num_tokens": 221756348.0, + "step": 8572 + }, + { + "epoch": 0.9414671645069185, + "grad_norm": 1.8394523859024048, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.697676420211792, + "num_tokens": 221782651.0, + "step": 8573 + }, + { + "epoch": 0.9415769822095322, + "grad_norm": 1.8828717470169067, + "learning_rate": 5e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7195357084274292, + "num_tokens": 221806527.0, + "step": 8574 + }, + { + "epoch": 0.9416867999121459, + "grad_norm": 1.6644012928009033, + "learning_rate": 5e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7411097288131714, + "num_tokens": 221832472.0, + "step": 8575 + }, + { + "epoch": 0.9417966176147595, + "grad_norm": 1.7169671058654785, + "learning_rate": 5e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6946195363998413, + "num_tokens": 221860324.0, + "step": 8576 + }, + { + "epoch": 0.9419064353173732, + "grad_norm": 1.5463261604309082, + "learning_rate": 5e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7213941812515259, + "num_tokens": 221889932.0, + "step": 8577 + }, + { + "epoch": 0.9420162530199868, + "grad_norm": 1.8159146308898926, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.705187201499939, + "num_tokens": 221915064.0, + "step": 8578 + }, + { + "epoch": 0.9421260707226005, + "grad_norm": 1.6621342897415161, + "learning_rate": 5e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7217387557029724, + "num_tokens": 221941711.0, + "step": 8579 + }, + { + "epoch": 0.9422358884252141, + "grad_norm": 1.958810567855835, + "learning_rate": 5e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7051619291305542, + "num_tokens": 221963963.0, + "step": 8580 + }, + { + "epoch": 0.9423457061278278, + "grad_norm": 1.741814374923706, + "learning_rate": 5e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6958454251289368, + "num_tokens": 221992534.0, + "step": 8581 + }, + { + "epoch": 0.9424555238304415, + "grad_norm": 1.7243168354034424, + "learning_rate": 5e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7125972509384155, + "num_tokens": 222017067.0, + "step": 8582 + }, + { + "epoch": 0.9425653415330552, + "grad_norm": 1.7090976238250732, + "learning_rate": 5e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7075081467628479, + "num_tokens": 222042787.0, + "step": 8583 + }, + { + "epoch": 0.9426751592356688, + "grad_norm": 1.7538676261901855, + "learning_rate": 5e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7028120160102844, + "num_tokens": 222068652.0, + "step": 8584 + }, + { + "epoch": 0.9427849769382824, + "grad_norm": 1.772529125213623, + "learning_rate": 5e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7328081130981445, + "num_tokens": 222092194.0, + "step": 8585 + }, + { + "epoch": 0.9428947946408961, + "grad_norm": 1.5337533950805664, + "learning_rate": 5e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7010761499404907, + "num_tokens": 222123461.0, + "step": 8586 + }, + { + "epoch": 0.9430046123435097, + "grad_norm": 1.7565584182739258, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7226229310035706, + "num_tokens": 222147629.0, + "step": 8587 + }, + { + "epoch": 0.9431144300461234, + "grad_norm": 1.7831909656524658, + "learning_rate": 5e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6919881105422974, + "num_tokens": 222171691.0, + "step": 8588 + }, + { + "epoch": 0.9432242477487371, + "grad_norm": 1.8492887020111084, + "learning_rate": 5e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7057334184646606, + "num_tokens": 222195703.0, + "step": 8589 + }, + { + "epoch": 0.9433340654513508, + "grad_norm": 1.5603232383728027, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.6986485123634338, + "num_tokens": 222231452.0, + "step": 8590 + }, + { + "epoch": 0.9434438831539644, + "grad_norm": 1.5465130805969238, + "learning_rate": 5e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7352722883224487, + "num_tokens": 222262489.0, + "step": 8591 + }, + { + "epoch": 0.9435537008565781, + "grad_norm": 1.6998709440231323, + "learning_rate": 5e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6819261312484741, + "num_tokens": 222291251.0, + "step": 8592 + }, + { + "epoch": 0.9436635185591917, + "grad_norm": 1.7167673110961914, + "learning_rate": 5e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.711065411567688, + "num_tokens": 222316691.0, + "step": 8593 + }, + { + "epoch": 0.9437733362618054, + "grad_norm": 1.7157589197158813, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7235778570175171, + "num_tokens": 222341035.0, + "step": 8594 + }, + { + "epoch": 0.943883153964419, + "grad_norm": 1.6758925914764404, + "learning_rate": 5e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7097011208534241, + "num_tokens": 222367698.0, + "step": 8595 + }, + { + "epoch": 0.9439929716670328, + "grad_norm": 1.6657445430755615, + "learning_rate": 5e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.712934672832489, + "num_tokens": 222395700.0, + "step": 8596 + }, + { + "epoch": 0.9441027893696464, + "grad_norm": 1.8882852792739868, + "learning_rate": 5e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6919891238212585, + "num_tokens": 222420189.0, + "step": 8597 + }, + { + "epoch": 0.9442126070722601, + "grad_norm": 1.8991297483444214, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7062675952911377, + "num_tokens": 222443316.0, + "step": 8598 + }, + { + "epoch": 0.9443224247748737, + "grad_norm": 1.7118284702301025, + "learning_rate": 5e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7125221490859985, + "num_tokens": 222471462.0, + "step": 8599 + }, + { + "epoch": 0.9444322424774874, + "grad_norm": 1.9294489622116089, + "learning_rate": 5e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7083988785743713, + "num_tokens": 222493069.0, + "step": 8600 + }, + { + "epoch": 0.944542060180101, + "grad_norm": 1.7758748531341553, + "learning_rate": 5e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7264702320098877, + "num_tokens": 222519973.0, + "step": 8601 + }, + { + "epoch": 0.9446518778827147, + "grad_norm": 1.76523756980896, + "learning_rate": 5e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6880922317504883, + "num_tokens": 222550428.0, + "step": 8602 + }, + { + "epoch": 0.9447616955853284, + "grad_norm": 1.6100085973739624, + "learning_rate": 5e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7090318202972412, + "num_tokens": 222579578.0, + "step": 8603 + }, + { + "epoch": 0.944871513287942, + "grad_norm": 1.7529382705688477, + "learning_rate": 5e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7167657613754272, + "num_tokens": 222603927.0, + "step": 8604 + }, + { + "epoch": 0.9449813309905557, + "grad_norm": 1.89180326461792, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.734754741191864, + "num_tokens": 222624539.0, + "step": 8605 + }, + { + "epoch": 0.9450911486931693, + "grad_norm": 1.7735997438430786, + "learning_rate": 5e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7180374264717102, + "num_tokens": 222651939.0, + "step": 8606 + }, + { + "epoch": 0.945200966395783, + "grad_norm": 1.7918899059295654, + "learning_rate": 5e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7057504057884216, + "num_tokens": 222676404.0, + "step": 8607 + }, + { + "epoch": 0.9453107840983966, + "grad_norm": 1.8885365724563599, + "learning_rate": 5e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7210665941238403, + "num_tokens": 222699559.0, + "step": 8608 + }, + { + "epoch": 0.9454206018010103, + "grad_norm": 1.5697894096374512, + "learning_rate": 5e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7289661169052124, + "num_tokens": 222728618.0, + "step": 8609 + }, + { + "epoch": 0.9455304195036239, + "grad_norm": 1.7436447143554688, + "learning_rate": 5e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.712914228439331, + "num_tokens": 222754340.0, + "step": 8610 + }, + { + "epoch": 0.9456402372062377, + "grad_norm": 1.6968389749526978, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7159619331359863, + "num_tokens": 222779895.0, + "step": 8611 + }, + { + "epoch": 0.9457500549088513, + "grad_norm": 1.6510074138641357, + "learning_rate": 5e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7285873889923096, + "num_tokens": 222806376.0, + "step": 8612 + }, + { + "epoch": 0.945859872611465, + "grad_norm": 1.5756521224975586, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7125874161720276, + "num_tokens": 222834954.0, + "step": 8613 + }, + { + "epoch": 0.9459696903140786, + "grad_norm": 1.646985650062561, + "learning_rate": 5e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7062423229217529, + "num_tokens": 222864445.0, + "step": 8614 + }, + { + "epoch": 0.9460795080166923, + "grad_norm": 1.7639808654785156, + "learning_rate": 5e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7055222988128662, + "num_tokens": 222889857.0, + "step": 8615 + }, + { + "epoch": 0.9461893257193059, + "grad_norm": 1.6683287620544434, + "learning_rate": 5e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7241367101669312, + "num_tokens": 222917448.0, + "step": 8616 + }, + { + "epoch": 0.9462991434219196, + "grad_norm": 1.7056535482406616, + "learning_rate": 5e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6851744651794434, + "num_tokens": 222945143.0, + "step": 8617 + }, + { + "epoch": 0.9464089611245333, + "grad_norm": 1.772227168083191, + "learning_rate": 5e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6935602426528931, + "num_tokens": 222970411.0, + "step": 8618 + }, + { + "epoch": 0.946518778827147, + "grad_norm": 1.5832781791687012, + "learning_rate": 5e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6968609094619751, + "num_tokens": 223005394.0, + "step": 8619 + }, + { + "epoch": 0.9466285965297606, + "grad_norm": 1.4432188272476196, + "learning_rate": 5e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6794509291648865, + "num_tokens": 223039751.0, + "step": 8620 + }, + { + "epoch": 0.9467384142323743, + "grad_norm": 1.5207364559173584, + "learning_rate": 5e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7004686594009399, + "num_tokens": 223071069.0, + "step": 8621 + }, + { + "epoch": 0.9468482319349879, + "grad_norm": 1.5908751487731934, + "learning_rate": 5e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.692717432975769, + "num_tokens": 223104217.0, + "step": 8622 + }, + { + "epoch": 0.9469580496376016, + "grad_norm": 1.7673544883728027, + "learning_rate": 5e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7065417766571045, + "num_tokens": 223130424.0, + "step": 8623 + }, + { + "epoch": 0.9470678673402152, + "grad_norm": 1.7467702627182007, + "learning_rate": 5e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.696367084980011, + "num_tokens": 223156357.0, + "step": 8624 + }, + { + "epoch": 0.947177685042829, + "grad_norm": 1.9615718126296997, + "learning_rate": 5e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.6998132467269897, + "num_tokens": 223178535.0, + "step": 8625 + }, + { + "epoch": 0.9472875027454426, + "grad_norm": 1.9636285305023193, + "learning_rate": 5e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7316913604736328, + "num_tokens": 223197692.0, + "step": 8626 + }, + { + "epoch": 0.9473973204480562, + "grad_norm": 1.7902696132659912, + "learning_rate": 5e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7025402784347534, + "num_tokens": 223226175.0, + "step": 8627 + }, + { + "epoch": 0.9475071381506699, + "grad_norm": 1.790748119354248, + "learning_rate": 5e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6991211175918579, + "num_tokens": 223254328.0, + "step": 8628 + }, + { + "epoch": 0.9476169558532835, + "grad_norm": 2.000042676925659, + "learning_rate": 5e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7258832454681396, + "num_tokens": 223279527.0, + "step": 8629 + }, + { + "epoch": 0.9477267735558972, + "grad_norm": 1.7260518074035645, + "learning_rate": 5e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7351832985877991, + "num_tokens": 223305748.0, + "step": 8630 + }, + { + "epoch": 0.9478365912585108, + "grad_norm": 1.7532391548156738, + "learning_rate": 5e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7223724126815796, + "num_tokens": 223328664.0, + "step": 8631 + }, + { + "epoch": 0.9479464089611246, + "grad_norm": 1.6531327962875366, + "learning_rate": 5e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7091732025146484, + "num_tokens": 223355554.0, + "step": 8632 + }, + { + "epoch": 0.9480562266637382, + "grad_norm": 1.71345853805542, + "learning_rate": 5e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6924416422843933, + "num_tokens": 223382259.0, + "step": 8633 + }, + { + "epoch": 0.9481660443663519, + "grad_norm": 1.58341383934021, + "learning_rate": 5e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.6968894004821777, + "num_tokens": 223411311.0, + "step": 8634 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 1.7859001159667969, + "learning_rate": 5e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7157201170921326, + "num_tokens": 223436941.0, + "step": 8635 + }, + { + "epoch": 0.9483856797715792, + "grad_norm": 1.7062734365463257, + "learning_rate": 5e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7125695943832397, + "num_tokens": 223463442.0, + "step": 8636 + }, + { + "epoch": 0.9484954974741928, + "grad_norm": 1.7399003505706787, + "learning_rate": 5e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.705857515335083, + "num_tokens": 223489600.0, + "step": 8637 + }, + { + "epoch": 0.9486053151768065, + "grad_norm": 1.7317367792129517, + "learning_rate": 5e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.718083381652832, + "num_tokens": 223513957.0, + "step": 8638 + }, + { + "epoch": 0.9487151328794201, + "grad_norm": 1.7743587493896484, + "learning_rate": 5e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7014096975326538, + "num_tokens": 223539621.0, + "step": 8639 + }, + { + "epoch": 0.9488249505820339, + "grad_norm": 1.8264034986495972, + "learning_rate": 5e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7196370363235474, + "num_tokens": 223562411.0, + "step": 8640 + }, + { + "epoch": 0.9489347682846475, + "grad_norm": 1.826813817024231, + "learning_rate": 5e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6933378577232361, + "num_tokens": 223587971.0, + "step": 8641 + }, + { + "epoch": 0.9490445859872612, + "grad_norm": 1.8861600160598755, + "learning_rate": 5e-06, + "loss": 1.086, + "mean_token_accuracy": 0.6797018051147461, + "num_tokens": 223613405.0, + "step": 8642 + }, + { + "epoch": 0.9491544036898748, + "grad_norm": 1.6068257093429565, + "learning_rate": 5e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7016842365264893, + "num_tokens": 223646331.0, + "step": 8643 + }, + { + "epoch": 0.9492642213924884, + "grad_norm": 1.7543871402740479, + "learning_rate": 5e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7073537707328796, + "num_tokens": 223672050.0, + "step": 8644 + }, + { + "epoch": 0.9493740390951021, + "grad_norm": 1.9634190797805786, + "learning_rate": 5e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7153064608573914, + "num_tokens": 223694751.0, + "step": 8645 + }, + { + "epoch": 0.9494838567977157, + "grad_norm": 1.760256052017212, + "learning_rate": 5e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7316550016403198, + "num_tokens": 223720103.0, + "step": 8646 + }, + { + "epoch": 0.9495936745003295, + "grad_norm": 1.524048089981079, + "learning_rate": 5e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7019244432449341, + "num_tokens": 223754825.0, + "step": 8647 + }, + { + "epoch": 0.9497034922029431, + "grad_norm": 1.700439214706421, + "learning_rate": 5e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7173962593078613, + "num_tokens": 223781269.0, + "step": 8648 + }, + { + "epoch": 0.9498133099055568, + "grad_norm": 1.6633437871932983, + "learning_rate": 5e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7016282081604004, + "num_tokens": 223808803.0, + "step": 8649 + }, + { + "epoch": 0.9499231276081704, + "grad_norm": 1.773496389389038, + "learning_rate": 5e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7114225625991821, + "num_tokens": 223831250.0, + "step": 8650 + }, + { + "epoch": 0.9500329453107841, + "grad_norm": 1.679064154624939, + "learning_rate": 5e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6956441402435303, + "num_tokens": 223858605.0, + "step": 8651 + }, + { + "epoch": 0.9501427630133977, + "grad_norm": 1.7963587045669556, + "learning_rate": 5e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.679481565952301, + "num_tokens": 223887068.0, + "step": 8652 + }, + { + "epoch": 0.9502525807160114, + "grad_norm": 1.8539437055587769, + "learning_rate": 5e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.6928215622901917, + "num_tokens": 223910970.0, + "step": 8653 + }, + { + "epoch": 0.9503623984186251, + "grad_norm": 1.7922924757003784, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7304589748382568, + "num_tokens": 223933755.0, + "step": 8654 + }, + { + "epoch": 0.9504722161212388, + "grad_norm": 1.6297448873519897, + "learning_rate": 5e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7058164477348328, + "num_tokens": 223960801.0, + "step": 8655 + }, + { + "epoch": 0.9505820338238524, + "grad_norm": 1.6687337160110474, + "learning_rate": 5e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.700883150100708, + "num_tokens": 223988252.0, + "step": 8656 + }, + { + "epoch": 0.9506918515264661, + "grad_norm": 1.8106135129928589, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7291548848152161, + "num_tokens": 224011250.0, + "step": 8657 + }, + { + "epoch": 0.9508016692290797, + "grad_norm": 1.7196602821350098, + "learning_rate": 5e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7084109783172607, + "num_tokens": 224037603.0, + "step": 8658 + }, + { + "epoch": 0.9509114869316934, + "grad_norm": 1.7611483335494995, + "learning_rate": 5e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7189840078353882, + "num_tokens": 224062146.0, + "step": 8659 + }, + { + "epoch": 0.951021304634307, + "grad_norm": 1.7902804613113403, + "learning_rate": 5e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7008010745048523, + "num_tokens": 224090381.0, + "step": 8660 + }, + { + "epoch": 0.9511311223369208, + "grad_norm": 1.8556772470474243, + "learning_rate": 5e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.719689667224884, + "num_tokens": 224114769.0, + "step": 8661 + }, + { + "epoch": 0.9512409400395344, + "grad_norm": 1.659134030342102, + "learning_rate": 5e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.715935468673706, + "num_tokens": 224143543.0, + "step": 8662 + }, + { + "epoch": 0.9513507577421481, + "grad_norm": 1.6860584020614624, + "learning_rate": 5e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7044471502304077, + "num_tokens": 224169749.0, + "step": 8663 + }, + { + "epoch": 0.9514605754447617, + "grad_norm": 1.661044955253601, + "learning_rate": 5e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7232195138931274, + "num_tokens": 224198111.0, + "step": 8664 + }, + { + "epoch": 0.9515703931473753, + "grad_norm": 1.943003535270691, + "learning_rate": 5e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7117158770561218, + "num_tokens": 224220039.0, + "step": 8665 + }, + { + "epoch": 0.951680210849989, + "grad_norm": 1.6627804040908813, + "learning_rate": 5e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7278264760971069, + "num_tokens": 224245492.0, + "step": 8666 + }, + { + "epoch": 0.9517900285526026, + "grad_norm": 1.9184329509735107, + "learning_rate": 5e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.699189305305481, + "num_tokens": 224269326.0, + "step": 8667 + }, + { + "epoch": 0.9518998462552163, + "grad_norm": 1.828791618347168, + "learning_rate": 5e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.6958096027374268, + "num_tokens": 224294695.0, + "step": 8668 + }, + { + "epoch": 0.95200966395783, + "grad_norm": 1.8649686574935913, + "learning_rate": 5e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.6969293355941772, + "num_tokens": 224319898.0, + "step": 8669 + }, + { + "epoch": 0.9521194816604437, + "grad_norm": 1.7285140752792358, + "learning_rate": 5e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6837525367736816, + "num_tokens": 224348085.0, + "step": 8670 + }, + { + "epoch": 0.9522292993630573, + "grad_norm": 1.7196565866470337, + "learning_rate": 5e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7099555730819702, + "num_tokens": 224373477.0, + "step": 8671 + }, + { + "epoch": 0.952339117065671, + "grad_norm": 1.7467889785766602, + "learning_rate": 5e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7003995180130005, + "num_tokens": 224401217.0, + "step": 8672 + }, + { + "epoch": 0.9524489347682846, + "grad_norm": 1.7162083387374878, + "learning_rate": 5e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7093615531921387, + "num_tokens": 224428112.0, + "step": 8673 + }, + { + "epoch": 0.9525587524708983, + "grad_norm": 1.9347305297851562, + "learning_rate": 5e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.697168231010437, + "num_tokens": 224453401.0, + "step": 8674 + }, + { + "epoch": 0.9526685701735119, + "grad_norm": 1.9195964336395264, + "learning_rate": 5e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6825741529464722, + "num_tokens": 224477322.0, + "step": 8675 + }, + { + "epoch": 0.9527783878761257, + "grad_norm": 1.5736154317855835, + "learning_rate": 5e-06, + "loss": 0.928, + "mean_token_accuracy": 0.718193531036377, + "num_tokens": 224507370.0, + "step": 8676 + }, + { + "epoch": 0.9528882055787393, + "grad_norm": 1.8068653345108032, + "learning_rate": 5e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7127761840820312, + "num_tokens": 224532005.0, + "step": 8677 + }, + { + "epoch": 0.952998023281353, + "grad_norm": 1.8021866083145142, + "learning_rate": 5e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7255972623825073, + "num_tokens": 224555314.0, + "step": 8678 + }, + { + "epoch": 0.9531078409839666, + "grad_norm": 1.8119711875915527, + "learning_rate": 5e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7117207646369934, + "num_tokens": 224579617.0, + "step": 8679 + }, + { + "epoch": 0.9532176586865803, + "grad_norm": 1.8491466045379639, + "learning_rate": 5e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7061703205108643, + "num_tokens": 224603979.0, + "step": 8680 + }, + { + "epoch": 0.9533274763891939, + "grad_norm": 1.6742355823516846, + "learning_rate": 5e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7021907567977905, + "num_tokens": 224632418.0, + "step": 8681 + }, + { + "epoch": 0.9534372940918076, + "grad_norm": 1.8765056133270264, + "learning_rate": 5e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7090054154396057, + "num_tokens": 224654808.0, + "step": 8682 + }, + { + "epoch": 0.9535471117944213, + "grad_norm": 1.7573720216751099, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7223450541496277, + "num_tokens": 224680377.0, + "step": 8683 + }, + { + "epoch": 0.953656929497035, + "grad_norm": 1.6241990327835083, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7299258708953857, + "num_tokens": 224706639.0, + "step": 8684 + }, + { + "epoch": 0.9537667471996486, + "grad_norm": 1.6067357063293457, + "learning_rate": 5e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.69440096616745, + "num_tokens": 224735949.0, + "step": 8685 + }, + { + "epoch": 0.9538765649022622, + "grad_norm": 1.716034173965454, + "learning_rate": 5e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7095355987548828, + "num_tokens": 224760153.0, + "step": 8686 + }, + { + "epoch": 0.9539863826048759, + "grad_norm": 1.724430799484253, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7281559109687805, + "num_tokens": 224784323.0, + "step": 8687 + }, + { + "epoch": 0.9540962003074895, + "grad_norm": 1.7776789665222168, + "learning_rate": 5e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7167760133743286, + "num_tokens": 224807482.0, + "step": 8688 + }, + { + "epoch": 0.9542060180101032, + "grad_norm": 1.7141362428665161, + "learning_rate": 5e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7125887274742126, + "num_tokens": 224833167.0, + "step": 8689 + }, + { + "epoch": 0.9543158357127169, + "grad_norm": 1.8058149814605713, + "learning_rate": 5e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7064728736877441, + "num_tokens": 224858108.0, + "step": 8690 + }, + { + "epoch": 0.9544256534153306, + "grad_norm": 1.676360845565796, + "learning_rate": 5e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7190107703208923, + "num_tokens": 224885876.0, + "step": 8691 + }, + { + "epoch": 0.9545354711179442, + "grad_norm": 2.0379748344421387, + "learning_rate": 5e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7219868898391724, + "num_tokens": 224905348.0, + "step": 8692 + }, + { + "epoch": 0.9546452888205579, + "grad_norm": 1.884106159210205, + "learning_rate": 5e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7146039009094238, + "num_tokens": 224929981.0, + "step": 8693 + }, + { + "epoch": 0.9547551065231715, + "grad_norm": 2.0144805908203125, + "learning_rate": 5e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7257029414176941, + "num_tokens": 224950839.0, + "step": 8694 + }, + { + "epoch": 0.9548649242257852, + "grad_norm": 1.837931752204895, + "learning_rate": 5e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7255864143371582, + "num_tokens": 224973975.0, + "step": 8695 + }, + { + "epoch": 0.9549747419283988, + "grad_norm": 1.7838069200515747, + "learning_rate": 5e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.706261396408081, + "num_tokens": 224999347.0, + "step": 8696 + }, + { + "epoch": 0.9550845596310125, + "grad_norm": 1.5767110586166382, + "learning_rate": 5e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7173886895179749, + "num_tokens": 225033175.0, + "step": 8697 + }, + { + "epoch": 0.9551943773336262, + "grad_norm": 1.6413706541061401, + "learning_rate": 5e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6861807107925415, + "num_tokens": 225061485.0, + "step": 8698 + }, + { + "epoch": 0.9553041950362399, + "grad_norm": 1.8213211297988892, + "learning_rate": 5e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7247629165649414, + "num_tokens": 225084774.0, + "step": 8699 + }, + { + "epoch": 0.9554140127388535, + "grad_norm": 1.6003339290618896, + "learning_rate": 5e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6930246353149414, + "num_tokens": 225116760.0, + "step": 8700 + }, + { + "epoch": 0.9555238304414672, + "grad_norm": 1.790174961090088, + "learning_rate": 5e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7174637317657471, + "num_tokens": 225140527.0, + "step": 8701 + }, + { + "epoch": 0.9556336481440808, + "grad_norm": 1.8658853769302368, + "learning_rate": 5e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.6978267431259155, + "num_tokens": 225164642.0, + "step": 8702 + }, + { + "epoch": 0.9557434658466945, + "grad_norm": 1.7415233850479126, + "learning_rate": 5e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7141616344451904, + "num_tokens": 225190855.0, + "step": 8703 + }, + { + "epoch": 0.9558532835493081, + "grad_norm": 1.6183860301971436, + "learning_rate": 5e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6892017126083374, + "num_tokens": 225220680.0, + "step": 8704 + }, + { + "epoch": 0.9559631012519219, + "grad_norm": 1.930230736732483, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.698920726776123, + "num_tokens": 225244458.0, + "step": 8705 + }, + { + "epoch": 0.9560729189545355, + "grad_norm": 1.7995878458023071, + "learning_rate": 5e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7198930978775024, + "num_tokens": 225268220.0, + "step": 8706 + }, + { + "epoch": 0.9561827366571491, + "grad_norm": 1.8042629957199097, + "learning_rate": 5e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7020828127861023, + "num_tokens": 225292650.0, + "step": 8707 + }, + { + "epoch": 0.9562925543597628, + "grad_norm": 1.8419933319091797, + "learning_rate": 5e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6856503486633301, + "num_tokens": 225317140.0, + "step": 8708 + }, + { + "epoch": 0.9564023720623764, + "grad_norm": 1.7357475757598877, + "learning_rate": 5e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7080700397491455, + "num_tokens": 225342095.0, + "step": 8709 + }, + { + "epoch": 0.9565121897649901, + "grad_norm": 1.9268286228179932, + "learning_rate": 5e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.6960341930389404, + "num_tokens": 225364275.0, + "step": 8710 + }, + { + "epoch": 0.9566220074676037, + "grad_norm": 1.8860042095184326, + "learning_rate": 5e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6934417486190796, + "num_tokens": 225387630.0, + "step": 8711 + }, + { + "epoch": 0.9567318251702175, + "grad_norm": 1.5603238344192505, + "learning_rate": 5e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6926074028015137, + "num_tokens": 225422664.0, + "step": 8712 + }, + { + "epoch": 0.9568416428728311, + "grad_norm": 1.7464715242385864, + "learning_rate": 5e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6985337138175964, + "num_tokens": 225451102.0, + "step": 8713 + }, + { + "epoch": 0.9569514605754448, + "grad_norm": 1.6693633794784546, + "learning_rate": 5e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6823520064353943, + "num_tokens": 225480118.0, + "step": 8714 + }, + { + "epoch": 0.9570612782780584, + "grad_norm": 1.7743299007415771, + "learning_rate": 5e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7284017205238342, + "num_tokens": 225506369.0, + "step": 8715 + }, + { + "epoch": 0.9571710959806721, + "grad_norm": 1.7864328622817993, + "learning_rate": 5e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.702858567237854, + "num_tokens": 225530616.0, + "step": 8716 + }, + { + "epoch": 0.9572809136832857, + "grad_norm": 2.085411310195923, + "learning_rate": 5e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7171891927719116, + "num_tokens": 225550387.0, + "step": 8717 + }, + { + "epoch": 0.9573907313858994, + "grad_norm": 1.7107044458389282, + "learning_rate": 5e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.704626739025116, + "num_tokens": 225574887.0, + "step": 8718 + }, + { + "epoch": 0.9575005490885131, + "grad_norm": 1.5986796617507935, + "learning_rate": 5e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6793419122695923, + "num_tokens": 225608604.0, + "step": 8719 + }, + { + "epoch": 0.9576103667911268, + "grad_norm": 1.593754529953003, + "learning_rate": 5e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6846282482147217, + "num_tokens": 225640539.0, + "step": 8720 + }, + { + "epoch": 0.9577201844937404, + "grad_norm": 1.7745212316513062, + "learning_rate": 5e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7228488326072693, + "num_tokens": 225664698.0, + "step": 8721 + }, + { + "epoch": 0.9578300021963541, + "grad_norm": 1.8971893787384033, + "learning_rate": 5e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6822588443756104, + "num_tokens": 225689245.0, + "step": 8722 + }, + { + "epoch": 0.9579398198989677, + "grad_norm": 1.8079683780670166, + "learning_rate": 5e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7179982662200928, + "num_tokens": 225715019.0, + "step": 8723 + }, + { + "epoch": 0.9580496376015813, + "grad_norm": 1.5848982334136963, + "learning_rate": 5e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7206626534461975, + "num_tokens": 225741327.0, + "step": 8724 + }, + { + "epoch": 0.958159455304195, + "grad_norm": 1.8983299732208252, + "learning_rate": 5e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7173702716827393, + "num_tokens": 225764140.0, + "step": 8725 + }, + { + "epoch": 0.9582692730068088, + "grad_norm": 1.837361454963684, + "learning_rate": 5e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7217627763748169, + "num_tokens": 225788416.0, + "step": 8726 + }, + { + "epoch": 0.9583790907094224, + "grad_norm": 1.7285845279693604, + "learning_rate": 5e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6935635209083557, + "num_tokens": 225817975.0, + "step": 8727 + }, + { + "epoch": 0.958488908412036, + "grad_norm": 1.8435461521148682, + "learning_rate": 5e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7186068296432495, + "num_tokens": 225844770.0, + "step": 8728 + }, + { + "epoch": 0.9585987261146497, + "grad_norm": 1.7349019050598145, + "learning_rate": 5e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7139906883239746, + "num_tokens": 225871579.0, + "step": 8729 + }, + { + "epoch": 0.9587085438172633, + "grad_norm": 1.945766568183899, + "learning_rate": 5e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7250962257385254, + "num_tokens": 225889843.0, + "step": 8730 + }, + { + "epoch": 0.958818361519877, + "grad_norm": 1.8800146579742432, + "learning_rate": 5e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7127096652984619, + "num_tokens": 225913404.0, + "step": 8731 + }, + { + "epoch": 0.9589281792224906, + "grad_norm": 1.7605164051055908, + "learning_rate": 5e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.709963321685791, + "num_tokens": 225940255.0, + "step": 8732 + }, + { + "epoch": 0.9590379969251043, + "grad_norm": 1.5981929302215576, + "learning_rate": 5e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6963956356048584, + "num_tokens": 225969270.0, + "step": 8733 + }, + { + "epoch": 0.959147814627718, + "grad_norm": 1.9719206094741821, + "learning_rate": 5e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7099201679229736, + "num_tokens": 225990088.0, + "step": 8734 + }, + { + "epoch": 0.9592576323303317, + "grad_norm": 1.8373241424560547, + "learning_rate": 5e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7222455739974976, + "num_tokens": 226014356.0, + "step": 8735 + }, + { + "epoch": 0.9593674500329453, + "grad_norm": 1.9577592611312866, + "learning_rate": 5e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7162311673164368, + "num_tokens": 226035924.0, + "step": 8736 + }, + { + "epoch": 0.959477267735559, + "grad_norm": 1.8047237396240234, + "learning_rate": 5e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7238185405731201, + "num_tokens": 226060996.0, + "step": 8737 + }, + { + "epoch": 0.9595870854381726, + "grad_norm": 1.9028459787368774, + "learning_rate": 5e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7178932428359985, + "num_tokens": 226083163.0, + "step": 8738 + }, + { + "epoch": 0.9596969031407863, + "grad_norm": 1.8863277435302734, + "learning_rate": 5e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7056057453155518, + "num_tokens": 226106135.0, + "step": 8739 + }, + { + "epoch": 0.9598067208433999, + "grad_norm": 2.0188496112823486, + "learning_rate": 5e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7155531644821167, + "num_tokens": 226125436.0, + "step": 8740 + }, + { + "epoch": 0.9599165385460137, + "grad_norm": 1.7613240480422974, + "learning_rate": 5e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7189714908599854, + "num_tokens": 226148097.0, + "step": 8741 + }, + { + "epoch": 0.9600263562486273, + "grad_norm": 1.8143616914749146, + "learning_rate": 5e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7059429883956909, + "num_tokens": 226173960.0, + "step": 8742 + }, + { + "epoch": 0.960136173951241, + "grad_norm": 1.6157349348068237, + "learning_rate": 5e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7035467624664307, + "num_tokens": 226204648.0, + "step": 8743 + }, + { + "epoch": 0.9602459916538546, + "grad_norm": 1.82893705368042, + "learning_rate": 5e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7112040519714355, + "num_tokens": 226228737.0, + "step": 8744 + }, + { + "epoch": 0.9603558093564682, + "grad_norm": 1.7326146364212036, + "learning_rate": 5e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7180885076522827, + "num_tokens": 226254044.0, + "step": 8745 + }, + { + "epoch": 0.9604656270590819, + "grad_norm": 1.9282054901123047, + "learning_rate": 5e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7201025485992432, + "num_tokens": 226275234.0, + "step": 8746 + }, + { + "epoch": 0.9605754447616955, + "grad_norm": 1.8432352542877197, + "learning_rate": 5e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7155570983886719, + "num_tokens": 226298130.0, + "step": 8747 + }, + { + "epoch": 0.9606852624643093, + "grad_norm": 1.8051605224609375, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7193922400474548, + "num_tokens": 226320477.0, + "step": 8748 + }, + { + "epoch": 0.9607950801669229, + "grad_norm": 1.9420210123062134, + "learning_rate": 5e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7234196662902832, + "num_tokens": 226346891.0, + "step": 8749 + }, + { + "epoch": 0.9609048978695366, + "grad_norm": 1.7111022472381592, + "learning_rate": 5e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7026499509811401, + "num_tokens": 226374777.0, + "step": 8750 + }, + { + "epoch": 0.9610147155721502, + "grad_norm": 1.8823589086532593, + "learning_rate": 5e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7090373039245605, + "num_tokens": 226400098.0, + "step": 8751 + }, + { + "epoch": 0.9611245332747639, + "grad_norm": 1.6794713735580444, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7316219210624695, + "num_tokens": 226425232.0, + "step": 8752 + }, + { + "epoch": 0.9612343509773775, + "grad_norm": 1.8084406852722168, + "learning_rate": 5e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7180087566375732, + "num_tokens": 226450680.0, + "step": 8753 + }, + { + "epoch": 0.9613441686799912, + "grad_norm": 1.97459876537323, + "learning_rate": 5e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7156657576560974, + "num_tokens": 226470380.0, + "step": 8754 + }, + { + "epoch": 0.9614539863826049, + "grad_norm": 1.6722079515457153, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.714058518409729, + "num_tokens": 226499654.0, + "step": 8755 + }, + { + "epoch": 0.9615638040852186, + "grad_norm": 2.311923027038574, + "learning_rate": 5e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7158055305480957, + "num_tokens": 226514790.0, + "step": 8756 + }, + { + "epoch": 0.9616736217878322, + "grad_norm": 1.8667351007461548, + "learning_rate": 5e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7179770469665527, + "num_tokens": 226537249.0, + "step": 8757 + }, + { + "epoch": 0.9617834394904459, + "grad_norm": 1.8033679723739624, + "learning_rate": 5e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7042052745819092, + "num_tokens": 226560372.0, + "step": 8758 + }, + { + "epoch": 0.9618932571930595, + "grad_norm": 1.862497329711914, + "learning_rate": 5e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7165290117263794, + "num_tokens": 226584462.0, + "step": 8759 + }, + { + "epoch": 0.9620030748956732, + "grad_norm": 1.7498809099197388, + "learning_rate": 5e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7035205364227295, + "num_tokens": 226609890.0, + "step": 8760 + }, + { + "epoch": 0.9621128925982868, + "grad_norm": 1.9563535451889038, + "learning_rate": 5e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6940176486968994, + "num_tokens": 226633899.0, + "step": 8761 + }, + { + "epoch": 0.9622227103009005, + "grad_norm": 1.8866928815841675, + "learning_rate": 5e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.708713173866272, + "num_tokens": 226658237.0, + "step": 8762 + }, + { + "epoch": 0.9623325280035142, + "grad_norm": 1.9911190271377563, + "learning_rate": 5e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.720474362373352, + "num_tokens": 226679029.0, + "step": 8763 + }, + { + "epoch": 0.9624423457061279, + "grad_norm": 1.9173163175582886, + "learning_rate": 5e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7110445499420166, + "num_tokens": 226701751.0, + "step": 8764 + }, + { + "epoch": 0.9625521634087415, + "grad_norm": 1.9136931896209717, + "learning_rate": 5e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7250628471374512, + "num_tokens": 226723987.0, + "step": 8765 + }, + { + "epoch": 0.9626619811113551, + "grad_norm": 1.7482675313949585, + "learning_rate": 5e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7123990058898926, + "num_tokens": 226751019.0, + "step": 8766 + }, + { + "epoch": 0.9627717988139688, + "grad_norm": 1.6840758323669434, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7353595495223999, + "num_tokens": 226776061.0, + "step": 8767 + }, + { + "epoch": 0.9628816165165824, + "grad_norm": 1.7107782363891602, + "learning_rate": 5e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7009807825088501, + "num_tokens": 226802383.0, + "step": 8768 + }, + { + "epoch": 0.9629914342191961, + "grad_norm": 1.594641923904419, + "learning_rate": 5e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7201694250106812, + "num_tokens": 226832442.0, + "step": 8769 + }, + { + "epoch": 0.9631012519218098, + "grad_norm": 1.7894619703292847, + "learning_rate": 5e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7116472125053406, + "num_tokens": 226858658.0, + "step": 8770 + }, + { + "epoch": 0.9632110696244235, + "grad_norm": 1.6571435928344727, + "learning_rate": 5e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7203459739685059, + "num_tokens": 226886821.0, + "step": 8771 + }, + { + "epoch": 0.9633208873270371, + "grad_norm": 1.7732211351394653, + "learning_rate": 5e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7139590382575989, + "num_tokens": 226911728.0, + "step": 8772 + }, + { + "epoch": 0.9634307050296508, + "grad_norm": 1.9151691198349, + "learning_rate": 5e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7109832763671875, + "num_tokens": 226935263.0, + "step": 8773 + }, + { + "epoch": 0.9635405227322644, + "grad_norm": 1.7466864585876465, + "learning_rate": 5e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7216500043869019, + "num_tokens": 226958537.0, + "step": 8774 + }, + { + "epoch": 0.9636503404348781, + "grad_norm": 1.6639958620071411, + "learning_rate": 5e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7148504257202148, + "num_tokens": 226985948.0, + "step": 8775 + }, + { + "epoch": 0.9637601581374917, + "grad_norm": 1.6736661195755005, + "learning_rate": 5e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.718859076499939, + "num_tokens": 227014141.0, + "step": 8776 + }, + { + "epoch": 0.9638699758401055, + "grad_norm": 1.8013681173324585, + "learning_rate": 5e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6782814264297485, + "num_tokens": 227039204.0, + "step": 8777 + }, + { + "epoch": 0.9639797935427191, + "grad_norm": 1.6441391706466675, + "learning_rate": 5e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6974571943283081, + "num_tokens": 227068781.0, + "step": 8778 + }, + { + "epoch": 0.9640896112453328, + "grad_norm": 1.6958706378936768, + "learning_rate": 5e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6980553865432739, + "num_tokens": 227096642.0, + "step": 8779 + }, + { + "epoch": 0.9641994289479464, + "grad_norm": 1.9171167612075806, + "learning_rate": 5e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6895601153373718, + "num_tokens": 227119496.0, + "step": 8780 + }, + { + "epoch": 0.9643092466505601, + "grad_norm": 1.7531176805496216, + "learning_rate": 5e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7179990410804749, + "num_tokens": 227143375.0, + "step": 8781 + }, + { + "epoch": 0.9644190643531737, + "grad_norm": 1.5728355646133423, + "learning_rate": 5e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.6962901949882507, + "num_tokens": 227174421.0, + "step": 8782 + }, + { + "epoch": 0.9645288820557874, + "grad_norm": 1.7758420705795288, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7264140248298645, + "num_tokens": 227201229.0, + "step": 8783 + }, + { + "epoch": 0.9646386997584011, + "grad_norm": 1.7336938381195068, + "learning_rate": 5e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6928014755249023, + "num_tokens": 227227610.0, + "step": 8784 + }, + { + "epoch": 0.9647485174610148, + "grad_norm": 1.7333554029464722, + "learning_rate": 5e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.710999608039856, + "num_tokens": 227253149.0, + "step": 8785 + }, + { + "epoch": 0.9648583351636284, + "grad_norm": 1.641436219215393, + "learning_rate": 5e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7202198505401611, + "num_tokens": 227280592.0, + "step": 8786 + }, + { + "epoch": 0.964968152866242, + "grad_norm": 1.715715765953064, + "learning_rate": 5e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6966909766197205, + "num_tokens": 227308207.0, + "step": 8787 + }, + { + "epoch": 0.9650779705688557, + "grad_norm": 1.5989434719085693, + "learning_rate": 5e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7301619052886963, + "num_tokens": 227334270.0, + "step": 8788 + }, + { + "epoch": 0.9651877882714693, + "grad_norm": 1.9457004070281982, + "learning_rate": 5e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7147029638290405, + "num_tokens": 227357004.0, + "step": 8789 + }, + { + "epoch": 0.965297605974083, + "grad_norm": 1.8147082328796387, + "learning_rate": 5e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6863451600074768, + "num_tokens": 227384998.0, + "step": 8790 + }, + { + "epoch": 0.9654074236766966, + "grad_norm": 1.755875587463379, + "learning_rate": 5e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7060437202453613, + "num_tokens": 227410062.0, + "step": 8791 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 1.744261622428894, + "learning_rate": 5e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7121516466140747, + "num_tokens": 227434273.0, + "step": 8792 + }, + { + "epoch": 0.965627059081924, + "grad_norm": 1.5638688802719116, + "learning_rate": 5e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.7069332003593445, + "num_tokens": 227466332.0, + "step": 8793 + }, + { + "epoch": 0.9657368767845377, + "grad_norm": 1.5393891334533691, + "learning_rate": 5e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7222112417221069, + "num_tokens": 227500012.0, + "step": 8794 + }, + { + "epoch": 0.9658466944871513, + "grad_norm": 1.7632911205291748, + "learning_rate": 5e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7157418727874756, + "num_tokens": 227524362.0, + "step": 8795 + }, + { + "epoch": 0.965956512189765, + "grad_norm": 1.9641321897506714, + "learning_rate": 5e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7221800088882446, + "num_tokens": 227544091.0, + "step": 8796 + }, + { + "epoch": 0.9660663298923786, + "grad_norm": 1.6742236614227295, + "learning_rate": 5e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6778620481491089, + "num_tokens": 227572138.0, + "step": 8797 + }, + { + "epoch": 0.9661761475949923, + "grad_norm": 1.5914361476898193, + "learning_rate": 5e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6797100305557251, + "num_tokens": 227603057.0, + "step": 8798 + }, + { + "epoch": 0.966285965297606, + "grad_norm": 1.7676644325256348, + "learning_rate": 5e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7117949724197388, + "num_tokens": 227631177.0, + "step": 8799 + }, + { + "epoch": 0.9663957830002197, + "grad_norm": 1.700571894645691, + "learning_rate": 5e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6957277059555054, + "num_tokens": 227657882.0, + "step": 8800 + }, + { + "epoch": 0.9665056007028333, + "grad_norm": 1.603476881980896, + "learning_rate": 5e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7142347097396851, + "num_tokens": 227686901.0, + "step": 8801 + }, + { + "epoch": 0.966615418405447, + "grad_norm": 1.8613067865371704, + "learning_rate": 5e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7075223922729492, + "num_tokens": 227709241.0, + "step": 8802 + }, + { + "epoch": 0.9667252361080606, + "grad_norm": 1.516096591949463, + "learning_rate": 5e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7071027755737305, + "num_tokens": 227743975.0, + "step": 8803 + }, + { + "epoch": 0.9668350538106742, + "grad_norm": 1.610405445098877, + "learning_rate": 5e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7292553186416626, + "num_tokens": 227773705.0, + "step": 8804 + }, + { + "epoch": 0.9669448715132879, + "grad_norm": 1.5864313840866089, + "learning_rate": 5e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7088289260864258, + "num_tokens": 227803779.0, + "step": 8805 + }, + { + "epoch": 0.9670546892159017, + "grad_norm": 1.716401219367981, + "learning_rate": 5e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7080831527709961, + "num_tokens": 227831515.0, + "step": 8806 + }, + { + "epoch": 0.9671645069185153, + "grad_norm": 1.9968513250350952, + "learning_rate": 5e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7028968334197998, + "num_tokens": 227852963.0, + "step": 8807 + }, + { + "epoch": 0.967274324621129, + "grad_norm": 2.2338976860046387, + "learning_rate": 5e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7391930222511292, + "num_tokens": 227868227.0, + "step": 8808 + }, + { + "epoch": 0.9673841423237426, + "grad_norm": 1.91184401512146, + "learning_rate": 5e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7181017398834229, + "num_tokens": 227890427.0, + "step": 8809 + }, + { + "epoch": 0.9674939600263562, + "grad_norm": 1.7183808088302612, + "learning_rate": 5e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7138605117797852, + "num_tokens": 227917066.0, + "step": 8810 + }, + { + "epoch": 0.9676037777289699, + "grad_norm": 1.6149420738220215, + "learning_rate": 5e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7164526581764221, + "num_tokens": 227945461.0, + "step": 8811 + }, + { + "epoch": 0.9677135954315835, + "grad_norm": 1.6586860418319702, + "learning_rate": 5e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7225468754768372, + "num_tokens": 227973599.0, + "step": 8812 + }, + { + "epoch": 0.9678234131341973, + "grad_norm": 1.574333667755127, + "learning_rate": 5e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7038023471832275, + "num_tokens": 228006432.0, + "step": 8813 + }, + { + "epoch": 0.9679332308368109, + "grad_norm": 1.718017816543579, + "learning_rate": 5e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7078799605369568, + "num_tokens": 228032802.0, + "step": 8814 + }, + { + "epoch": 0.9680430485394246, + "grad_norm": 1.7909623384475708, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7072597742080688, + "num_tokens": 228060380.0, + "step": 8815 + }, + { + "epoch": 0.9681528662420382, + "grad_norm": 1.9371907711029053, + "learning_rate": 5e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.6978231072425842, + "num_tokens": 228081460.0, + "step": 8816 + }, + { + "epoch": 0.9682626839446519, + "grad_norm": 1.596252679824829, + "learning_rate": 5e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6891330480575562, + "num_tokens": 228114842.0, + "step": 8817 + }, + { + "epoch": 0.9683725016472655, + "grad_norm": 1.7417585849761963, + "learning_rate": 5e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7084861397743225, + "num_tokens": 228140451.0, + "step": 8818 + }, + { + "epoch": 0.9684823193498792, + "grad_norm": 1.8969966173171997, + "learning_rate": 5e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7025352716445923, + "num_tokens": 228166245.0, + "step": 8819 + }, + { + "epoch": 0.9685921370524928, + "grad_norm": 1.6337345838546753, + "learning_rate": 5e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7175897359848022, + "num_tokens": 228195588.0, + "step": 8820 + }, + { + "epoch": 0.9687019547551066, + "grad_norm": 1.6288695335388184, + "learning_rate": 5e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6999455094337463, + "num_tokens": 228225938.0, + "step": 8821 + }, + { + "epoch": 0.9688117724577202, + "grad_norm": 1.9003206491470337, + "learning_rate": 5e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.697992205619812, + "num_tokens": 228248319.0, + "step": 8822 + }, + { + "epoch": 0.9689215901603339, + "grad_norm": 1.745252013206482, + "learning_rate": 5e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7024846076965332, + "num_tokens": 228275978.0, + "step": 8823 + }, + { + "epoch": 0.9690314078629475, + "grad_norm": 1.9424017667770386, + "learning_rate": 5e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7075625658035278, + "num_tokens": 228297825.0, + "step": 8824 + }, + { + "epoch": 0.9691412255655611, + "grad_norm": 1.7166965007781982, + "learning_rate": 5e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7066993713378906, + "num_tokens": 228323061.0, + "step": 8825 + }, + { + "epoch": 0.9692510432681748, + "grad_norm": 1.687875509262085, + "learning_rate": 5e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7030081748962402, + "num_tokens": 228350476.0, + "step": 8826 + }, + { + "epoch": 0.9693608609707884, + "grad_norm": 1.8141977787017822, + "learning_rate": 5e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6949278116226196, + "num_tokens": 228377767.0, + "step": 8827 + }, + { + "epoch": 0.9694706786734022, + "grad_norm": 1.9879040718078613, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6970981359481812, + "num_tokens": 228399213.0, + "step": 8828 + }, + { + "epoch": 0.9695804963760158, + "grad_norm": 1.8565967082977295, + "learning_rate": 5e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7206162214279175, + "num_tokens": 228422329.0, + "step": 8829 + }, + { + "epoch": 0.9696903140786295, + "grad_norm": 1.4520354270935059, + "learning_rate": 5e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6988577842712402, + "num_tokens": 228457500.0, + "step": 8830 + }, + { + "epoch": 0.9698001317812431, + "grad_norm": 1.8383986949920654, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.701056957244873, + "num_tokens": 228481500.0, + "step": 8831 + }, + { + "epoch": 0.9699099494838568, + "grad_norm": 1.7384674549102783, + "learning_rate": 5e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6875631809234619, + "num_tokens": 228507388.0, + "step": 8832 + }, + { + "epoch": 0.9700197671864704, + "grad_norm": 1.6417522430419922, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.71263587474823, + "num_tokens": 228535242.0, + "step": 8833 + }, + { + "epoch": 0.9701295848890841, + "grad_norm": 1.8189038038253784, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7028875946998596, + "num_tokens": 228558958.0, + "step": 8834 + }, + { + "epoch": 0.9702394025916978, + "grad_norm": 1.6746450662612915, + "learning_rate": 5e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6895990967750549, + "num_tokens": 228584279.0, + "step": 8835 + }, + { + "epoch": 0.9703492202943115, + "grad_norm": 1.6328870058059692, + "learning_rate": 5e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7028626799583435, + "num_tokens": 228613554.0, + "step": 8836 + }, + { + "epoch": 0.9704590379969251, + "grad_norm": 1.5863465070724487, + "learning_rate": 5e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7145687937736511, + "num_tokens": 228645418.0, + "step": 8837 + }, + { + "epoch": 0.9705688556995388, + "grad_norm": 1.9586055278778076, + "learning_rate": 5e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.6954423785209656, + "num_tokens": 228666034.0, + "step": 8838 + }, + { + "epoch": 0.9706786734021524, + "grad_norm": 1.6935454607009888, + "learning_rate": 5e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6994411945343018, + "num_tokens": 228693750.0, + "step": 8839 + }, + { + "epoch": 0.9707884911047661, + "grad_norm": 1.552592158317566, + "learning_rate": 5e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7032078504562378, + "num_tokens": 228726460.0, + "step": 8840 + }, + { + "epoch": 0.9708983088073797, + "grad_norm": 1.7193135023117065, + "learning_rate": 5e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7122597694396973, + "num_tokens": 228751042.0, + "step": 8841 + }, + { + "epoch": 0.9710081265099935, + "grad_norm": 1.8722487688064575, + "learning_rate": 5e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7189950942993164, + "num_tokens": 228771833.0, + "step": 8842 + }, + { + "epoch": 0.9711179442126071, + "grad_norm": 1.7706849575042725, + "learning_rate": 5e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.6973532438278198, + "num_tokens": 228797624.0, + "step": 8843 + }, + { + "epoch": 0.9712277619152208, + "grad_norm": 1.7016276121139526, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6954150795936584, + "num_tokens": 228827519.0, + "step": 8844 + }, + { + "epoch": 0.9713375796178344, + "grad_norm": 1.705073595046997, + "learning_rate": 5e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.67889004945755, + "num_tokens": 228857102.0, + "step": 8845 + }, + { + "epoch": 0.971447397320448, + "grad_norm": 1.80301034450531, + "learning_rate": 5e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6857370138168335, + "num_tokens": 228881053.0, + "step": 8846 + }, + { + "epoch": 0.9715572150230617, + "grad_norm": 1.5864956378936768, + "learning_rate": 5e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6815353631973267, + "num_tokens": 228912062.0, + "step": 8847 + }, + { + "epoch": 0.9716670327256753, + "grad_norm": 1.6914894580841064, + "learning_rate": 5e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7281209230422974, + "num_tokens": 228935033.0, + "step": 8848 + }, + { + "epoch": 0.971776850428289, + "grad_norm": 1.8438448905944824, + "learning_rate": 5e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7145650386810303, + "num_tokens": 228958918.0, + "step": 8849 + }, + { + "epoch": 0.9718866681309027, + "grad_norm": 1.6578508615493774, + "learning_rate": 5e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7090538740158081, + "num_tokens": 228985618.0, + "step": 8850 + }, + { + "epoch": 0.9719964858335164, + "grad_norm": 2.0175745487213135, + "learning_rate": 5e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7209609746932983, + "num_tokens": 229007751.0, + "step": 8851 + }, + { + "epoch": 0.97210630353613, + "grad_norm": 1.922690510749817, + "learning_rate": 5e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.714117705821991, + "num_tokens": 229032685.0, + "step": 8852 + }, + { + "epoch": 0.9722161212387437, + "grad_norm": 1.949625015258789, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7301708459854126, + "num_tokens": 229053057.0, + "step": 8853 + }, + { + "epoch": 0.9723259389413573, + "grad_norm": 1.5984152555465698, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7158809304237366, + "num_tokens": 229080714.0, + "step": 8854 + }, + { + "epoch": 0.972435756643971, + "grad_norm": 1.8406386375427246, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7177155017852783, + "num_tokens": 229104769.0, + "step": 8855 + }, + { + "epoch": 0.9725455743465846, + "grad_norm": 1.6351451873779297, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7298645973205566, + "num_tokens": 229136527.0, + "step": 8856 + }, + { + "epoch": 0.9726553920491984, + "grad_norm": 1.7474544048309326, + "learning_rate": 5e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.705733060836792, + "num_tokens": 229162906.0, + "step": 8857 + }, + { + "epoch": 0.972765209751812, + "grad_norm": 1.6555019617080688, + "learning_rate": 5e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7058263421058655, + "num_tokens": 229190241.0, + "step": 8858 + }, + { + "epoch": 0.9728750274544257, + "grad_norm": 1.7372257709503174, + "learning_rate": 5e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7289159893989563, + "num_tokens": 229214527.0, + "step": 8859 + }, + { + "epoch": 0.9729848451570393, + "grad_norm": 1.940673589706421, + "learning_rate": 5e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.6994288563728333, + "num_tokens": 229234582.0, + "step": 8860 + }, + { + "epoch": 0.973094662859653, + "grad_norm": 1.7442586421966553, + "learning_rate": 5e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7335803508758545, + "num_tokens": 229261154.0, + "step": 8861 + }, + { + "epoch": 0.9732044805622666, + "grad_norm": 1.831252932548523, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7344428300857544, + "num_tokens": 229284160.0, + "step": 8862 + }, + { + "epoch": 0.9733142982648803, + "grad_norm": 1.9218802452087402, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7208550572395325, + "num_tokens": 229305416.0, + "step": 8863 + }, + { + "epoch": 0.973424115967494, + "grad_norm": 1.9540513753890991, + "learning_rate": 5e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7230327129364014, + "num_tokens": 229325690.0, + "step": 8864 + }, + { + "epoch": 0.9735339336701077, + "grad_norm": 1.7532588243484497, + "learning_rate": 5e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.710867702960968, + "num_tokens": 229351645.0, + "step": 8865 + }, + { + "epoch": 0.9736437513727213, + "grad_norm": 1.8271297216415405, + "learning_rate": 5e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7094724178314209, + "num_tokens": 229377544.0, + "step": 8866 + }, + { + "epoch": 0.973753569075335, + "grad_norm": 2.045794725418091, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.722220242023468, + "num_tokens": 229397066.0, + "step": 8867 + }, + { + "epoch": 0.9738633867779486, + "grad_norm": 1.788205862045288, + "learning_rate": 5e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7178691625595093, + "num_tokens": 229423636.0, + "step": 8868 + }, + { + "epoch": 0.9739732044805622, + "grad_norm": 2.018568754196167, + "learning_rate": 5e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7215779423713684, + "num_tokens": 229443648.0, + "step": 8869 + }, + { + "epoch": 0.9740830221831759, + "grad_norm": 1.6990569829940796, + "learning_rate": 5e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7171095609664917, + "num_tokens": 229471951.0, + "step": 8870 + }, + { + "epoch": 0.9741928398857896, + "grad_norm": 1.6057331562042236, + "learning_rate": 5e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6939572095870972, + "num_tokens": 229505236.0, + "step": 8871 + }, + { + "epoch": 0.9743026575884033, + "grad_norm": 1.6833375692367554, + "learning_rate": 5e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7085742950439453, + "num_tokens": 229531120.0, + "step": 8872 + }, + { + "epoch": 0.9744124752910169, + "grad_norm": 1.975874900817871, + "learning_rate": 5e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7121735215187073, + "num_tokens": 229553347.0, + "step": 8873 + }, + { + "epoch": 0.9745222929936306, + "grad_norm": 1.9362475872039795, + "learning_rate": 5e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7116420269012451, + "num_tokens": 229578008.0, + "step": 8874 + }, + { + "epoch": 0.9746321106962442, + "grad_norm": 1.791170358657837, + "learning_rate": 5e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7134069204330444, + "num_tokens": 229602193.0, + "step": 8875 + }, + { + "epoch": 0.9747419283988579, + "grad_norm": 1.7021830081939697, + "learning_rate": 5e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7240934371948242, + "num_tokens": 229629591.0, + "step": 8876 + }, + { + "epoch": 0.9748517461014715, + "grad_norm": 1.8881523609161377, + "learning_rate": 5e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7192932367324829, + "num_tokens": 229652320.0, + "step": 8877 + }, + { + "epoch": 0.9749615638040853, + "grad_norm": 1.7311171293258667, + "learning_rate": 5e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7096796035766602, + "num_tokens": 229677885.0, + "step": 8878 + }, + { + "epoch": 0.9750713815066989, + "grad_norm": 1.7997339963912964, + "learning_rate": 5e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7117660641670227, + "num_tokens": 229701891.0, + "step": 8879 + }, + { + "epoch": 0.9751811992093126, + "grad_norm": 1.7848680019378662, + "learning_rate": 5e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7109304070472717, + "num_tokens": 229726549.0, + "step": 8880 + }, + { + "epoch": 0.9752910169119262, + "grad_norm": 1.8972679376602173, + "learning_rate": 5e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7254507541656494, + "num_tokens": 229749054.0, + "step": 8881 + }, + { + "epoch": 0.9754008346145399, + "grad_norm": 1.6941444873809814, + "learning_rate": 5e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7102223634719849, + "num_tokens": 229775886.0, + "step": 8882 + }, + { + "epoch": 0.9755106523171535, + "grad_norm": 1.6684247255325317, + "learning_rate": 5e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7194487452507019, + "num_tokens": 229805975.0, + "step": 8883 + }, + { + "epoch": 0.9756204700197671, + "grad_norm": 1.5749053955078125, + "learning_rate": 5e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6845754384994507, + "num_tokens": 229836709.0, + "step": 8884 + }, + { + "epoch": 0.9757302877223808, + "grad_norm": 1.6442817449569702, + "learning_rate": 5e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7147010564804077, + "num_tokens": 229861755.0, + "step": 8885 + }, + { + "epoch": 0.9758401054249946, + "grad_norm": 1.8439104557037354, + "learning_rate": 5e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7060215473175049, + "num_tokens": 229885279.0, + "step": 8886 + }, + { + "epoch": 0.9759499231276082, + "grad_norm": 1.9359349012374878, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7142902612686157, + "num_tokens": 229904799.0, + "step": 8887 + }, + { + "epoch": 0.9760597408302218, + "grad_norm": 1.6300715208053589, + "learning_rate": 5e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7273414134979248, + "num_tokens": 229932505.0, + "step": 8888 + }, + { + "epoch": 0.9761695585328355, + "grad_norm": 1.8358384370803833, + "learning_rate": 5e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7047705054283142, + "num_tokens": 229956956.0, + "step": 8889 + }, + { + "epoch": 0.9762793762354491, + "grad_norm": 1.9333460330963135, + "learning_rate": 5e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7136969566345215, + "num_tokens": 229978879.0, + "step": 8890 + }, + { + "epoch": 0.9763891939380628, + "grad_norm": 2.098026752471924, + "learning_rate": 5e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6975172758102417, + "num_tokens": 229998839.0, + "step": 8891 + }, + { + "epoch": 0.9764990116406764, + "grad_norm": 1.9395453929901123, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7212497591972351, + "num_tokens": 230019278.0, + "step": 8892 + }, + { + "epoch": 0.9766088293432902, + "grad_norm": 1.7480756044387817, + "learning_rate": 5e-06, + "loss": 0.975, + "mean_token_accuracy": 0.6969121694564819, + "num_tokens": 230044553.0, + "step": 8893 + }, + { + "epoch": 0.9767186470459038, + "grad_norm": 1.6259969472885132, + "learning_rate": 5e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7191729545593262, + "num_tokens": 230070967.0, + "step": 8894 + }, + { + "epoch": 0.9768284647485175, + "grad_norm": 1.8635607957839966, + "learning_rate": 5e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6931468844413757, + "num_tokens": 230094471.0, + "step": 8895 + }, + { + "epoch": 0.9769382824511311, + "grad_norm": 1.647445797920227, + "learning_rate": 5e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7055944204330444, + "num_tokens": 230122274.0, + "step": 8896 + }, + { + "epoch": 0.9770481001537448, + "grad_norm": 1.6738187074661255, + "learning_rate": 5e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.722233772277832, + "num_tokens": 230150759.0, + "step": 8897 + }, + { + "epoch": 0.9771579178563584, + "grad_norm": 2.3220224380493164, + "learning_rate": 5e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7260922789573669, + "num_tokens": 230168804.0, + "step": 8898 + }, + { + "epoch": 0.9772677355589721, + "grad_norm": 2.1178839206695557, + "learning_rate": 5e-06, + "loss": 1.0344, + "mean_token_accuracy": 0.6902471780776978, + "num_tokens": 230189621.0, + "step": 8899 + }, + { + "epoch": 0.9773775532615858, + "grad_norm": 1.6438466310501099, + "learning_rate": 5e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7113319635391235, + "num_tokens": 230220025.0, + "step": 8900 + }, + { + "epoch": 0.9774873709641995, + "grad_norm": 1.699717402458191, + "learning_rate": 5e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7145669460296631, + "num_tokens": 230245677.0, + "step": 8901 + }, + { + "epoch": 0.9775971886668131, + "grad_norm": 1.8430730104446411, + "learning_rate": 5e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7191109657287598, + "num_tokens": 230267783.0, + "step": 8902 + }, + { + "epoch": 0.9777070063694268, + "grad_norm": 1.8734664916992188, + "learning_rate": 5e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6829553842544556, + "num_tokens": 230293377.0, + "step": 8903 + }, + { + "epoch": 0.9778168240720404, + "grad_norm": 1.8877137899398804, + "learning_rate": 5e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6980547904968262, + "num_tokens": 230318139.0, + "step": 8904 + }, + { + "epoch": 0.977926641774654, + "grad_norm": 1.776293396949768, + "learning_rate": 5e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7020180225372314, + "num_tokens": 230342904.0, + "step": 8905 + }, + { + "epoch": 0.9780364594772677, + "grad_norm": 1.7012526988983154, + "learning_rate": 5e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7026282548904419, + "num_tokens": 230369493.0, + "step": 8906 + }, + { + "epoch": 0.9781462771798815, + "grad_norm": 1.8799062967300415, + "learning_rate": 5e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7133141756057739, + "num_tokens": 230392127.0, + "step": 8907 + }, + { + "epoch": 0.9782560948824951, + "grad_norm": 1.914048194885254, + "learning_rate": 5e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7132534384727478, + "num_tokens": 230414685.0, + "step": 8908 + }, + { + "epoch": 0.9783659125851087, + "grad_norm": 1.5847692489624023, + "learning_rate": 5e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6958841681480408, + "num_tokens": 230443387.0, + "step": 8909 + }, + { + "epoch": 0.9784757302877224, + "grad_norm": 1.7126766443252563, + "learning_rate": 5e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7027338743209839, + "num_tokens": 230469244.0, + "step": 8910 + }, + { + "epoch": 0.978585547990336, + "grad_norm": 1.5559406280517578, + "learning_rate": 5e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6876136660575867, + "num_tokens": 230501019.0, + "step": 8911 + }, + { + "epoch": 0.9786953656929497, + "grad_norm": 1.6799904108047485, + "learning_rate": 5e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7274852395057678, + "num_tokens": 230526104.0, + "step": 8912 + }, + { + "epoch": 0.9788051833955633, + "grad_norm": 1.6830774545669556, + "learning_rate": 5e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7286969423294067, + "num_tokens": 230553059.0, + "step": 8913 + }, + { + "epoch": 0.978915001098177, + "grad_norm": 2.0003154277801514, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7241299152374268, + "num_tokens": 230574795.0, + "step": 8914 + }, + { + "epoch": 0.9790248188007907, + "grad_norm": 1.7575573921203613, + "learning_rate": 5e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.707138180732727, + "num_tokens": 230601570.0, + "step": 8915 + }, + { + "epoch": 0.9791346365034044, + "grad_norm": 1.7419766187667847, + "learning_rate": 5e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7135111093521118, + "num_tokens": 230626410.0, + "step": 8916 + }, + { + "epoch": 0.979244454206018, + "grad_norm": 1.7967263460159302, + "learning_rate": 5e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7411106824874878, + "num_tokens": 230651728.0, + "step": 8917 + }, + { + "epoch": 0.9793542719086317, + "grad_norm": 1.6533207893371582, + "learning_rate": 5e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7169911861419678, + "num_tokens": 230677781.0, + "step": 8918 + }, + { + "epoch": 0.9794640896112453, + "grad_norm": 1.7546762228012085, + "learning_rate": 5e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7133615016937256, + "num_tokens": 230703282.0, + "step": 8919 + }, + { + "epoch": 0.979573907313859, + "grad_norm": 1.6880698204040527, + "learning_rate": 5e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7144964933395386, + "num_tokens": 230730625.0, + "step": 8920 + }, + { + "epoch": 0.9796837250164726, + "grad_norm": 1.5499409437179565, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.717817485332489, + "num_tokens": 230760578.0, + "step": 8921 + }, + { + "epoch": 0.9797935427190864, + "grad_norm": 1.8799824714660645, + "learning_rate": 5e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7161844968795776, + "num_tokens": 230782683.0, + "step": 8922 + }, + { + "epoch": 0.9799033604217, + "grad_norm": 1.8122386932373047, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7319243550300598, + "num_tokens": 230806677.0, + "step": 8923 + }, + { + "epoch": 0.9800131781243137, + "grad_norm": 1.8119326829910278, + "learning_rate": 5e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7447219491004944, + "num_tokens": 230829766.0, + "step": 8924 + }, + { + "epoch": 0.9801229958269273, + "grad_norm": 1.7184237241744995, + "learning_rate": 5e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7071353793144226, + "num_tokens": 230859157.0, + "step": 8925 + }, + { + "epoch": 0.980232813529541, + "grad_norm": 1.6499395370483398, + "learning_rate": 5e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.698257327079773, + "num_tokens": 230889840.0, + "step": 8926 + }, + { + "epoch": 0.9803426312321546, + "grad_norm": 1.8583000898361206, + "learning_rate": 5e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7174662351608276, + "num_tokens": 230912397.0, + "step": 8927 + }, + { + "epoch": 0.9804524489347682, + "grad_norm": 1.7344489097595215, + "learning_rate": 5e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6988058686256409, + "num_tokens": 230941199.0, + "step": 8928 + }, + { + "epoch": 0.980562266637382, + "grad_norm": 1.9553842544555664, + "learning_rate": 5e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.713594913482666, + "num_tokens": 230962285.0, + "step": 8929 + }, + { + "epoch": 0.9806720843399956, + "grad_norm": 1.6886613368988037, + "learning_rate": 5e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7221066355705261, + "num_tokens": 230990960.0, + "step": 8930 + }, + { + "epoch": 0.9807819020426093, + "grad_norm": 1.8482306003570557, + "learning_rate": 5e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.737909197807312, + "num_tokens": 231013376.0, + "step": 8931 + }, + { + "epoch": 0.9808917197452229, + "grad_norm": 1.674808382987976, + "learning_rate": 5e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6888691186904907, + "num_tokens": 231039968.0, + "step": 8932 + }, + { + "epoch": 0.9810015374478366, + "grad_norm": 1.5471460819244385, + "learning_rate": 5e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7025226950645447, + "num_tokens": 231069290.0, + "step": 8933 + }, + { + "epoch": 0.9811113551504502, + "grad_norm": 1.7130361795425415, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.722387433052063, + "num_tokens": 231099469.0, + "step": 8934 + }, + { + "epoch": 0.9812211728530639, + "grad_norm": 2.1049745082855225, + "learning_rate": 5e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.74224454164505, + "num_tokens": 231117231.0, + "step": 8935 + }, + { + "epoch": 0.9813309905556776, + "grad_norm": 1.6585683822631836, + "learning_rate": 5e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6970032453536987, + "num_tokens": 231149806.0, + "step": 8936 + }, + { + "epoch": 0.9814408082582913, + "grad_norm": 1.7027314901351929, + "learning_rate": 5e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7168245911598206, + "num_tokens": 231175700.0, + "step": 8937 + }, + { + "epoch": 0.9815506259609049, + "grad_norm": 1.7941380739212036, + "learning_rate": 5e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7191368937492371, + "num_tokens": 231200330.0, + "step": 8938 + }, + { + "epoch": 0.9816604436635186, + "grad_norm": 1.6922584772109985, + "learning_rate": 5e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7178659439086914, + "num_tokens": 231225500.0, + "step": 8939 + }, + { + "epoch": 0.9817702613661322, + "grad_norm": 1.8366551399230957, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7155205607414246, + "num_tokens": 231249851.0, + "step": 8940 + }, + { + "epoch": 0.9818800790687459, + "grad_norm": 1.5004452466964722, + "learning_rate": 5e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6752817034721375, + "num_tokens": 231286235.0, + "step": 8941 + }, + { + "epoch": 0.9819898967713595, + "grad_norm": 1.8449534177780151, + "learning_rate": 5e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6987514495849609, + "num_tokens": 231311631.0, + "step": 8942 + }, + { + "epoch": 0.9820997144739732, + "grad_norm": 1.7009960412979126, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7370695471763611, + "num_tokens": 231337220.0, + "step": 8943 + }, + { + "epoch": 0.9822095321765869, + "grad_norm": 1.5914610624313354, + "learning_rate": 5e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7034021615982056, + "num_tokens": 231367872.0, + "step": 8944 + }, + { + "epoch": 0.9823193498792006, + "grad_norm": 1.769429087638855, + "learning_rate": 5e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7097052335739136, + "num_tokens": 231393656.0, + "step": 8945 + }, + { + "epoch": 0.9824291675818142, + "grad_norm": 1.953444242477417, + "learning_rate": 5e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7197034358978271, + "num_tokens": 231414843.0, + "step": 8946 + }, + { + "epoch": 0.9825389852844278, + "grad_norm": 1.7586030960083008, + "learning_rate": 5e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7130692601203918, + "num_tokens": 231440323.0, + "step": 8947 + }, + { + "epoch": 0.9826488029870415, + "grad_norm": 1.9583686590194702, + "learning_rate": 5e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7513308525085449, + "num_tokens": 231458663.0, + "step": 8948 + }, + { + "epoch": 0.9827586206896551, + "grad_norm": 1.6455459594726562, + "learning_rate": 5e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6900007724761963, + "num_tokens": 231491630.0, + "step": 8949 + }, + { + "epoch": 0.9828684383922688, + "grad_norm": 2.0569920539855957, + "learning_rate": 5e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7218887805938721, + "num_tokens": 231509796.0, + "step": 8950 + }, + { + "epoch": 0.9829782560948825, + "grad_norm": 1.7363102436065674, + "learning_rate": 5e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7361112236976624, + "num_tokens": 231535691.0, + "step": 8951 + }, + { + "epoch": 0.9830880737974962, + "grad_norm": 1.4931111335754395, + "learning_rate": 5e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.6952991485595703, + "num_tokens": 231567744.0, + "step": 8952 + }, + { + "epoch": 0.9831978915001098, + "grad_norm": 1.7319741249084473, + "learning_rate": 5e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6933975219726562, + "num_tokens": 231596245.0, + "step": 8953 + }, + { + "epoch": 0.9833077092027235, + "grad_norm": 1.727835774421692, + "learning_rate": 5e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7157834768295288, + "num_tokens": 231622805.0, + "step": 8954 + }, + { + "epoch": 0.9834175269053371, + "grad_norm": 1.6241652965545654, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7164887189865112, + "num_tokens": 231651558.0, + "step": 8955 + }, + { + "epoch": 0.9835273446079508, + "grad_norm": 1.730726718902588, + "learning_rate": 5e-06, + "loss": 1.1054, + "mean_token_accuracy": 0.6647245287895203, + "num_tokens": 231683150.0, + "step": 8956 + }, + { + "epoch": 0.9836371623105644, + "grad_norm": 1.6536966562271118, + "learning_rate": 5e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7139588594436646, + "num_tokens": 231712107.0, + "step": 8957 + }, + { + "epoch": 0.9837469800131782, + "grad_norm": 1.5864927768707275, + "learning_rate": 5e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7147977352142334, + "num_tokens": 231744802.0, + "step": 8958 + }, + { + "epoch": 0.9838567977157918, + "grad_norm": 1.6319228410720825, + "learning_rate": 5e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7051169872283936, + "num_tokens": 231773923.0, + "step": 8959 + }, + { + "epoch": 0.9839666154184055, + "grad_norm": 1.6106075048446655, + "learning_rate": 5e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6974889636039734, + "num_tokens": 231804511.0, + "step": 8960 + }, + { + "epoch": 0.9840764331210191, + "grad_norm": 1.5294357538223267, + "learning_rate": 5e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6970059871673584, + "num_tokens": 231837839.0, + "step": 8961 + }, + { + "epoch": 0.9841862508236328, + "grad_norm": 1.7040703296661377, + "learning_rate": 5e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7092030048370361, + "num_tokens": 231864621.0, + "step": 8962 + }, + { + "epoch": 0.9842960685262464, + "grad_norm": 1.7354665994644165, + "learning_rate": 5e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7212117910385132, + "num_tokens": 231888100.0, + "step": 8963 + }, + { + "epoch": 0.98440588622886, + "grad_norm": 1.7890750169754028, + "learning_rate": 5e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.722000002861023, + "num_tokens": 231914000.0, + "step": 8964 + }, + { + "epoch": 0.9845157039314738, + "grad_norm": 2.0047242641448975, + "learning_rate": 5e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.723180890083313, + "num_tokens": 231935167.0, + "step": 8965 + }, + { + "epoch": 0.9846255216340875, + "grad_norm": 1.9288249015808105, + "learning_rate": 5e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7376134395599365, + "num_tokens": 231956460.0, + "step": 8966 + }, + { + "epoch": 0.9847353393367011, + "grad_norm": 1.8781718015670776, + "learning_rate": 5e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7035934329032898, + "num_tokens": 231979923.0, + "step": 8967 + }, + { + "epoch": 0.9848451570393147, + "grad_norm": 2.1270992755889893, + "learning_rate": 5e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7200793027877808, + "num_tokens": 231997872.0, + "step": 8968 + }, + { + "epoch": 0.9849549747419284, + "grad_norm": 1.7375521659851074, + "learning_rate": 5e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7172577381134033, + "num_tokens": 232023020.0, + "step": 8969 + }, + { + "epoch": 0.985064792444542, + "grad_norm": 1.5377988815307617, + "learning_rate": 5e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7332841753959656, + "num_tokens": 232050861.0, + "step": 8970 + }, + { + "epoch": 0.9851746101471557, + "grad_norm": 2.019800901412964, + "learning_rate": 5e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7073289155960083, + "num_tokens": 232071494.0, + "step": 8971 + }, + { + "epoch": 0.9852844278497693, + "grad_norm": 1.6682507991790771, + "learning_rate": 5e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6933754682540894, + "num_tokens": 232102180.0, + "step": 8972 + }, + { + "epoch": 0.9853942455523831, + "grad_norm": 1.6181799173355103, + "learning_rate": 5e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7043705582618713, + "num_tokens": 232129306.0, + "step": 8973 + }, + { + "epoch": 0.9855040632549967, + "grad_norm": 1.863593339920044, + "learning_rate": 5e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7130801677703857, + "num_tokens": 232154651.0, + "step": 8974 + }, + { + "epoch": 0.9856138809576104, + "grad_norm": 1.7826906442642212, + "learning_rate": 5e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7236523628234863, + "num_tokens": 232177855.0, + "step": 8975 + }, + { + "epoch": 0.985723698660224, + "grad_norm": 1.8421143293380737, + "learning_rate": 5e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7020168304443359, + "num_tokens": 232201554.0, + "step": 8976 + }, + { + "epoch": 0.9858335163628377, + "grad_norm": 1.5295131206512451, + "learning_rate": 5e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6935516595840454, + "num_tokens": 232236085.0, + "step": 8977 + }, + { + "epoch": 0.9859433340654513, + "grad_norm": 1.7786006927490234, + "learning_rate": 5e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6933947205543518, + "num_tokens": 232262731.0, + "step": 8978 + }, + { + "epoch": 0.986053151768065, + "grad_norm": 1.6171343326568604, + "learning_rate": 5e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7218273878097534, + "num_tokens": 232289290.0, + "step": 8979 + }, + { + "epoch": 0.9861629694706787, + "grad_norm": 1.6263031959533691, + "learning_rate": 5e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.712364137172699, + "num_tokens": 232318671.0, + "step": 8980 + }, + { + "epoch": 0.9862727871732924, + "grad_norm": 1.8156872987747192, + "learning_rate": 5e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7020094394683838, + "num_tokens": 232344465.0, + "step": 8981 + }, + { + "epoch": 0.986382604875906, + "grad_norm": 1.8879514932632446, + "learning_rate": 5e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7196599245071411, + "num_tokens": 232366047.0, + "step": 8982 + }, + { + "epoch": 0.9864924225785197, + "grad_norm": 1.9285709857940674, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7356529235839844, + "num_tokens": 232385515.0, + "step": 8983 + }, + { + "epoch": 0.9866022402811333, + "grad_norm": 1.687453031539917, + "learning_rate": 5e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7317978739738464, + "num_tokens": 232409859.0, + "step": 8984 + }, + { + "epoch": 0.986712057983747, + "grad_norm": 1.862558126449585, + "learning_rate": 5e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7014950513839722, + "num_tokens": 232436029.0, + "step": 8985 + }, + { + "epoch": 0.9868218756863606, + "grad_norm": 1.8386495113372803, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7212294340133667, + "num_tokens": 232459175.0, + "step": 8986 + }, + { + "epoch": 0.9869316933889744, + "grad_norm": 1.6885095834732056, + "learning_rate": 5e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7144830822944641, + "num_tokens": 232485611.0, + "step": 8987 + }, + { + "epoch": 0.987041511091588, + "grad_norm": 1.6101789474487305, + "learning_rate": 5e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.684507429599762, + "num_tokens": 232518239.0, + "step": 8988 + }, + { + "epoch": 0.9871513287942016, + "grad_norm": 1.6754891872406006, + "learning_rate": 5e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.70700603723526, + "num_tokens": 232543785.0, + "step": 8989 + }, + { + "epoch": 0.9872611464968153, + "grad_norm": 1.677467703819275, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7137646079063416, + "num_tokens": 232572042.0, + "step": 8990 + }, + { + "epoch": 0.9873709641994289, + "grad_norm": 1.6694997549057007, + "learning_rate": 5e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7184996604919434, + "num_tokens": 232599221.0, + "step": 8991 + }, + { + "epoch": 0.9874807819020426, + "grad_norm": 1.5765708684921265, + "learning_rate": 5e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7014732360839844, + "num_tokens": 232632247.0, + "step": 8992 + }, + { + "epoch": 0.9875905996046562, + "grad_norm": 1.7086693048477173, + "learning_rate": 5e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.6773660182952881, + "num_tokens": 232662956.0, + "step": 8993 + }, + { + "epoch": 0.98770041730727, + "grad_norm": 1.846014380455017, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7282885909080505, + "num_tokens": 232684614.0, + "step": 8994 + }, + { + "epoch": 0.9878102350098836, + "grad_norm": 1.8138704299926758, + "learning_rate": 5e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6987695693969727, + "num_tokens": 232709507.0, + "step": 8995 + }, + { + "epoch": 0.9879200527124973, + "grad_norm": 1.8822792768478394, + "learning_rate": 5e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6992580890655518, + "num_tokens": 232734184.0, + "step": 8996 + }, + { + "epoch": 0.9880298704151109, + "grad_norm": 1.666806936264038, + "learning_rate": 5e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6837728023529053, + "num_tokens": 232763349.0, + "step": 8997 + }, + { + "epoch": 0.9881396881177246, + "grad_norm": 1.761391520500183, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7139116525650024, + "num_tokens": 232787918.0, + "step": 8998 + }, + { + "epoch": 0.9882495058203382, + "grad_norm": 1.8599854707717896, + "learning_rate": 5e-06, + "loss": 0.982, + "mean_token_accuracy": 0.6969746947288513, + "num_tokens": 232813027.0, + "step": 8999 + }, + { + "epoch": 0.9883593235229519, + "grad_norm": 1.7781174182891846, + "learning_rate": 5e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7135480046272278, + "num_tokens": 232838126.0, + "step": 9000 + }, + { + "epoch": 0.9884691412255655, + "grad_norm": 2.0240440368652344, + "learning_rate": 5e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7298643589019775, + "num_tokens": 232857691.0, + "step": 9001 + }, + { + "epoch": 0.9885789589281793, + "grad_norm": 1.696629524230957, + "learning_rate": 5e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7223483920097351, + "num_tokens": 232884188.0, + "step": 9002 + }, + { + "epoch": 0.9886887766307929, + "grad_norm": 1.7173787355422974, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7078756093978882, + "num_tokens": 232911493.0, + "step": 9003 + }, + { + "epoch": 0.9887985943334066, + "grad_norm": 1.9843170642852783, + "learning_rate": 5e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7255710363388062, + "num_tokens": 232931634.0, + "step": 9004 + }, + { + "epoch": 0.9889084120360202, + "grad_norm": 1.7161939144134521, + "learning_rate": 5e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6806499361991882, + "num_tokens": 232958752.0, + "step": 9005 + }, + { + "epoch": 0.9890182297386338, + "grad_norm": 1.6886608600616455, + "learning_rate": 5e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7121222615242004, + "num_tokens": 232987234.0, + "step": 9006 + }, + { + "epoch": 0.9891280474412475, + "grad_norm": 1.6992756128311157, + "learning_rate": 5e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7008134126663208, + "num_tokens": 233015543.0, + "step": 9007 + }, + { + "epoch": 0.9892378651438611, + "grad_norm": 1.8128570318222046, + "learning_rate": 5e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7170930504798889, + "num_tokens": 233040299.0, + "step": 9008 + }, + { + "epoch": 0.9893476828464749, + "grad_norm": 1.7291003465652466, + "learning_rate": 5e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7101098895072937, + "num_tokens": 233065627.0, + "step": 9009 + }, + { + "epoch": 0.9894575005490885, + "grad_norm": 1.777525782585144, + "learning_rate": 5e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6921138763427734, + "num_tokens": 233089011.0, + "step": 9010 + }, + { + "epoch": 0.9895673182517022, + "grad_norm": 1.5763187408447266, + "learning_rate": 5e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7078168392181396, + "num_tokens": 233121048.0, + "step": 9011 + }, + { + "epoch": 0.9896771359543158, + "grad_norm": 1.6638941764831543, + "learning_rate": 5e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.694279670715332, + "num_tokens": 233147803.0, + "step": 9012 + }, + { + "epoch": 0.9897869536569295, + "grad_norm": 1.551945686340332, + "learning_rate": 5e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6955485343933105, + "num_tokens": 233179159.0, + "step": 9013 + }, + { + "epoch": 0.9898967713595431, + "grad_norm": 1.4840465784072876, + "learning_rate": 5e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7232630848884583, + "num_tokens": 233215718.0, + "step": 9014 + }, + { + "epoch": 0.9900065890621568, + "grad_norm": 1.9002217054367065, + "learning_rate": 5e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.705085813999176, + "num_tokens": 233238374.0, + "step": 9015 + }, + { + "epoch": 0.9901164067647705, + "grad_norm": 1.7112390995025635, + "learning_rate": 5e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6992952823638916, + "num_tokens": 233265445.0, + "step": 9016 + }, + { + "epoch": 0.9902262244673842, + "grad_norm": 1.7140969038009644, + "learning_rate": 5e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6912135481834412, + "num_tokens": 233295096.0, + "step": 9017 + }, + { + "epoch": 0.9903360421699978, + "grad_norm": 1.5844155550003052, + "learning_rate": 5e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7062731981277466, + "num_tokens": 233327577.0, + "step": 9018 + }, + { + "epoch": 0.9904458598726115, + "grad_norm": 1.6104134321212769, + "learning_rate": 5e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7220460176467896, + "num_tokens": 233356762.0, + "step": 9019 + }, + { + "epoch": 0.9905556775752251, + "grad_norm": 1.6971771717071533, + "learning_rate": 5e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7253287434577942, + "num_tokens": 233384347.0, + "step": 9020 + }, + { + "epoch": 0.9906654952778388, + "grad_norm": 1.8863879442214966, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7299742698669434, + "num_tokens": 233405381.0, + "step": 9021 + }, + { + "epoch": 0.9907753129804524, + "grad_norm": 1.7696112394332886, + "learning_rate": 5e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7267789840698242, + "num_tokens": 233428100.0, + "step": 9022 + }, + { + "epoch": 0.9908851306830662, + "grad_norm": 1.7520217895507812, + "learning_rate": 5e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7067058086395264, + "num_tokens": 233451698.0, + "step": 9023 + }, + { + "epoch": 0.9909949483856798, + "grad_norm": 1.7653359174728394, + "learning_rate": 5e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7119091749191284, + "num_tokens": 233476898.0, + "step": 9024 + }, + { + "epoch": 0.9911047660882935, + "grad_norm": 1.6328239440917969, + "learning_rate": 5e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.698752760887146, + "num_tokens": 233508056.0, + "step": 9025 + }, + { + "epoch": 0.9912145837909071, + "grad_norm": 1.9877994060516357, + "learning_rate": 5e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7227144241333008, + "num_tokens": 233527409.0, + "step": 9026 + }, + { + "epoch": 0.9913244014935207, + "grad_norm": 1.6397534608840942, + "learning_rate": 5e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7241811156272888, + "num_tokens": 233553621.0, + "step": 9027 + }, + { + "epoch": 0.9914342191961344, + "grad_norm": 1.6750391721725464, + "learning_rate": 5e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7007681131362915, + "num_tokens": 233582148.0, + "step": 9028 + }, + { + "epoch": 0.991544036898748, + "grad_norm": 1.533247709274292, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7131054997444153, + "num_tokens": 233613856.0, + "step": 9029 + }, + { + "epoch": 0.9916538546013618, + "grad_norm": 1.7625128030776978, + "learning_rate": 5e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7292510867118835, + "num_tokens": 233637331.0, + "step": 9030 + }, + { + "epoch": 0.9917636723039754, + "grad_norm": 1.5242445468902588, + "learning_rate": 5e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7153915166854858, + "num_tokens": 233667503.0, + "step": 9031 + }, + { + "epoch": 0.9918734900065891, + "grad_norm": 1.6969809532165527, + "learning_rate": 5e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6926851868629456, + "num_tokens": 233694358.0, + "step": 9032 + }, + { + "epoch": 0.9919833077092027, + "grad_norm": 1.925845980644226, + "learning_rate": 5e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7107584476470947, + "num_tokens": 233714719.0, + "step": 9033 + }, + { + "epoch": 0.9920931254118164, + "grad_norm": 1.6066429615020752, + "learning_rate": 5e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.695163369178772, + "num_tokens": 233746309.0, + "step": 9034 + }, + { + "epoch": 0.99220294311443, + "grad_norm": 1.7046067714691162, + "learning_rate": 5e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6937641501426697, + "num_tokens": 233776759.0, + "step": 9035 + }, + { + "epoch": 0.9923127608170437, + "grad_norm": 1.8134000301361084, + "learning_rate": 5e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7060611844062805, + "num_tokens": 233803269.0, + "step": 9036 + }, + { + "epoch": 0.9924225785196573, + "grad_norm": 1.6273854970932007, + "learning_rate": 5e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7162511348724365, + "num_tokens": 233829634.0, + "step": 9037 + }, + { + "epoch": 0.9925323962222711, + "grad_norm": 1.7183774709701538, + "learning_rate": 5e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7260394096374512, + "num_tokens": 233854801.0, + "step": 9038 + }, + { + "epoch": 0.9926422139248847, + "grad_norm": 1.7521427869796753, + "learning_rate": 5e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7118988037109375, + "num_tokens": 233880301.0, + "step": 9039 + }, + { + "epoch": 0.9927520316274984, + "grad_norm": 1.9503567218780518, + "learning_rate": 5e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7379809021949768, + "num_tokens": 233898203.0, + "step": 9040 + }, + { + "epoch": 0.992861849330112, + "grad_norm": 1.8677586317062378, + "learning_rate": 5e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.730202853679657, + "num_tokens": 233921005.0, + "step": 9041 + }, + { + "epoch": 0.9929716670327257, + "grad_norm": 1.8977327346801758, + "learning_rate": 5e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7193034887313843, + "num_tokens": 233942311.0, + "step": 9042 + }, + { + "epoch": 0.9930814847353393, + "grad_norm": 1.829234004020691, + "learning_rate": 5e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7034892439842224, + "num_tokens": 233964369.0, + "step": 9043 + }, + { + "epoch": 0.993191302437953, + "grad_norm": 1.645646572113037, + "learning_rate": 5e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7055363059043884, + "num_tokens": 233995959.0, + "step": 9044 + }, + { + "epoch": 0.9933011201405667, + "grad_norm": 1.9458731412887573, + "learning_rate": 5e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6889792680740356, + "num_tokens": 234019319.0, + "step": 9045 + }, + { + "epoch": 0.9934109378431804, + "grad_norm": 1.6096076965332031, + "learning_rate": 5e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6772280335426331, + "num_tokens": 234049875.0, + "step": 9046 + }, + { + "epoch": 0.993520755545794, + "grad_norm": 1.7451708316802979, + "learning_rate": 5e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6788958311080933, + "num_tokens": 234078492.0, + "step": 9047 + }, + { + "epoch": 0.9936305732484076, + "grad_norm": 1.631341576576233, + "learning_rate": 5e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6920008659362793, + "num_tokens": 234108427.0, + "step": 9048 + }, + { + "epoch": 0.9937403909510213, + "grad_norm": 1.5755902528762817, + "learning_rate": 5e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6931215524673462, + "num_tokens": 234139955.0, + "step": 9049 + }, + { + "epoch": 0.9938502086536349, + "grad_norm": 1.7957878112792969, + "learning_rate": 5e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6880022883415222, + "num_tokens": 234166158.0, + "step": 9050 + }, + { + "epoch": 0.9939600263562486, + "grad_norm": 1.6602489948272705, + "learning_rate": 5e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6961952447891235, + "num_tokens": 234192922.0, + "step": 9051 + }, + { + "epoch": 0.9940698440588623, + "grad_norm": 1.7678788900375366, + "learning_rate": 5e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6976932883262634, + "num_tokens": 234219095.0, + "step": 9052 + }, + { + "epoch": 0.994179661761476, + "grad_norm": 1.6388554573059082, + "learning_rate": 5e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6945794224739075, + "num_tokens": 234247330.0, + "step": 9053 + }, + { + "epoch": 0.9942894794640896, + "grad_norm": 1.8517831563949585, + "learning_rate": 5e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7097363471984863, + "num_tokens": 234270786.0, + "step": 9054 + }, + { + "epoch": 0.9943992971667033, + "grad_norm": 1.804222583770752, + "learning_rate": 5e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7166032791137695, + "num_tokens": 234294771.0, + "step": 9055 + }, + { + "epoch": 0.9945091148693169, + "grad_norm": 1.8131051063537598, + "learning_rate": 5e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7181402444839478, + "num_tokens": 234316491.0, + "step": 9056 + }, + { + "epoch": 0.9946189325719306, + "grad_norm": 1.863931655883789, + "learning_rate": 5e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.720175564289093, + "num_tokens": 234339456.0, + "step": 9057 + }, + { + "epoch": 0.9947287502745442, + "grad_norm": 1.6785671710968018, + "learning_rate": 5e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7157463431358337, + "num_tokens": 234364803.0, + "step": 9058 + }, + { + "epoch": 0.994838567977158, + "grad_norm": 1.6704812049865723, + "learning_rate": 5e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7143090963363647, + "num_tokens": 234393774.0, + "step": 9059 + }, + { + "epoch": 0.9949483856797716, + "grad_norm": 1.7547194957733154, + "learning_rate": 5e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.71087646484375, + "num_tokens": 234418242.0, + "step": 9060 + }, + { + "epoch": 0.9950582033823853, + "grad_norm": 1.9703049659729004, + "learning_rate": 5e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7053421139717102, + "num_tokens": 234439608.0, + "step": 9061 + }, + { + "epoch": 0.9951680210849989, + "grad_norm": 1.6555392742156982, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7136667966842651, + "num_tokens": 234465919.0, + "step": 9062 + }, + { + "epoch": 0.9952778387876126, + "grad_norm": 1.8191593885421753, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7320341467857361, + "num_tokens": 234489764.0, + "step": 9063 + }, + { + "epoch": 0.9953876564902262, + "grad_norm": 1.7525813579559326, + "learning_rate": 5e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7072809338569641, + "num_tokens": 234517607.0, + "step": 9064 + }, + { + "epoch": 0.9954974741928398, + "grad_norm": 1.7321479320526123, + "learning_rate": 5e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7306444644927979, + "num_tokens": 234544829.0, + "step": 9065 + }, + { + "epoch": 0.9956072918954535, + "grad_norm": 1.7202638387680054, + "learning_rate": 5e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6959551572799683, + "num_tokens": 234573369.0, + "step": 9066 + }, + { + "epoch": 0.9957171095980673, + "grad_norm": 1.6212170124053955, + "learning_rate": 5e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7111330032348633, + "num_tokens": 234601025.0, + "step": 9067 + }, + { + "epoch": 0.9958269273006809, + "grad_norm": 1.9470043182373047, + "learning_rate": 5e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7054507732391357, + "num_tokens": 234623899.0, + "step": 9068 + }, + { + "epoch": 0.9959367450032945, + "grad_norm": 1.9913711547851562, + "learning_rate": 5e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7064179182052612, + "num_tokens": 234646431.0, + "step": 9069 + }, + { + "epoch": 0.9960465627059082, + "grad_norm": 1.482978343963623, + "learning_rate": 5e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7013425230979919, + "num_tokens": 234680029.0, + "step": 9070 + }, + { + "epoch": 0.9961563804085218, + "grad_norm": 1.9865411520004272, + "learning_rate": 5e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7054216861724854, + "num_tokens": 234700323.0, + "step": 9071 + }, + { + "epoch": 0.9962661981111355, + "grad_norm": 1.737747311592102, + "learning_rate": 5e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7004094123840332, + "num_tokens": 234725455.0, + "step": 9072 + }, + { + "epoch": 0.9963760158137491, + "grad_norm": 1.806320071220398, + "learning_rate": 5e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6967744827270508, + "num_tokens": 234749119.0, + "step": 9073 + }, + { + "epoch": 0.9964858335163629, + "grad_norm": 1.5904884338378906, + "learning_rate": 5e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7104055881500244, + "num_tokens": 234778644.0, + "step": 9074 + }, + { + "epoch": 0.9965956512189765, + "grad_norm": 1.562180519104004, + "learning_rate": 5e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7048895359039307, + "num_tokens": 234812002.0, + "step": 9075 + }, + { + "epoch": 0.9967054689215902, + "grad_norm": 1.7196804285049438, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7172297239303589, + "num_tokens": 234836497.0, + "step": 9076 + }, + { + "epoch": 0.9968152866242038, + "grad_norm": 1.657918930053711, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7289591431617737, + "num_tokens": 234863742.0, + "step": 9077 + }, + { + "epoch": 0.9969251043268175, + "grad_norm": 1.5961267948150635, + "learning_rate": 5e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7216417193412781, + "num_tokens": 234892227.0, + "step": 9078 + }, + { + "epoch": 0.9970349220294311, + "grad_norm": 1.717987060546875, + "learning_rate": 5e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7061236500740051, + "num_tokens": 234918188.0, + "step": 9079 + }, + { + "epoch": 0.9971447397320448, + "grad_norm": 2.067819595336914, + "learning_rate": 5e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7180647850036621, + "num_tokens": 234936638.0, + "step": 9080 + }, + { + "epoch": 0.9972545574346585, + "grad_norm": 1.7162059545516968, + "learning_rate": 5e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.696129560470581, + "num_tokens": 234963677.0, + "step": 9081 + }, + { + "epoch": 0.9973643751372722, + "grad_norm": 1.7518670558929443, + "learning_rate": 5e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7172126770019531, + "num_tokens": 234988947.0, + "step": 9082 + }, + { + "epoch": 0.9974741928398858, + "grad_norm": 1.7656505107879639, + "learning_rate": 5e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.710871160030365, + "num_tokens": 235014357.0, + "step": 9083 + }, + { + "epoch": 0.9975840105424995, + "grad_norm": 1.5746928453445435, + "learning_rate": 5e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6946324110031128, + "num_tokens": 235043748.0, + "step": 9084 + }, + { + "epoch": 0.9976938282451131, + "grad_norm": 1.5956661701202393, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7187811136245728, + "num_tokens": 235072398.0, + "step": 9085 + }, + { + "epoch": 0.9978036459477267, + "grad_norm": 1.645338773727417, + "learning_rate": 5e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7177563905715942, + "num_tokens": 235098931.0, + "step": 9086 + }, + { + "epoch": 0.9979134636503404, + "grad_norm": 1.522969126701355, + "learning_rate": 5e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.703711986541748, + "num_tokens": 235129049.0, + "step": 9087 + }, + { + "epoch": 0.9980232813529542, + "grad_norm": 1.8466647863388062, + "learning_rate": 5e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7181048393249512, + "num_tokens": 235152094.0, + "step": 9088 + }, + { + "epoch": 0.9981330990555678, + "grad_norm": 1.558292269706726, + "learning_rate": 5e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6764501333236694, + "num_tokens": 235185910.0, + "step": 9089 + }, + { + "epoch": 0.9982429167581814, + "grad_norm": 1.9661024808883667, + "learning_rate": 5e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6963672637939453, + "num_tokens": 235206588.0, + "step": 9090 + }, + { + "epoch": 0.9983527344607951, + "grad_norm": 1.8129541873931885, + "learning_rate": 5e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.716376543045044, + "num_tokens": 235231206.0, + "step": 9091 + }, + { + "epoch": 0.9984625521634087, + "grad_norm": 1.8767601251602173, + "learning_rate": 5e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7532796859741211, + "num_tokens": 235250915.0, + "step": 9092 + }, + { + "epoch": 0.9985723698660224, + "grad_norm": 1.8958408832550049, + "learning_rate": 5e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.6968965530395508, + "num_tokens": 235272595.0, + "step": 9093 + }, + { + "epoch": 0.998682187568636, + "grad_norm": 1.7842854261398315, + "learning_rate": 5e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7354071140289307, + "num_tokens": 235294614.0, + "step": 9094 + }, + { + "epoch": 0.9987920052712497, + "grad_norm": 1.6466474533081055, + "learning_rate": 5e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7001343369483948, + "num_tokens": 235324726.0, + "step": 9095 + }, + { + "epoch": 0.9989018229738634, + "grad_norm": 1.7883824110031128, + "learning_rate": 5e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6992394924163818, + "num_tokens": 235352855.0, + "step": 9096 + }, + { + "epoch": 0.9990116406764771, + "grad_norm": 1.6547610759735107, + "learning_rate": 5e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7196611166000366, + "num_tokens": 235382262.0, + "step": 9097 + }, + { + "epoch": 0.9991214583790907, + "grad_norm": 1.9820539951324463, + "learning_rate": 5e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7096747159957886, + "num_tokens": 235401827.0, + "step": 9098 + }, + { + "epoch": 0.9992312760817044, + "grad_norm": 2.0614922046661377, + "learning_rate": 5e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7316116094589233, + "num_tokens": 235422125.0, + "step": 9099 + }, + { + "epoch": 0.999341093784318, + "grad_norm": 1.5963804721832275, + "learning_rate": 5e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7041854858398438, + "num_tokens": 235452628.0, + "step": 9100 + }, + { + "epoch": 0.9994509114869317, + "grad_norm": 1.8021358251571655, + "learning_rate": 5e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7194877862930298, + "num_tokens": 235477852.0, + "step": 9101 + }, + { + "epoch": 0.9995607291895453, + "grad_norm": 1.7131710052490234, + "learning_rate": 5e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7222055196762085, + "num_tokens": 235503267.0, + "step": 9102 + }, + { + "epoch": 0.9996705468921591, + "grad_norm": 1.767836332321167, + "learning_rate": 5e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.710461437702179, + "num_tokens": 235528087.0, + "step": 9103 + }, + { + "epoch": 0.9997803645947727, + "grad_norm": 1.696618676185608, + "learning_rate": 5e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6978757977485657, + "num_tokens": 235558911.0, + "step": 9104 + }, + { + "epoch": 0.9998901822973864, + "grad_norm": 1.6823890209197998, + "learning_rate": 5e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6996209621429443, + "num_tokens": 235589705.0, + "step": 9105 + }, + { + "epoch": 1.0, + "grad_norm": 1.5450769662857056, + "learning_rate": 5e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7227821350097656, + "num_tokens": 235621186.0, + "step": 9106 + }, + { + "epoch": 1.0001098177026138, + "grad_norm": 1.7376925945281982, + "learning_rate": 5e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7341462969779968, + "num_tokens": 235646666.0, + "step": 9107 + }, + { + "epoch": 1.0002196354052273, + "grad_norm": 1.9052225351333618, + "learning_rate": 5e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7368420958518982, + "num_tokens": 235668457.0, + "step": 9108 + }, + { + "epoch": 1.000329453107841, + "grad_norm": 1.695934534072876, + "learning_rate": 5e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7342594861984253, + "num_tokens": 235695829.0, + "step": 9109 + }, + { + "epoch": 1.0004392708104546, + "grad_norm": 1.8803273439407349, + "learning_rate": 5e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7562651038169861, + "num_tokens": 235717772.0, + "step": 9110 + }, + { + "epoch": 1.0005490885130683, + "grad_norm": 1.9816460609436035, + "learning_rate": 5e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.763132631778717, + "num_tokens": 235736464.0, + "step": 9111 + }, + { + "epoch": 1.0006589062156819, + "grad_norm": 1.745455265045166, + "learning_rate": 5e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7516452074050903, + "num_tokens": 235760808.0, + "step": 9112 + }, + { + "epoch": 1.0007687239182956, + "grad_norm": 1.6975749731063843, + "learning_rate": 5e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7354882955551147, + "num_tokens": 235787768.0, + "step": 9113 + }, + { + "epoch": 1.0008785416209094, + "grad_norm": 1.9047411680221558, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7447060346603394, + "num_tokens": 235814182.0, + "step": 9114 + }, + { + "epoch": 1.000988359323523, + "grad_norm": 1.8001614809036255, + "learning_rate": 5e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7289506196975708, + "num_tokens": 235842188.0, + "step": 9115 + }, + { + "epoch": 1.0010981770261367, + "grad_norm": 1.9373221397399902, + "learning_rate": 5e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7594331502914429, + "num_tokens": 235867281.0, + "step": 9116 + }, + { + "epoch": 1.0012079947287502, + "grad_norm": 2.158885955810547, + "learning_rate": 5e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.731151819229126, + "num_tokens": 235889468.0, + "step": 9117 + }, + { + "epoch": 1.001317812431364, + "grad_norm": 2.0820682048797607, + "learning_rate": 5e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7255024909973145, + "num_tokens": 235911158.0, + "step": 9118 + }, + { + "epoch": 1.0014276301339775, + "grad_norm": 2.0322821140289307, + "learning_rate": 5e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7060753107070923, + "num_tokens": 235939146.0, + "step": 9119 + }, + { + "epoch": 1.0015374478365913, + "grad_norm": 1.778518557548523, + "learning_rate": 5e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7309655547142029, + "num_tokens": 235966244.0, + "step": 9120 + }, + { + "epoch": 1.001647265539205, + "grad_norm": 1.9211947917938232, + "learning_rate": 5e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7198175191879272, + "num_tokens": 235991984.0, + "step": 9121 + }, + { + "epoch": 1.0017570832418186, + "grad_norm": 2.035137414932251, + "learning_rate": 5e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7452257871627808, + "num_tokens": 236014620.0, + "step": 9122 + }, + { + "epoch": 1.0018669009444323, + "grad_norm": 2.2601664066314697, + "learning_rate": 5e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7490841150283813, + "num_tokens": 236034633.0, + "step": 9123 + }, + { + "epoch": 1.0019767186470458, + "grad_norm": 1.7972700595855713, + "learning_rate": 5e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7208675146102905, + "num_tokens": 236063955.0, + "step": 9124 + }, + { + "epoch": 1.0020865363496596, + "grad_norm": 1.9469465017318726, + "learning_rate": 5e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7272993922233582, + "num_tokens": 236088746.0, + "step": 9125 + }, + { + "epoch": 1.0021963540522731, + "grad_norm": 1.9983816146850586, + "learning_rate": 5e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7499229907989502, + "num_tokens": 236108363.0, + "step": 9126 + }, + { + "epoch": 1.002306171754887, + "grad_norm": 1.9252805709838867, + "learning_rate": 5e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7502321004867554, + "num_tokens": 236131376.0, + "step": 9127 + }, + { + "epoch": 1.0024159894575007, + "grad_norm": 1.6317901611328125, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7354401350021362, + "num_tokens": 236162012.0, + "step": 9128 + }, + { + "epoch": 1.0025258071601142, + "grad_norm": 1.841691255569458, + "learning_rate": 5e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7310998439788818, + "num_tokens": 236188748.0, + "step": 9129 + }, + { + "epoch": 1.002635624862728, + "grad_norm": 2.261049270629883, + "learning_rate": 5e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7347177267074585, + "num_tokens": 236207730.0, + "step": 9130 + }, + { + "epoch": 1.0027454425653415, + "grad_norm": 1.9757452011108398, + "learning_rate": 5e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7475844621658325, + "num_tokens": 236230451.0, + "step": 9131 + }, + { + "epoch": 1.0028552602679552, + "grad_norm": 2.1989669799804688, + "learning_rate": 5e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7569478154182434, + "num_tokens": 236251955.0, + "step": 9132 + }, + { + "epoch": 1.0029650779705688, + "grad_norm": 2.1175079345703125, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7371119856834412, + "num_tokens": 236270388.0, + "step": 9133 + }, + { + "epoch": 1.0030748956731825, + "grad_norm": 1.9745187759399414, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7472516298294067, + "num_tokens": 236292336.0, + "step": 9134 + }, + { + "epoch": 1.0031847133757963, + "grad_norm": 2.1757864952087402, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7365041375160217, + "num_tokens": 236315469.0, + "step": 9135 + }, + { + "epoch": 1.0032945310784098, + "grad_norm": 1.752363681793213, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7216300368309021, + "num_tokens": 236343775.0, + "step": 9136 + }, + { + "epoch": 1.0034043487810236, + "grad_norm": 1.872541069984436, + "learning_rate": 5e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7343136072158813, + "num_tokens": 236372968.0, + "step": 9137 + }, + { + "epoch": 1.0035141664836371, + "grad_norm": 1.7760158777236938, + "learning_rate": 5e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7122474908828735, + "num_tokens": 236400970.0, + "step": 9138 + }, + { + "epoch": 1.0036239841862509, + "grad_norm": 1.818800687789917, + "learning_rate": 5e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.7446292042732239, + "num_tokens": 236428631.0, + "step": 9139 + }, + { + "epoch": 1.0037338018888644, + "grad_norm": 1.8491979837417603, + "learning_rate": 5e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7615075707435608, + "num_tokens": 236453252.0, + "step": 9140 + }, + { + "epoch": 1.0038436195914782, + "grad_norm": 1.6947864294052124, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.723081111907959, + "num_tokens": 236484888.0, + "step": 9141 + }, + { + "epoch": 1.0039534372940917, + "grad_norm": 2.0882301330566406, + "learning_rate": 5e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.731934666633606, + "num_tokens": 236506126.0, + "step": 9142 + }, + { + "epoch": 1.0040632549967055, + "grad_norm": 1.987815499305725, + "learning_rate": 5e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7441426515579224, + "num_tokens": 236530016.0, + "step": 9143 + }, + { + "epoch": 1.0041730726993192, + "grad_norm": 2.135385513305664, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7617669701576233, + "num_tokens": 236551012.0, + "step": 9144 + }, + { + "epoch": 1.0042828904019327, + "grad_norm": 1.9715635776519775, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7447878122329712, + "num_tokens": 236574684.0, + "step": 9145 + }, + { + "epoch": 1.0043927081045465, + "grad_norm": 1.9451959133148193, + "learning_rate": 5e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7176913619041443, + "num_tokens": 236600804.0, + "step": 9146 + }, + { + "epoch": 1.00450252580716, + "grad_norm": 1.9781172275543213, + "learning_rate": 5e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7489616870880127, + "num_tokens": 236622636.0, + "step": 9147 + }, + { + "epoch": 1.0046123435097738, + "grad_norm": 1.7011663913726807, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7468134164810181, + "num_tokens": 236652669.0, + "step": 9148 + }, + { + "epoch": 1.0047221612123873, + "grad_norm": 1.8306454420089722, + "learning_rate": 5e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7465604543685913, + "num_tokens": 236677706.0, + "step": 9149 + }, + { + "epoch": 1.004831978915001, + "grad_norm": 1.7634474039077759, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7394943833351135, + "num_tokens": 236704804.0, + "step": 9150 + }, + { + "epoch": 1.0049417966176148, + "grad_norm": 1.7512576580047607, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7156404256820679, + "num_tokens": 236735458.0, + "step": 9151 + }, + { + "epoch": 1.0050516143202284, + "grad_norm": 1.7394568920135498, + "learning_rate": 5e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7541666030883789, + "num_tokens": 236764401.0, + "step": 9152 + }, + { + "epoch": 1.0051614320228421, + "grad_norm": 1.8446704149246216, + "learning_rate": 5e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7184406518936157, + "num_tokens": 236790359.0, + "step": 9153 + }, + { + "epoch": 1.0052712497254557, + "grad_norm": 1.761986255645752, + "learning_rate": 5e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7268019914627075, + "num_tokens": 236817432.0, + "step": 9154 + }, + { + "epoch": 1.0053810674280694, + "grad_norm": 1.8081767559051514, + "learning_rate": 5e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7263698577880859, + "num_tokens": 236843844.0, + "step": 9155 + }, + { + "epoch": 1.005490885130683, + "grad_norm": 2.1052253246307373, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7571321725845337, + "num_tokens": 236863104.0, + "step": 9156 + }, + { + "epoch": 1.0056007028332967, + "grad_norm": 2.068464994430542, + "learning_rate": 5e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7300785779953003, + "num_tokens": 236886701.0, + "step": 9157 + }, + { + "epoch": 1.0057105205359105, + "grad_norm": 1.9607818126678467, + "learning_rate": 5e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7273684144020081, + "num_tokens": 236911430.0, + "step": 9158 + }, + { + "epoch": 1.005820338238524, + "grad_norm": 1.708673357963562, + "learning_rate": 5e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7244997024536133, + "num_tokens": 236941111.0, + "step": 9159 + }, + { + "epoch": 1.0059301559411378, + "grad_norm": 2.051197052001953, + "learning_rate": 5e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7316958904266357, + "num_tokens": 236963399.0, + "step": 9160 + }, + { + "epoch": 1.0060399736437513, + "grad_norm": 1.8660287857055664, + "learning_rate": 5e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7393327355384827, + "num_tokens": 236989600.0, + "step": 9161 + }, + { + "epoch": 1.006149791346365, + "grad_norm": 1.7064709663391113, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7304468154907227, + "num_tokens": 237020227.0, + "step": 9162 + }, + { + "epoch": 1.0062596090489786, + "grad_norm": 1.8543380498886108, + "learning_rate": 5e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.7619212865829468, + "num_tokens": 237042215.0, + "step": 9163 + }, + { + "epoch": 1.0063694267515924, + "grad_norm": 1.5067490339279175, + "learning_rate": 5e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7115679383277893, + "num_tokens": 237078981.0, + "step": 9164 + }, + { + "epoch": 1.0064792444542061, + "grad_norm": 2.125182867050171, + "learning_rate": 5e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.741169810295105, + "num_tokens": 237100039.0, + "step": 9165 + }, + { + "epoch": 1.0065890621568196, + "grad_norm": 2.5049843788146973, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7482339143753052, + "num_tokens": 237115433.0, + "step": 9166 + }, + { + "epoch": 1.0066988798594334, + "grad_norm": 1.8761800527572632, + "learning_rate": 5e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7267894744873047, + "num_tokens": 237140606.0, + "step": 9167 + }, + { + "epoch": 1.006808697562047, + "grad_norm": 1.8359689712524414, + "learning_rate": 5e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7430597543716431, + "num_tokens": 237164974.0, + "step": 9168 + }, + { + "epoch": 1.0069185152646607, + "grad_norm": 1.828884482383728, + "learning_rate": 5e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7363716959953308, + "num_tokens": 237189630.0, + "step": 9169 + }, + { + "epoch": 1.0070283329672742, + "grad_norm": 2.025860548019409, + "learning_rate": 5e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7367956638336182, + "num_tokens": 237209524.0, + "step": 9170 + }, + { + "epoch": 1.007138150669888, + "grad_norm": 1.9859364032745361, + "learning_rate": 5e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7278957366943359, + "num_tokens": 237233168.0, + "step": 9171 + }, + { + "epoch": 1.0072479683725017, + "grad_norm": 1.7077255249023438, + "learning_rate": 5e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7453410625457764, + "num_tokens": 237266816.0, + "step": 9172 + }, + { + "epoch": 1.0073577860751153, + "grad_norm": 1.9243502616882324, + "learning_rate": 5e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7190051078796387, + "num_tokens": 237296169.0, + "step": 9173 + }, + { + "epoch": 1.007467603777729, + "grad_norm": 2.0285403728485107, + "learning_rate": 5e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7126585245132446, + "num_tokens": 237320887.0, + "step": 9174 + }, + { + "epoch": 1.0075774214803426, + "grad_norm": 1.9284734725952148, + "learning_rate": 5e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7395604848861694, + "num_tokens": 237345536.0, + "step": 9175 + }, + { + "epoch": 1.0076872391829563, + "grad_norm": 1.7416075468063354, + "learning_rate": 5e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.749241292476654, + "num_tokens": 237372883.0, + "step": 9176 + }, + { + "epoch": 1.0077970568855699, + "grad_norm": 1.736918568611145, + "learning_rate": 5e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7037839293479919, + "num_tokens": 237404443.0, + "step": 9177 + }, + { + "epoch": 1.0079068745881836, + "grad_norm": 1.7793073654174805, + "learning_rate": 5e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.744328498840332, + "num_tokens": 237437351.0, + "step": 9178 + }, + { + "epoch": 1.0080166922907974, + "grad_norm": 1.7055755853652954, + "learning_rate": 5e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7124407887458801, + "num_tokens": 237464776.0, + "step": 9179 + }, + { + "epoch": 1.008126509993411, + "grad_norm": 1.938363790512085, + "learning_rate": 5e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7407016158103943, + "num_tokens": 237488382.0, + "step": 9180 + }, + { + "epoch": 1.0082363276960247, + "grad_norm": 2.1427865028381348, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7467448711395264, + "num_tokens": 237508412.0, + "step": 9181 + }, + { + "epoch": 1.0083461453986382, + "grad_norm": 2.067625045776367, + "learning_rate": 5e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7385236024856567, + "num_tokens": 237530312.0, + "step": 9182 + }, + { + "epoch": 1.008455963101252, + "grad_norm": 2.0244712829589844, + "learning_rate": 5e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7071106433868408, + "num_tokens": 237558943.0, + "step": 9183 + }, + { + "epoch": 1.0085657808038655, + "grad_norm": 1.9061691761016846, + "learning_rate": 5e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7264968156814575, + "num_tokens": 237585542.0, + "step": 9184 + }, + { + "epoch": 1.0086755985064793, + "grad_norm": 1.9876223802566528, + "learning_rate": 5e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7420516610145569, + "num_tokens": 237609840.0, + "step": 9185 + }, + { + "epoch": 1.008785416209093, + "grad_norm": 1.870076298713684, + "learning_rate": 5e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7624647617340088, + "num_tokens": 237633775.0, + "step": 9186 + }, + { + "epoch": 1.0088952339117065, + "grad_norm": 1.8018474578857422, + "learning_rate": 5e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7504931092262268, + "num_tokens": 237658819.0, + "step": 9187 + }, + { + "epoch": 1.0090050516143203, + "grad_norm": 1.9526077508926392, + "learning_rate": 5e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7407461404800415, + "num_tokens": 237682789.0, + "step": 9188 + }, + { + "epoch": 1.0091148693169338, + "grad_norm": 1.782148838043213, + "learning_rate": 5e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7219651937484741, + "num_tokens": 237710797.0, + "step": 9189 + }, + { + "epoch": 1.0092246870195476, + "grad_norm": 1.8780672550201416, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7327927350997925, + "num_tokens": 237735544.0, + "step": 9190 + }, + { + "epoch": 1.0093345047221611, + "grad_norm": 1.8635673522949219, + "learning_rate": 5e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7469300627708435, + "num_tokens": 237759163.0, + "step": 9191 + }, + { + "epoch": 1.0094443224247749, + "grad_norm": 1.7912546396255493, + "learning_rate": 5e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.6981233358383179, + "num_tokens": 237786570.0, + "step": 9192 + }, + { + "epoch": 1.0095541401273886, + "grad_norm": 1.677432894706726, + "learning_rate": 5e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7453826665878296, + "num_tokens": 237815019.0, + "step": 9193 + }, + { + "epoch": 1.0096639578300022, + "grad_norm": 1.7611132860183716, + "learning_rate": 5e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7214102149009705, + "num_tokens": 237841618.0, + "step": 9194 + }, + { + "epoch": 1.009773775532616, + "grad_norm": 1.9871946573257446, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7349553108215332, + "num_tokens": 237865770.0, + "step": 9195 + }, + { + "epoch": 1.0098835932352295, + "grad_norm": 1.7467728853225708, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7336450815200806, + "num_tokens": 237895742.0, + "step": 9196 + }, + { + "epoch": 1.0099934109378432, + "grad_norm": 1.873796820640564, + "learning_rate": 5e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.704321026802063, + "num_tokens": 237922084.0, + "step": 9197 + }, + { + "epoch": 1.0101032286404568, + "grad_norm": 1.8773893117904663, + "learning_rate": 5e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7260181903839111, + "num_tokens": 237946362.0, + "step": 9198 + }, + { + "epoch": 1.0102130463430705, + "grad_norm": 1.8783597946166992, + "learning_rate": 5e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7439596652984619, + "num_tokens": 237972419.0, + "step": 9199 + }, + { + "epoch": 1.010322864045684, + "grad_norm": 2.012033224105835, + "learning_rate": 5e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7444499135017395, + "num_tokens": 237993457.0, + "step": 9200 + }, + { + "epoch": 1.0104326817482978, + "grad_norm": 1.9547452926635742, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7559120059013367, + "num_tokens": 238017141.0, + "step": 9201 + }, + { + "epoch": 1.0105424994509116, + "grad_norm": 1.8853082656860352, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7518248558044434, + "num_tokens": 238040212.0, + "step": 9202 + }, + { + "epoch": 1.010652317153525, + "grad_norm": 1.7262401580810547, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7216804623603821, + "num_tokens": 238071110.0, + "step": 9203 + }, + { + "epoch": 1.0107621348561389, + "grad_norm": 1.899600625038147, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7346851825714111, + "num_tokens": 238096775.0, + "step": 9204 + }, + { + "epoch": 1.0108719525587524, + "grad_norm": 2.0167951583862305, + "learning_rate": 5e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.7545442581176758, + "num_tokens": 238117433.0, + "step": 9205 + }, + { + "epoch": 1.0109817702613662, + "grad_norm": 1.7864826917648315, + "learning_rate": 5e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.738578200340271, + "num_tokens": 238144871.0, + "step": 9206 + }, + { + "epoch": 1.0110915879639797, + "grad_norm": 1.8793002367019653, + "learning_rate": 5e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.753607988357544, + "num_tokens": 238169552.0, + "step": 9207 + }, + { + "epoch": 1.0112014056665934, + "grad_norm": 1.8519073724746704, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7163584232330322, + "num_tokens": 238196014.0, + "step": 9208 + }, + { + "epoch": 1.0113112233692072, + "grad_norm": 1.9732308387756348, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7308152914047241, + "num_tokens": 238218619.0, + "step": 9209 + }, + { + "epoch": 1.0114210410718207, + "grad_norm": 1.9041281938552856, + "learning_rate": 5e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7345749139785767, + "num_tokens": 238244688.0, + "step": 9210 + }, + { + "epoch": 1.0115308587744345, + "grad_norm": 2.069796085357666, + "learning_rate": 5e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7399570941925049, + "num_tokens": 238267560.0, + "step": 9211 + }, + { + "epoch": 1.011640676477048, + "grad_norm": 1.6762562990188599, + "learning_rate": 5e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7261008024215698, + "num_tokens": 238299601.0, + "step": 9212 + }, + { + "epoch": 1.0117504941796618, + "grad_norm": 1.685805320739746, + "learning_rate": 5e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7466492652893066, + "num_tokens": 238330691.0, + "step": 9213 + }, + { + "epoch": 1.0118603118822753, + "grad_norm": 2.2997524738311768, + "learning_rate": 5e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7467677593231201, + "num_tokens": 238353009.0, + "step": 9214 + }, + { + "epoch": 1.011970129584889, + "grad_norm": 1.9898629188537598, + "learning_rate": 5e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7333137392997742, + "num_tokens": 238373868.0, + "step": 9215 + }, + { + "epoch": 1.0120799472875028, + "grad_norm": 1.7438290119171143, + "learning_rate": 5e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7661013603210449, + "num_tokens": 238397689.0, + "step": 9216 + }, + { + "epoch": 1.0121897649901164, + "grad_norm": 1.8925247192382812, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7253090739250183, + "num_tokens": 238423355.0, + "step": 9217 + }, + { + "epoch": 1.0122995826927301, + "grad_norm": 1.912916660308838, + "learning_rate": 5e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7389665842056274, + "num_tokens": 238448008.0, + "step": 9218 + }, + { + "epoch": 1.0124094003953437, + "grad_norm": 1.8369828462600708, + "learning_rate": 5e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7194505333900452, + "num_tokens": 238473069.0, + "step": 9219 + }, + { + "epoch": 1.0125192180979574, + "grad_norm": 1.8887507915496826, + "learning_rate": 5e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7483621835708618, + "num_tokens": 238495771.0, + "step": 9220 + }, + { + "epoch": 1.012629035800571, + "grad_norm": 1.9177016019821167, + "learning_rate": 5e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7300746440887451, + "num_tokens": 238522457.0, + "step": 9221 + }, + { + "epoch": 1.0127388535031847, + "grad_norm": 2.052412986755371, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7382293939590454, + "num_tokens": 238546580.0, + "step": 9222 + }, + { + "epoch": 1.0128486712057985, + "grad_norm": 1.7771453857421875, + "learning_rate": 5e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7511610984802246, + "num_tokens": 238575703.0, + "step": 9223 + }, + { + "epoch": 1.012958488908412, + "grad_norm": 2.1700356006622314, + "learning_rate": 5e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7259379029273987, + "num_tokens": 238599514.0, + "step": 9224 + }, + { + "epoch": 1.0130683066110258, + "grad_norm": 1.8832868337631226, + "learning_rate": 5e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7507573962211609, + "num_tokens": 238623690.0, + "step": 9225 + }, + { + "epoch": 1.0131781243136393, + "grad_norm": 1.7569847106933594, + "learning_rate": 5e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7377103567123413, + "num_tokens": 238649432.0, + "step": 9226 + }, + { + "epoch": 1.013287942016253, + "grad_norm": 1.9731340408325195, + "learning_rate": 5e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7642160058021545, + "num_tokens": 238672350.0, + "step": 9227 + }, + { + "epoch": 1.0133977597188666, + "grad_norm": 1.7934331893920898, + "learning_rate": 5e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7317088842391968, + "num_tokens": 238697559.0, + "step": 9228 + }, + { + "epoch": 1.0135075774214803, + "grad_norm": 1.9756594896316528, + "learning_rate": 5e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7408796548843384, + "num_tokens": 238720424.0, + "step": 9229 + }, + { + "epoch": 1.013617395124094, + "grad_norm": 2.0721869468688965, + "learning_rate": 5e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.756112813949585, + "num_tokens": 238740407.0, + "step": 9230 + }, + { + "epoch": 1.0137272128267076, + "grad_norm": 1.9204355478286743, + "learning_rate": 5e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7449678778648376, + "num_tokens": 238765772.0, + "step": 9231 + }, + { + "epoch": 1.0138370305293214, + "grad_norm": 1.966015100479126, + "learning_rate": 5e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.731455385684967, + "num_tokens": 238788924.0, + "step": 9232 + }, + { + "epoch": 1.013946848231935, + "grad_norm": 1.7262235879898071, + "learning_rate": 5e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7201746702194214, + "num_tokens": 238822537.0, + "step": 9233 + }, + { + "epoch": 1.0140566659345487, + "grad_norm": 2.396641492843628, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7350910902023315, + "num_tokens": 238840567.0, + "step": 9234 + }, + { + "epoch": 1.0141664836371622, + "grad_norm": 2.1848509311676025, + "learning_rate": 5e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7266542315483093, + "num_tokens": 238863940.0, + "step": 9235 + }, + { + "epoch": 1.014276301339776, + "grad_norm": 2.027726888656616, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7208572626113892, + "num_tokens": 238888712.0, + "step": 9236 + }, + { + "epoch": 1.0143861190423897, + "grad_norm": 1.924107313156128, + "learning_rate": 5e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7184836864471436, + "num_tokens": 238914728.0, + "step": 9237 + }, + { + "epoch": 1.0144959367450033, + "grad_norm": 1.773478627204895, + "learning_rate": 5e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7226982712745667, + "num_tokens": 238944761.0, + "step": 9238 + }, + { + "epoch": 1.014605754447617, + "grad_norm": 1.7465872764587402, + "learning_rate": 5e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7556349635124207, + "num_tokens": 238969621.0, + "step": 9239 + }, + { + "epoch": 1.0147155721502306, + "grad_norm": 1.7668647766113281, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7226511836051941, + "num_tokens": 238996966.0, + "step": 9240 + }, + { + "epoch": 1.0148253898528443, + "grad_norm": 1.6917349100112915, + "learning_rate": 5e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7390938401222229, + "num_tokens": 239023325.0, + "step": 9241 + }, + { + "epoch": 1.0149352075554579, + "grad_norm": 1.7831718921661377, + "learning_rate": 5e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7318569421768188, + "num_tokens": 239050258.0, + "step": 9242 + }, + { + "epoch": 1.0150450252580716, + "grad_norm": 1.7838321924209595, + "learning_rate": 5e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.6950751543045044, + "num_tokens": 239082615.0, + "step": 9243 + }, + { + "epoch": 1.0151548429606854, + "grad_norm": 1.7585837841033936, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7275218367576599, + "num_tokens": 239112181.0, + "step": 9244 + }, + { + "epoch": 1.015264660663299, + "grad_norm": 1.7927583456039429, + "learning_rate": 5e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7159332036972046, + "num_tokens": 239137882.0, + "step": 9245 + }, + { + "epoch": 1.0153744783659127, + "grad_norm": 1.778048038482666, + "learning_rate": 5e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7036352157592773, + "num_tokens": 239170136.0, + "step": 9246 + }, + { + "epoch": 1.0154842960685262, + "grad_norm": 1.6064461469650269, + "learning_rate": 5e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7450408339500427, + "num_tokens": 239202812.0, + "step": 9247 + }, + { + "epoch": 1.01559411377114, + "grad_norm": 1.8763669729232788, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7428164482116699, + "num_tokens": 239227252.0, + "step": 9248 + }, + { + "epoch": 1.0157039314737535, + "grad_norm": 1.7313711643218994, + "learning_rate": 5e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7306724190711975, + "num_tokens": 239254384.0, + "step": 9249 + }, + { + "epoch": 1.0158137491763672, + "grad_norm": 1.689940094947815, + "learning_rate": 5e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7333179116249084, + "num_tokens": 239283975.0, + "step": 9250 + }, + { + "epoch": 1.015923566878981, + "grad_norm": 1.957495093345642, + "learning_rate": 5e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7236565351486206, + "num_tokens": 239308187.0, + "step": 9251 + }, + { + "epoch": 1.0160333845815945, + "grad_norm": 1.8495655059814453, + "learning_rate": 5e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7357300519943237, + "num_tokens": 239335050.0, + "step": 9252 + }, + { + "epoch": 1.0161432022842083, + "grad_norm": 2.0592501163482666, + "learning_rate": 5e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7618243098258972, + "num_tokens": 239355794.0, + "step": 9253 + }, + { + "epoch": 1.0162530199868218, + "grad_norm": 1.7992428541183472, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7392366528511047, + "num_tokens": 239383031.0, + "step": 9254 + }, + { + "epoch": 1.0163628376894356, + "grad_norm": 1.916906714439392, + "learning_rate": 5e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7212480306625366, + "num_tokens": 239410316.0, + "step": 9255 + }, + { + "epoch": 1.0164726553920491, + "grad_norm": 1.8821581602096558, + "learning_rate": 5e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7472810745239258, + "num_tokens": 239435704.0, + "step": 9256 + }, + { + "epoch": 1.0165824730946629, + "grad_norm": 1.644031286239624, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7277653217315674, + "num_tokens": 239469243.0, + "step": 9257 + }, + { + "epoch": 1.0166922907972764, + "grad_norm": 1.7557674646377563, + "learning_rate": 5e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7456294894218445, + "num_tokens": 239495853.0, + "step": 9258 + }, + { + "epoch": 1.0168021084998902, + "grad_norm": 1.830344796180725, + "learning_rate": 5e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7362828254699707, + "num_tokens": 239520894.0, + "step": 9259 + }, + { + "epoch": 1.016911926202504, + "grad_norm": 1.8678643703460693, + "learning_rate": 5e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7487623691558838, + "num_tokens": 239544547.0, + "step": 9260 + }, + { + "epoch": 1.0170217439051175, + "grad_norm": 1.9227579832077026, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7304445505142212, + "num_tokens": 239568725.0, + "step": 9261 + }, + { + "epoch": 1.0171315616077312, + "grad_norm": 1.7717821598052979, + "learning_rate": 5e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.761879563331604, + "num_tokens": 239594270.0, + "step": 9262 + }, + { + "epoch": 1.0172413793103448, + "grad_norm": 1.6823939085006714, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7329290509223938, + "num_tokens": 239622890.0, + "step": 9263 + }, + { + "epoch": 1.0173511970129585, + "grad_norm": 2.00168776512146, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7406831979751587, + "num_tokens": 239644756.0, + "step": 9264 + }, + { + "epoch": 1.017461014715572, + "grad_norm": 1.7816821336746216, + "learning_rate": 5e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7497868537902832, + "num_tokens": 239672004.0, + "step": 9265 + }, + { + "epoch": 1.0175708324181858, + "grad_norm": 1.7762969732284546, + "learning_rate": 5e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.730444073677063, + "num_tokens": 239700542.0, + "step": 9266 + }, + { + "epoch": 1.0176806501207996, + "grad_norm": 1.8751240968704224, + "learning_rate": 5e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7351090908050537, + "num_tokens": 239728419.0, + "step": 9267 + }, + { + "epoch": 1.017790467823413, + "grad_norm": 2.0402214527130127, + "learning_rate": 5e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7374050617218018, + "num_tokens": 239752269.0, + "step": 9268 + }, + { + "epoch": 1.0179002855260268, + "grad_norm": 2.0281333923339844, + "learning_rate": 5e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7472026944160461, + "num_tokens": 239775925.0, + "step": 9269 + }, + { + "epoch": 1.0180101032286404, + "grad_norm": 1.9355884790420532, + "learning_rate": 5e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7466404438018799, + "num_tokens": 239800529.0, + "step": 9270 + }, + { + "epoch": 1.0181199209312541, + "grad_norm": 1.9882895946502686, + "learning_rate": 5e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7529707551002502, + "num_tokens": 239822285.0, + "step": 9271 + }, + { + "epoch": 1.0182297386338677, + "grad_norm": 1.9118990898132324, + "learning_rate": 5e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7329592704772949, + "num_tokens": 239847393.0, + "step": 9272 + }, + { + "epoch": 1.0183395563364814, + "grad_norm": 1.8290168046951294, + "learning_rate": 5e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7542970180511475, + "num_tokens": 239871964.0, + "step": 9273 + }, + { + "epoch": 1.0184493740390952, + "grad_norm": 2.0172159671783447, + "learning_rate": 5e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7490569353103638, + "num_tokens": 239896611.0, + "step": 9274 + }, + { + "epoch": 1.0185591917417087, + "grad_norm": 1.8458443880081177, + "learning_rate": 5e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7341231107711792, + "num_tokens": 239924521.0, + "step": 9275 + }, + { + "epoch": 1.0186690094443225, + "grad_norm": 1.7811474800109863, + "learning_rate": 5e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7245490550994873, + "num_tokens": 239953163.0, + "step": 9276 + }, + { + "epoch": 1.018778827146936, + "grad_norm": 1.7111365795135498, + "learning_rate": 5e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6851329803466797, + "num_tokens": 239987165.0, + "step": 9277 + }, + { + "epoch": 1.0188886448495498, + "grad_norm": 2.049983024597168, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7362821102142334, + "num_tokens": 240008858.0, + "step": 9278 + }, + { + "epoch": 1.0189984625521633, + "grad_norm": 1.6692845821380615, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7463483810424805, + "num_tokens": 240040022.0, + "step": 9279 + }, + { + "epoch": 1.019108280254777, + "grad_norm": 1.783262848854065, + "learning_rate": 5e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6980286240577698, + "num_tokens": 240071278.0, + "step": 9280 + }, + { + "epoch": 1.0192180979573908, + "grad_norm": 1.9951326847076416, + "learning_rate": 5e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7344259023666382, + "num_tokens": 240092763.0, + "step": 9281 + }, + { + "epoch": 1.0193279156600044, + "grad_norm": 1.851619839668274, + "learning_rate": 5e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.743024468421936, + "num_tokens": 240119571.0, + "step": 9282 + }, + { + "epoch": 1.0194377333626181, + "grad_norm": 1.988152265548706, + "learning_rate": 5e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7539158463478088, + "num_tokens": 240140881.0, + "step": 9283 + }, + { + "epoch": 1.0195475510652316, + "grad_norm": 2.500232696533203, + "learning_rate": 5e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7221316695213318, + "num_tokens": 240171801.0, + "step": 9284 + }, + { + "epoch": 1.0196573687678454, + "grad_norm": 1.7623279094696045, + "learning_rate": 5e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.735325813293457, + "num_tokens": 240197592.0, + "step": 9285 + }, + { + "epoch": 1.019767186470459, + "grad_norm": 1.787997841835022, + "learning_rate": 5e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7388952374458313, + "num_tokens": 240223244.0, + "step": 9286 + }, + { + "epoch": 1.0198770041730727, + "grad_norm": 1.7712361812591553, + "learning_rate": 5e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7243417501449585, + "num_tokens": 240250872.0, + "step": 9287 + }, + { + "epoch": 1.0199868218756865, + "grad_norm": 2.029946804046631, + "learning_rate": 5e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7342015504837036, + "num_tokens": 240278472.0, + "step": 9288 + }, + { + "epoch": 1.0200966395783, + "grad_norm": 1.9686037302017212, + "learning_rate": 5e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7228456735610962, + "num_tokens": 240301956.0, + "step": 9289 + }, + { + "epoch": 1.0202064572809137, + "grad_norm": 1.8233221769332886, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7329534292221069, + "num_tokens": 240328492.0, + "step": 9290 + }, + { + "epoch": 1.0203162749835273, + "grad_norm": 2.002554416656494, + "learning_rate": 5e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7358272075653076, + "num_tokens": 240352405.0, + "step": 9291 + }, + { + "epoch": 1.020426092686141, + "grad_norm": 1.8685134649276733, + "learning_rate": 5e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7512099742889404, + "num_tokens": 240376487.0, + "step": 9292 + }, + { + "epoch": 1.0205359103887546, + "grad_norm": 1.8606200218200684, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7457699775695801, + "num_tokens": 240400833.0, + "step": 9293 + }, + { + "epoch": 1.0206457280913683, + "grad_norm": 1.8153184652328491, + "learning_rate": 5e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7245662808418274, + "num_tokens": 240428082.0, + "step": 9294 + }, + { + "epoch": 1.020755545793982, + "grad_norm": 1.7818385362625122, + "learning_rate": 5e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7430938482284546, + "num_tokens": 240454540.0, + "step": 9295 + }, + { + "epoch": 1.0208653634965956, + "grad_norm": 1.7249282598495483, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7542340755462646, + "num_tokens": 240481162.0, + "step": 9296 + }, + { + "epoch": 1.0209751811992094, + "grad_norm": 1.8868451118469238, + "learning_rate": 5e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7200263142585754, + "num_tokens": 240508719.0, + "step": 9297 + }, + { + "epoch": 1.021084998901823, + "grad_norm": 1.9257688522338867, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7324298024177551, + "num_tokens": 240534065.0, + "step": 9298 + }, + { + "epoch": 1.0211948166044367, + "grad_norm": 2.3738582134246826, + "learning_rate": 5e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7554470300674438, + "num_tokens": 240551159.0, + "step": 9299 + }, + { + "epoch": 1.0213046343070502, + "grad_norm": 1.7504794597625732, + "learning_rate": 5e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.7562752366065979, + "num_tokens": 240575819.0, + "step": 9300 + }, + { + "epoch": 1.021414452009664, + "grad_norm": 1.9605149030685425, + "learning_rate": 5e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7162967324256897, + "num_tokens": 240601232.0, + "step": 9301 + }, + { + "epoch": 1.0215242697122777, + "grad_norm": 1.8385552167892456, + "learning_rate": 5e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7295061945915222, + "num_tokens": 240626012.0, + "step": 9302 + }, + { + "epoch": 1.0216340874148913, + "grad_norm": 1.9312868118286133, + "learning_rate": 5e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7348854541778564, + "num_tokens": 240651329.0, + "step": 9303 + }, + { + "epoch": 1.021743905117505, + "grad_norm": 1.8206079006195068, + "learning_rate": 5e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7590741515159607, + "num_tokens": 240677532.0, + "step": 9304 + }, + { + "epoch": 1.0218537228201185, + "grad_norm": 1.8240044116973877, + "learning_rate": 5e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7459952235221863, + "num_tokens": 240701304.0, + "step": 9305 + }, + { + "epoch": 1.0219635405227323, + "grad_norm": 1.7551285028457642, + "learning_rate": 5e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7241171598434448, + "num_tokens": 240729525.0, + "step": 9306 + }, + { + "epoch": 1.0220733582253458, + "grad_norm": 2.199110269546509, + "learning_rate": 5e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7349991202354431, + "num_tokens": 240749794.0, + "step": 9307 + }, + { + "epoch": 1.0221831759279596, + "grad_norm": 1.842221975326538, + "learning_rate": 5e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7332291603088379, + "num_tokens": 240776971.0, + "step": 9308 + }, + { + "epoch": 1.0222929936305734, + "grad_norm": 1.638327717781067, + "learning_rate": 5e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7436230182647705, + "num_tokens": 240807858.0, + "step": 9309 + }, + { + "epoch": 1.022402811333187, + "grad_norm": 1.784968376159668, + "learning_rate": 5e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.722787082195282, + "num_tokens": 240836015.0, + "step": 9310 + }, + { + "epoch": 1.0225126290358006, + "grad_norm": 1.87519109249115, + "learning_rate": 5e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7108410000801086, + "num_tokens": 240863998.0, + "step": 9311 + }, + { + "epoch": 1.0226224467384142, + "grad_norm": 1.7071651220321655, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7272447347640991, + "num_tokens": 240893373.0, + "step": 9312 + }, + { + "epoch": 1.022732264441028, + "grad_norm": 2.0895638465881348, + "learning_rate": 5e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7211962938308716, + "num_tokens": 240919489.0, + "step": 9313 + }, + { + "epoch": 1.0228420821436415, + "grad_norm": 1.8706015348434448, + "learning_rate": 5e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7191470861434937, + "num_tokens": 240946295.0, + "step": 9314 + }, + { + "epoch": 1.0229518998462552, + "grad_norm": 1.703356385231018, + "learning_rate": 5e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7425894141197205, + "num_tokens": 240976390.0, + "step": 9315 + }, + { + "epoch": 1.023061717548869, + "grad_norm": 1.7608919143676758, + "learning_rate": 5e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7063964009284973, + "num_tokens": 241004584.0, + "step": 9316 + }, + { + "epoch": 1.0231715352514825, + "grad_norm": 1.6531909704208374, + "learning_rate": 5e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.7402162551879883, + "num_tokens": 241034016.0, + "step": 9317 + }, + { + "epoch": 1.0232813529540963, + "grad_norm": 2.2428224086761475, + "learning_rate": 5e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7621723413467407, + "num_tokens": 241053950.0, + "step": 9318 + }, + { + "epoch": 1.0233911706567098, + "grad_norm": 2.1633331775665283, + "learning_rate": 5e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7297113537788391, + "num_tokens": 241075481.0, + "step": 9319 + }, + { + "epoch": 1.0235009883593236, + "grad_norm": 2.248044967651367, + "learning_rate": 5e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7325100898742676, + "num_tokens": 241100793.0, + "step": 9320 + }, + { + "epoch": 1.023610806061937, + "grad_norm": 2.022749662399292, + "learning_rate": 5e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7466024160385132, + "num_tokens": 241123873.0, + "step": 9321 + }, + { + "epoch": 1.0237206237645509, + "grad_norm": 2.02203106880188, + "learning_rate": 5e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7421749830245972, + "num_tokens": 241144231.0, + "step": 9322 + }, + { + "epoch": 1.0238304414671644, + "grad_norm": 1.8119474649429321, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7275782823562622, + "num_tokens": 241170845.0, + "step": 9323 + }, + { + "epoch": 1.0239402591697782, + "grad_norm": 1.765059232711792, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7749310731887817, + "num_tokens": 241195008.0, + "step": 9324 + }, + { + "epoch": 1.024050076872392, + "grad_norm": 1.7361366748809814, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7338522672653198, + "num_tokens": 241222649.0, + "step": 9325 + }, + { + "epoch": 1.0241598945750054, + "grad_norm": 1.9361640214920044, + "learning_rate": 5e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.751751184463501, + "num_tokens": 241246381.0, + "step": 9326 + }, + { + "epoch": 1.0242697122776192, + "grad_norm": 1.8272846937179565, + "learning_rate": 5e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7232945561408997, + "num_tokens": 241272569.0, + "step": 9327 + }, + { + "epoch": 1.0243795299802327, + "grad_norm": 2.4002413749694824, + "learning_rate": 5e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.709197998046875, + "num_tokens": 241302631.0, + "step": 9328 + }, + { + "epoch": 1.0244893476828465, + "grad_norm": 1.818177580833435, + "learning_rate": 5e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7190685272216797, + "num_tokens": 241331015.0, + "step": 9329 + }, + { + "epoch": 1.02459916538546, + "grad_norm": 1.9320768117904663, + "learning_rate": 5e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7222791910171509, + "num_tokens": 241357985.0, + "step": 9330 + }, + { + "epoch": 1.0247089830880738, + "grad_norm": 1.8938297033309937, + "learning_rate": 5e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7194552421569824, + "num_tokens": 241385136.0, + "step": 9331 + }, + { + "epoch": 1.0248188007906875, + "grad_norm": 1.728338360786438, + "learning_rate": 5e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7683961391448975, + "num_tokens": 241412288.0, + "step": 9332 + }, + { + "epoch": 1.024928618493301, + "grad_norm": 1.7925087213516235, + "learning_rate": 5e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7505037784576416, + "num_tokens": 241437281.0, + "step": 9333 + }, + { + "epoch": 1.0250384361959148, + "grad_norm": 1.6421149969100952, + "learning_rate": 5e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7602243423461914, + "num_tokens": 241464602.0, + "step": 9334 + }, + { + "epoch": 1.0251482538985284, + "grad_norm": 2.014763593673706, + "learning_rate": 5e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7632880210876465, + "num_tokens": 241485809.0, + "step": 9335 + }, + { + "epoch": 1.0252580716011421, + "grad_norm": 1.8353936672210693, + "learning_rate": 5e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7400293350219727, + "num_tokens": 241512200.0, + "step": 9336 + }, + { + "epoch": 1.0253678893037557, + "grad_norm": 1.9187841415405273, + "learning_rate": 5e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7277861833572388, + "num_tokens": 241536933.0, + "step": 9337 + }, + { + "epoch": 1.0254777070063694, + "grad_norm": 1.8368428945541382, + "learning_rate": 5e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7270821332931519, + "num_tokens": 241563340.0, + "step": 9338 + }, + { + "epoch": 1.0255875247089832, + "grad_norm": 1.9632911682128906, + "learning_rate": 5e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7315596342086792, + "num_tokens": 241586927.0, + "step": 9339 + }, + { + "epoch": 1.0256973424115967, + "grad_norm": 1.9305083751678467, + "learning_rate": 5e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7239980697631836, + "num_tokens": 241612920.0, + "step": 9340 + }, + { + "epoch": 1.0258071601142105, + "grad_norm": 1.5858864784240723, + "learning_rate": 5e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7359488010406494, + "num_tokens": 241644984.0, + "step": 9341 + }, + { + "epoch": 1.025916977816824, + "grad_norm": 1.80087411403656, + "learning_rate": 5e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7600150108337402, + "num_tokens": 241670975.0, + "step": 9342 + }, + { + "epoch": 1.0260267955194378, + "grad_norm": 2.009157180786133, + "learning_rate": 5e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7281779646873474, + "num_tokens": 241693496.0, + "step": 9343 + }, + { + "epoch": 1.0261366132220513, + "grad_norm": 1.8276331424713135, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7360895872116089, + "num_tokens": 241720360.0, + "step": 9344 + }, + { + "epoch": 1.026246430924665, + "grad_norm": 1.814294695854187, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7699521780014038, + "num_tokens": 241743691.0, + "step": 9345 + }, + { + "epoch": 1.0263562486272788, + "grad_norm": 1.9821089506149292, + "learning_rate": 5e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7267631888389587, + "num_tokens": 241767491.0, + "step": 9346 + }, + { + "epoch": 1.0264660663298923, + "grad_norm": 1.912564992904663, + "learning_rate": 5e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7284812927246094, + "num_tokens": 241794161.0, + "step": 9347 + }, + { + "epoch": 1.026575884032506, + "grad_norm": 2.110234260559082, + "learning_rate": 5e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7714815139770508, + "num_tokens": 241814282.0, + "step": 9348 + }, + { + "epoch": 1.0266857017351196, + "grad_norm": 1.778430461883545, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.724148690700531, + "num_tokens": 241844096.0, + "step": 9349 + }, + { + "epoch": 1.0267955194377334, + "grad_norm": 1.834052324295044, + "learning_rate": 5e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7356520891189575, + "num_tokens": 241870553.0, + "step": 9350 + }, + { + "epoch": 1.026905337140347, + "grad_norm": 1.779120683670044, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7546509504318237, + "num_tokens": 241895152.0, + "step": 9351 + }, + { + "epoch": 1.0270151548429607, + "grad_norm": 2.068988084793091, + "learning_rate": 5e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7536607384681702, + "num_tokens": 241916410.0, + "step": 9352 + }, + { + "epoch": 1.0271249725455744, + "grad_norm": 1.9857980012893677, + "learning_rate": 5e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7427043914794922, + "num_tokens": 241941625.0, + "step": 9353 + }, + { + "epoch": 1.027234790248188, + "grad_norm": 1.8146486282348633, + "learning_rate": 5e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.728859543800354, + "num_tokens": 241967370.0, + "step": 9354 + }, + { + "epoch": 1.0273446079508017, + "grad_norm": 1.7268178462982178, + "learning_rate": 5e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7493613958358765, + "num_tokens": 241995243.0, + "step": 9355 + }, + { + "epoch": 1.0274544256534153, + "grad_norm": 1.8215200901031494, + "learning_rate": 5e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7268378734588623, + "num_tokens": 242021657.0, + "step": 9356 + }, + { + "epoch": 1.027564243356029, + "grad_norm": 1.9535508155822754, + "learning_rate": 5e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7420850992202759, + "num_tokens": 242046785.0, + "step": 9357 + }, + { + "epoch": 1.0276740610586426, + "grad_norm": 1.761873483657837, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7328102588653564, + "num_tokens": 242075691.0, + "step": 9358 + }, + { + "epoch": 1.0277838787612563, + "grad_norm": 1.7321836948394775, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7264689803123474, + "num_tokens": 242106165.0, + "step": 9359 + }, + { + "epoch": 1.02789369646387, + "grad_norm": 2.5000195503234863, + "learning_rate": 5e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7703231573104858, + "num_tokens": 242121826.0, + "step": 9360 + }, + { + "epoch": 1.0280035141664836, + "grad_norm": 1.7559974193572998, + "learning_rate": 5e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7231417298316956, + "num_tokens": 242151205.0, + "step": 9361 + }, + { + "epoch": 1.0281133318690974, + "grad_norm": 2.023930072784424, + "learning_rate": 5e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7405511140823364, + "num_tokens": 242175396.0, + "step": 9362 + }, + { + "epoch": 1.028223149571711, + "grad_norm": 1.6452370882034302, + "learning_rate": 5e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.749062180519104, + "num_tokens": 242205294.0, + "step": 9363 + }, + { + "epoch": 1.0283329672743247, + "grad_norm": 2.219841718673706, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7289258241653442, + "num_tokens": 242225104.0, + "step": 9364 + }, + { + "epoch": 1.0284427849769382, + "grad_norm": 2.0648279190063477, + "learning_rate": 5e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7510448098182678, + "num_tokens": 242247137.0, + "step": 9365 + }, + { + "epoch": 1.028552602679552, + "grad_norm": 1.6972955465316772, + "learning_rate": 5e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7222590446472168, + "num_tokens": 242278766.0, + "step": 9366 + }, + { + "epoch": 1.0286624203821657, + "grad_norm": 1.7728297710418701, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7296773195266724, + "num_tokens": 242306701.0, + "step": 9367 + }, + { + "epoch": 1.0287722380847792, + "grad_norm": 2.03358793258667, + "learning_rate": 5e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7721678018569946, + "num_tokens": 242329214.0, + "step": 9368 + }, + { + "epoch": 1.028882055787393, + "grad_norm": 2.102414608001709, + "learning_rate": 5e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7573882341384888, + "num_tokens": 242349198.0, + "step": 9369 + }, + { + "epoch": 1.0289918734900065, + "grad_norm": 2.2364673614501953, + "learning_rate": 5e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7558261752128601, + "num_tokens": 242367400.0, + "step": 9370 + }, + { + "epoch": 1.0291016911926203, + "grad_norm": 2.3541455268859863, + "learning_rate": 5e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7702795267105103, + "num_tokens": 242384899.0, + "step": 9371 + }, + { + "epoch": 1.0292115088952338, + "grad_norm": 1.9117951393127441, + "learning_rate": 5e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7325326204299927, + "num_tokens": 242410933.0, + "step": 9372 + }, + { + "epoch": 1.0293213265978476, + "grad_norm": 1.9770658016204834, + "learning_rate": 5e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7703328132629395, + "num_tokens": 242431386.0, + "step": 9373 + }, + { + "epoch": 1.0294311443004613, + "grad_norm": 1.9110043048858643, + "learning_rate": 5e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7158303260803223, + "num_tokens": 242459856.0, + "step": 9374 + }, + { + "epoch": 1.0295409620030749, + "grad_norm": 1.7777191400527954, + "learning_rate": 5e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7297735214233398, + "num_tokens": 242486888.0, + "step": 9375 + }, + { + "epoch": 1.0296507797056886, + "grad_norm": 1.8489060401916504, + "learning_rate": 5e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7433071732521057, + "num_tokens": 242513518.0, + "step": 9376 + }, + { + "epoch": 1.0297605974083022, + "grad_norm": 2.266878128051758, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7676568031311035, + "num_tokens": 242531724.0, + "step": 9377 + }, + { + "epoch": 1.029870415110916, + "grad_norm": 1.8871432542800903, + "learning_rate": 5e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7306211590766907, + "num_tokens": 242560581.0, + "step": 9378 + }, + { + "epoch": 1.0299802328135295, + "grad_norm": 1.8666160106658936, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7331171631813049, + "num_tokens": 242590041.0, + "step": 9379 + }, + { + "epoch": 1.0300900505161432, + "grad_norm": 1.8657640218734741, + "learning_rate": 5e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7182211875915527, + "num_tokens": 242618111.0, + "step": 9380 + }, + { + "epoch": 1.030199868218757, + "grad_norm": 1.664692759513855, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7253091335296631, + "num_tokens": 242650005.0, + "step": 9381 + }, + { + "epoch": 1.0303096859213705, + "grad_norm": 1.7867522239685059, + "learning_rate": 5e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.716508150100708, + "num_tokens": 242679413.0, + "step": 9382 + }, + { + "epoch": 1.0304195036239843, + "grad_norm": 1.6642652750015259, + "learning_rate": 5e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7280083894729614, + "num_tokens": 242711466.0, + "step": 9383 + }, + { + "epoch": 1.0305293213265978, + "grad_norm": 1.660512924194336, + "learning_rate": 5e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7176222801208496, + "num_tokens": 242745607.0, + "step": 9384 + }, + { + "epoch": 1.0306391390292116, + "grad_norm": 1.730855107307434, + "learning_rate": 5e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7232692241668701, + "num_tokens": 242775810.0, + "step": 9385 + }, + { + "epoch": 1.030748956731825, + "grad_norm": 1.78439462184906, + "learning_rate": 5e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.7492040395736694, + "num_tokens": 242803455.0, + "step": 9386 + }, + { + "epoch": 1.0308587744344389, + "grad_norm": 1.7915290594100952, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7429377436637878, + "num_tokens": 242830404.0, + "step": 9387 + }, + { + "epoch": 1.0309685921370524, + "grad_norm": 1.8339009284973145, + "learning_rate": 5e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7359575033187866, + "num_tokens": 242858427.0, + "step": 9388 + }, + { + "epoch": 1.0310784098396661, + "grad_norm": 2.0625665187835693, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7248221039772034, + "num_tokens": 242882335.0, + "step": 9389 + }, + { + "epoch": 1.03118822754228, + "grad_norm": 1.9528822898864746, + "learning_rate": 5e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7182332277297974, + "num_tokens": 242908740.0, + "step": 9390 + }, + { + "epoch": 1.0312980452448934, + "grad_norm": 2.0367136001586914, + "learning_rate": 5e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7186322212219238, + "num_tokens": 242933820.0, + "step": 9391 + }, + { + "epoch": 1.0314078629475072, + "grad_norm": 2.015829086303711, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7341868877410889, + "num_tokens": 242955381.0, + "step": 9392 + }, + { + "epoch": 1.0315176806501207, + "grad_norm": 1.6587021350860596, + "learning_rate": 5e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7455822825431824, + "num_tokens": 242983358.0, + "step": 9393 + }, + { + "epoch": 1.0316274983527345, + "grad_norm": 1.9545539617538452, + "learning_rate": 5e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7423018217086792, + "num_tokens": 243005901.0, + "step": 9394 + }, + { + "epoch": 1.031737316055348, + "grad_norm": 1.8861767053604126, + "learning_rate": 5e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7302780151367188, + "num_tokens": 243031237.0, + "step": 9395 + }, + { + "epoch": 1.0318471337579618, + "grad_norm": 1.847620964050293, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7236646413803101, + "num_tokens": 243058613.0, + "step": 9396 + }, + { + "epoch": 1.0319569514605755, + "grad_norm": 1.8739593029022217, + "learning_rate": 5e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7444535493850708, + "num_tokens": 243084673.0, + "step": 9397 + }, + { + "epoch": 1.032066769163189, + "grad_norm": 2.04817795753479, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7333367466926575, + "num_tokens": 243108581.0, + "step": 9398 + }, + { + "epoch": 1.0321765868658028, + "grad_norm": 1.72551429271698, + "learning_rate": 5e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7260394096374512, + "num_tokens": 243141271.0, + "step": 9399 + }, + { + "epoch": 1.0322864045684164, + "grad_norm": 1.8689651489257812, + "learning_rate": 5e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.717984676361084, + "num_tokens": 243168553.0, + "step": 9400 + }, + { + "epoch": 1.0323962222710301, + "grad_norm": 1.8555684089660645, + "learning_rate": 5e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.724698007106781, + "num_tokens": 243195476.0, + "step": 9401 + }, + { + "epoch": 1.0325060399736437, + "grad_norm": 1.7500113248825073, + "learning_rate": 5e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7366470694541931, + "num_tokens": 243223822.0, + "step": 9402 + }, + { + "epoch": 1.0326158576762574, + "grad_norm": 1.8836214542388916, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7491650581359863, + "num_tokens": 243247426.0, + "step": 9403 + }, + { + "epoch": 1.0327256753788712, + "grad_norm": 1.9614895582199097, + "learning_rate": 5e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7400532960891724, + "num_tokens": 243270419.0, + "step": 9404 + }, + { + "epoch": 1.0328354930814847, + "grad_norm": 1.892094373703003, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7381376624107361, + "num_tokens": 243295521.0, + "step": 9405 + }, + { + "epoch": 1.0329453107840985, + "grad_norm": 2.011669635772705, + "learning_rate": 5e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7459423542022705, + "num_tokens": 243317731.0, + "step": 9406 + }, + { + "epoch": 1.033055128486712, + "grad_norm": 1.947987675666809, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7318283319473267, + "num_tokens": 243342058.0, + "step": 9407 + }, + { + "epoch": 1.0331649461893258, + "grad_norm": 1.780210256576538, + "learning_rate": 5e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.7491301894187927, + "num_tokens": 243371998.0, + "step": 9408 + }, + { + "epoch": 1.0332747638919393, + "grad_norm": 2.116981267929077, + "learning_rate": 5e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7252472639083862, + "num_tokens": 243394297.0, + "step": 9409 + }, + { + "epoch": 1.033384581594553, + "grad_norm": 1.841310739517212, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7333861589431763, + "num_tokens": 243421419.0, + "step": 9410 + }, + { + "epoch": 1.0334943992971668, + "grad_norm": 1.6834627389907837, + "learning_rate": 5e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7634638547897339, + "num_tokens": 243452505.0, + "step": 9411 + }, + { + "epoch": 1.0336042169997803, + "grad_norm": 1.7719300985336304, + "learning_rate": 5e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7279403805732727, + "num_tokens": 243481418.0, + "step": 9412 + }, + { + "epoch": 1.033714034702394, + "grad_norm": 1.797263503074646, + "learning_rate": 5e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7305203676223755, + "num_tokens": 243508709.0, + "step": 9413 + }, + { + "epoch": 1.0338238524050076, + "grad_norm": 1.9194536209106445, + "learning_rate": 5e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7231642007827759, + "num_tokens": 243535454.0, + "step": 9414 + }, + { + "epoch": 1.0339336701076214, + "grad_norm": 2.03338885307312, + "learning_rate": 5e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7561529874801636, + "num_tokens": 243558573.0, + "step": 9415 + }, + { + "epoch": 1.034043487810235, + "grad_norm": 1.9769725799560547, + "learning_rate": 5e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7242921590805054, + "num_tokens": 243583827.0, + "step": 9416 + }, + { + "epoch": 1.0341533055128487, + "grad_norm": 1.8234882354736328, + "learning_rate": 5e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.7539746761322021, + "num_tokens": 243611335.0, + "step": 9417 + }, + { + "epoch": 1.0342631232154624, + "grad_norm": 1.747703194618225, + "learning_rate": 5e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7424191236495972, + "num_tokens": 243639845.0, + "step": 9418 + }, + { + "epoch": 1.034372940918076, + "grad_norm": 1.658747911453247, + "learning_rate": 5e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7366502285003662, + "num_tokens": 243668867.0, + "step": 9419 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 1.799835443496704, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7395142316818237, + "num_tokens": 243696082.0, + "step": 9420 + }, + { + "epoch": 1.0345925763233033, + "grad_norm": 1.9603562355041504, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7658073902130127, + "num_tokens": 243717297.0, + "step": 9421 + }, + { + "epoch": 1.034702394025917, + "grad_norm": 2.1008927822113037, + "learning_rate": 5e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7675964832305908, + "num_tokens": 243735927.0, + "step": 9422 + }, + { + "epoch": 1.0348122117285306, + "grad_norm": 2.0134315490722656, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7514731884002686, + "num_tokens": 243756859.0, + "step": 9423 + }, + { + "epoch": 1.0349220294311443, + "grad_norm": 1.9454082250595093, + "learning_rate": 5e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7318475842475891, + "num_tokens": 243779225.0, + "step": 9424 + }, + { + "epoch": 1.035031847133758, + "grad_norm": 1.750252604484558, + "learning_rate": 5e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.726267397403717, + "num_tokens": 243807632.0, + "step": 9425 + }, + { + "epoch": 1.0351416648363716, + "grad_norm": 1.6672531366348267, + "learning_rate": 5e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7191295027732849, + "num_tokens": 243842140.0, + "step": 9426 + }, + { + "epoch": 1.0352514825389854, + "grad_norm": 2.0821449756622314, + "learning_rate": 5e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.725898802280426, + "num_tokens": 243863416.0, + "step": 9427 + }, + { + "epoch": 1.035361300241599, + "grad_norm": 1.8518807888031006, + "learning_rate": 5e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.721913754940033, + "num_tokens": 243892455.0, + "step": 9428 + }, + { + "epoch": 1.0354711179442126, + "grad_norm": 1.8967809677124023, + "learning_rate": 5e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7498830556869507, + "num_tokens": 243919048.0, + "step": 9429 + }, + { + "epoch": 1.0355809356468262, + "grad_norm": 1.9580485820770264, + "learning_rate": 5e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7340061664581299, + "num_tokens": 243944707.0, + "step": 9430 + }, + { + "epoch": 1.03569075334944, + "grad_norm": 1.858775019645691, + "learning_rate": 5e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7523702383041382, + "num_tokens": 243970827.0, + "step": 9431 + }, + { + "epoch": 1.0358005710520537, + "grad_norm": 1.6792210340499878, + "learning_rate": 5e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7578411102294922, + "num_tokens": 243997362.0, + "step": 9432 + }, + { + "epoch": 1.0359103887546672, + "grad_norm": 1.7875529527664185, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.731023907661438, + "num_tokens": 244023533.0, + "step": 9433 + }, + { + "epoch": 1.036020206457281, + "grad_norm": 1.7441455125808716, + "learning_rate": 5e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7292376160621643, + "num_tokens": 244051557.0, + "step": 9434 + }, + { + "epoch": 1.0361300241598945, + "grad_norm": 2.113497018814087, + "learning_rate": 5e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7560371160507202, + "num_tokens": 244072779.0, + "step": 9435 + }, + { + "epoch": 1.0362398418625083, + "grad_norm": 2.1446895599365234, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.747289776802063, + "num_tokens": 244091772.0, + "step": 9436 + }, + { + "epoch": 1.0363496595651218, + "grad_norm": 1.7974929809570312, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.725922703742981, + "num_tokens": 244119144.0, + "step": 9437 + }, + { + "epoch": 1.0364594772677356, + "grad_norm": 1.8752648830413818, + "learning_rate": 5e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7400466203689575, + "num_tokens": 244146688.0, + "step": 9438 + }, + { + "epoch": 1.036569294970349, + "grad_norm": 1.9706090688705444, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7255358099937439, + "num_tokens": 244172317.0, + "step": 9439 + }, + { + "epoch": 1.0366791126729629, + "grad_norm": 2.054832696914673, + "learning_rate": 5e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7337446808815002, + "num_tokens": 244195970.0, + "step": 9440 + }, + { + "epoch": 1.0367889303755766, + "grad_norm": 1.8063592910766602, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7152543067932129, + "num_tokens": 244226040.0, + "step": 9441 + }, + { + "epoch": 1.0368987480781902, + "grad_norm": 2.17577862739563, + "learning_rate": 5e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7466548681259155, + "num_tokens": 244244683.0, + "step": 9442 + }, + { + "epoch": 1.037008565780804, + "grad_norm": 1.718100666999817, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7410906553268433, + "num_tokens": 244274961.0, + "step": 9443 + }, + { + "epoch": 1.0371183834834174, + "grad_norm": 1.7863891124725342, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7394350171089172, + "num_tokens": 244302417.0, + "step": 9444 + }, + { + "epoch": 1.0372282011860312, + "grad_norm": 2.295963764190674, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.730153501033783, + "num_tokens": 244321213.0, + "step": 9445 + }, + { + "epoch": 1.037338018888645, + "grad_norm": 1.7707501649856567, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.722663164138794, + "num_tokens": 244350897.0, + "step": 9446 + }, + { + "epoch": 1.0374478365912585, + "grad_norm": 2.045880079269409, + "learning_rate": 5e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.743103563785553, + "num_tokens": 244372053.0, + "step": 9447 + }, + { + "epoch": 1.0375576542938723, + "grad_norm": 2.1335275173187256, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7431765794754028, + "num_tokens": 244392916.0, + "step": 9448 + }, + { + "epoch": 1.0376674719964858, + "grad_norm": 1.7748205661773682, + "learning_rate": 5e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7458673715591431, + "num_tokens": 244421384.0, + "step": 9449 + }, + { + "epoch": 1.0377772896990995, + "grad_norm": 1.8483976125717163, + "learning_rate": 5e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7317577600479126, + "num_tokens": 244448894.0, + "step": 9450 + }, + { + "epoch": 1.037887107401713, + "grad_norm": 1.967350721359253, + "learning_rate": 5e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7315527200698853, + "num_tokens": 244474008.0, + "step": 9451 + }, + { + "epoch": 1.0379969251043268, + "grad_norm": 1.8374745845794678, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7420260906219482, + "num_tokens": 244500031.0, + "step": 9452 + }, + { + "epoch": 1.0381067428069404, + "grad_norm": 1.6265639066696167, + "learning_rate": 5e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7451572418212891, + "num_tokens": 244531830.0, + "step": 9453 + }, + { + "epoch": 1.0382165605095541, + "grad_norm": 2.110081911087036, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7196237444877625, + "num_tokens": 244558880.0, + "step": 9454 + }, + { + "epoch": 1.0383263782121679, + "grad_norm": 2.1491479873657227, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7701313495635986, + "num_tokens": 244578020.0, + "step": 9455 + }, + { + "epoch": 1.0384361959147814, + "grad_norm": 1.9604650735855103, + "learning_rate": 5e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7488972544670105, + "num_tokens": 244601023.0, + "step": 9456 + }, + { + "epoch": 1.0385460136173952, + "grad_norm": 1.9683518409729004, + "learning_rate": 5e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7272345423698425, + "num_tokens": 244625980.0, + "step": 9457 + }, + { + "epoch": 1.0386558313200087, + "grad_norm": 1.7914856672286987, + "learning_rate": 5e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7199520468711853, + "num_tokens": 244653715.0, + "step": 9458 + }, + { + "epoch": 1.0387656490226225, + "grad_norm": 1.8445197343826294, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7230775356292725, + "num_tokens": 244683185.0, + "step": 9459 + }, + { + "epoch": 1.038875466725236, + "grad_norm": 1.7789578437805176, + "learning_rate": 5e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7198811173439026, + "num_tokens": 244709758.0, + "step": 9460 + }, + { + "epoch": 1.0389852844278498, + "grad_norm": 1.9613615274429321, + "learning_rate": 5e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7300115823745728, + "num_tokens": 244735212.0, + "step": 9461 + }, + { + "epoch": 1.0390951021304635, + "grad_norm": 1.924886703491211, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7177929282188416, + "num_tokens": 244760094.0, + "step": 9462 + }, + { + "epoch": 1.039204919833077, + "grad_norm": 1.946793556213379, + "learning_rate": 5e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7628406286239624, + "num_tokens": 244783101.0, + "step": 9463 + }, + { + "epoch": 1.0393147375356908, + "grad_norm": 1.8359817266464233, + "learning_rate": 5e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7182854413986206, + "num_tokens": 244814993.0, + "step": 9464 + }, + { + "epoch": 1.0394245552383043, + "grad_norm": 2.004448413848877, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7370415925979614, + "num_tokens": 244836467.0, + "step": 9465 + }, + { + "epoch": 1.039534372940918, + "grad_norm": 1.8488729000091553, + "learning_rate": 5e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7421598434448242, + "num_tokens": 244861140.0, + "step": 9466 + }, + { + "epoch": 1.0396441906435316, + "grad_norm": 1.6966251134872437, + "learning_rate": 5e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7502471208572388, + "num_tokens": 244888322.0, + "step": 9467 + }, + { + "epoch": 1.0397540083461454, + "grad_norm": 1.6645580530166626, + "learning_rate": 5e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.746569037437439, + "num_tokens": 244919728.0, + "step": 9468 + }, + { + "epoch": 1.0398638260487592, + "grad_norm": 1.9893229007720947, + "learning_rate": 5e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7458194494247437, + "num_tokens": 244942619.0, + "step": 9469 + }, + { + "epoch": 1.0399736437513727, + "grad_norm": 1.9009593725204468, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7612181901931763, + "num_tokens": 244966319.0, + "step": 9470 + }, + { + "epoch": 1.0400834614539864, + "grad_norm": 1.8743635416030884, + "learning_rate": 5e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7362955212593079, + "num_tokens": 244991340.0, + "step": 9471 + }, + { + "epoch": 1.0401932791566, + "grad_norm": 2.0079267024993896, + "learning_rate": 5e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7275810837745667, + "num_tokens": 245015261.0, + "step": 9472 + }, + { + "epoch": 1.0403030968592137, + "grad_norm": 1.909178614616394, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7277249097824097, + "num_tokens": 245041899.0, + "step": 9473 + }, + { + "epoch": 1.0404129145618273, + "grad_norm": 1.948041558265686, + "learning_rate": 5e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7353359460830688, + "num_tokens": 245064110.0, + "step": 9474 + }, + { + "epoch": 1.040522732264441, + "grad_norm": 2.0114986896514893, + "learning_rate": 5e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7222380638122559, + "num_tokens": 245087099.0, + "step": 9475 + }, + { + "epoch": 1.0406325499670548, + "grad_norm": 1.969249963760376, + "learning_rate": 5e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7104259729385376, + "num_tokens": 245116123.0, + "step": 9476 + }, + { + "epoch": 1.0407423676696683, + "grad_norm": 1.9229929447174072, + "learning_rate": 5e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7387678027153015, + "num_tokens": 245139406.0, + "step": 9477 + }, + { + "epoch": 1.040852185372282, + "grad_norm": 1.8452050685882568, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7202937602996826, + "num_tokens": 245166361.0, + "step": 9478 + }, + { + "epoch": 1.0409620030748956, + "grad_norm": 1.7677229642868042, + "learning_rate": 5e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7343016862869263, + "num_tokens": 245194253.0, + "step": 9479 + }, + { + "epoch": 1.0410718207775094, + "grad_norm": 1.749466896057129, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.715549886226654, + "num_tokens": 245221800.0, + "step": 9480 + }, + { + "epoch": 1.041181638480123, + "grad_norm": 1.8765252828598022, + "learning_rate": 5e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.7843751907348633, + "num_tokens": 245244460.0, + "step": 9481 + }, + { + "epoch": 1.0412914561827367, + "grad_norm": 1.6119049787521362, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7229463458061218, + "num_tokens": 245278069.0, + "step": 9482 + }, + { + "epoch": 1.0414012738853504, + "grad_norm": 1.977472186088562, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7273584604263306, + "num_tokens": 245302468.0, + "step": 9483 + }, + { + "epoch": 1.041511091587964, + "grad_norm": 2.161644458770752, + "learning_rate": 5e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7516570687294006, + "num_tokens": 245321241.0, + "step": 9484 + }, + { + "epoch": 1.0416209092905777, + "grad_norm": 1.8049994707107544, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7271127700805664, + "num_tokens": 245350101.0, + "step": 9485 + }, + { + "epoch": 1.0417307269931912, + "grad_norm": 1.9644548892974854, + "learning_rate": 5e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7394562363624573, + "num_tokens": 245372974.0, + "step": 9486 + }, + { + "epoch": 1.041840544695805, + "grad_norm": 2.0405476093292236, + "learning_rate": 5e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7180875539779663, + "num_tokens": 245396938.0, + "step": 9487 + }, + { + "epoch": 1.0419503623984185, + "grad_norm": 1.74172842502594, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.717022180557251, + "num_tokens": 245427176.0, + "step": 9488 + }, + { + "epoch": 1.0420601801010323, + "grad_norm": 1.7497577667236328, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7321441769599915, + "num_tokens": 245456266.0, + "step": 9489 + }, + { + "epoch": 1.042169997803646, + "grad_norm": 1.837296962738037, + "learning_rate": 5e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7437122464179993, + "num_tokens": 245480941.0, + "step": 9490 + }, + { + "epoch": 1.0422798155062596, + "grad_norm": 1.6323810815811157, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7260026931762695, + "num_tokens": 245513411.0, + "step": 9491 + }, + { + "epoch": 1.0423896332088733, + "grad_norm": 1.8980777263641357, + "learning_rate": 5e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7718601226806641, + "num_tokens": 245535484.0, + "step": 9492 + }, + { + "epoch": 1.0424994509114869, + "grad_norm": 1.771999716758728, + "learning_rate": 5e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7172678709030151, + "num_tokens": 245565352.0, + "step": 9493 + }, + { + "epoch": 1.0426092686141006, + "grad_norm": 2.1941263675689697, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7411954998970032, + "num_tokens": 245585104.0, + "step": 9494 + }, + { + "epoch": 1.0427190863167142, + "grad_norm": 1.7113984823226929, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7514307498931885, + "num_tokens": 245613000.0, + "step": 9495 + }, + { + "epoch": 1.042828904019328, + "grad_norm": 1.7721048593521118, + "learning_rate": 5e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7358047962188721, + "num_tokens": 245642282.0, + "step": 9496 + }, + { + "epoch": 1.0429387217219417, + "grad_norm": 1.817657470703125, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7238563299179077, + "num_tokens": 245670278.0, + "step": 9497 + }, + { + "epoch": 1.0430485394245552, + "grad_norm": 1.8362635374069214, + "learning_rate": 5e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7150046825408936, + "num_tokens": 245697958.0, + "step": 9498 + }, + { + "epoch": 1.043158357127169, + "grad_norm": 2.0872886180877686, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7338477373123169, + "num_tokens": 245718678.0, + "step": 9499 + }, + { + "epoch": 1.0432681748297825, + "grad_norm": 1.9035704135894775, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7384803295135498, + "num_tokens": 245743982.0, + "step": 9500 + }, + { + "epoch": 1.0433779925323963, + "grad_norm": 2.0974748134613037, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7291049957275391, + "num_tokens": 245763805.0, + "step": 9501 + }, + { + "epoch": 1.0434878102350098, + "grad_norm": 1.8232076168060303, + "learning_rate": 5e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7144626975059509, + "num_tokens": 245792256.0, + "step": 9502 + }, + { + "epoch": 1.0435976279376236, + "grad_norm": 2.022336006164551, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7154044508934021, + "num_tokens": 245816846.0, + "step": 9503 + }, + { + "epoch": 1.043707445640237, + "grad_norm": 2.0208427906036377, + "learning_rate": 5e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.743245005607605, + "num_tokens": 245839767.0, + "step": 9504 + }, + { + "epoch": 1.0438172633428509, + "grad_norm": 1.9169015884399414, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7320588827133179, + "num_tokens": 245863920.0, + "step": 9505 + }, + { + "epoch": 1.0439270810454646, + "grad_norm": 1.8679537773132324, + "learning_rate": 5e-06, + "loss": 0.874, + "mean_token_accuracy": 0.732913613319397, + "num_tokens": 245890318.0, + "step": 9506 + }, + { + "epoch": 1.0440368987480781, + "grad_norm": 1.7295945882797241, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.754638671875, + "num_tokens": 245919508.0, + "step": 9507 + }, + { + "epoch": 1.044146716450692, + "grad_norm": 2.157878875732422, + "learning_rate": 5e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.707593560218811, + "num_tokens": 245942601.0, + "step": 9508 + }, + { + "epoch": 1.0442565341533054, + "grad_norm": 2.020561456680298, + "learning_rate": 5e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7487096786499023, + "num_tokens": 245964988.0, + "step": 9509 + }, + { + "epoch": 1.0443663518559192, + "grad_norm": 1.838451862335205, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7462442517280579, + "num_tokens": 245991630.0, + "step": 9510 + }, + { + "epoch": 1.0444761695585327, + "grad_norm": 1.857252597808838, + "learning_rate": 5e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.738852858543396, + "num_tokens": 246016987.0, + "step": 9511 + }, + { + "epoch": 1.0445859872611465, + "grad_norm": 1.6682264804840088, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7400944828987122, + "num_tokens": 246049117.0, + "step": 9512 + }, + { + "epoch": 1.0446958049637602, + "grad_norm": 1.9147084951400757, + "learning_rate": 5e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7469407916069031, + "num_tokens": 246071187.0, + "step": 9513 + }, + { + "epoch": 1.0448056226663738, + "grad_norm": 1.8295787572860718, + "learning_rate": 5e-06, + "loss": 0.83, + "mean_token_accuracy": 0.738219141960144, + "num_tokens": 246097769.0, + "step": 9514 + }, + { + "epoch": 1.0449154403689875, + "grad_norm": 1.9735954999923706, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.758649468421936, + "num_tokens": 246120108.0, + "step": 9515 + }, + { + "epoch": 1.045025258071601, + "grad_norm": 1.7305071353912354, + "learning_rate": 5e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7230664491653442, + "num_tokens": 246151262.0, + "step": 9516 + }, + { + "epoch": 1.0451350757742148, + "grad_norm": 1.9399415254592896, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.729934573173523, + "num_tokens": 246176047.0, + "step": 9517 + }, + { + "epoch": 1.0452448934768284, + "grad_norm": 1.7462444305419922, + "learning_rate": 5e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7342861890792847, + "num_tokens": 246204907.0, + "step": 9518 + }, + { + "epoch": 1.0453547111794421, + "grad_norm": 2.1522715091705322, + "learning_rate": 5e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7420845627784729, + "num_tokens": 246226339.0, + "step": 9519 + }, + { + "epoch": 1.0454645288820559, + "grad_norm": 2.01481556892395, + "learning_rate": 5e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7508819103240967, + "num_tokens": 246247543.0, + "step": 9520 + }, + { + "epoch": 1.0455743465846694, + "grad_norm": 2.0497372150421143, + "learning_rate": 5e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7252073884010315, + "num_tokens": 246272493.0, + "step": 9521 + }, + { + "epoch": 1.0456841642872832, + "grad_norm": 2.107285261154175, + "learning_rate": 5e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7458486557006836, + "num_tokens": 246293289.0, + "step": 9522 + }, + { + "epoch": 1.0457939819898967, + "grad_norm": 2.012657880783081, + "learning_rate": 5e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.7591456770896912, + "num_tokens": 246316360.0, + "step": 9523 + }, + { + "epoch": 1.0459037996925105, + "grad_norm": 1.7242698669433594, + "learning_rate": 5e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7240583896636963, + "num_tokens": 246347187.0, + "step": 9524 + }, + { + "epoch": 1.046013617395124, + "grad_norm": 1.7706917524337769, + "learning_rate": 5e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7209372520446777, + "num_tokens": 246375371.0, + "step": 9525 + }, + { + "epoch": 1.0461234350977378, + "grad_norm": 1.8866039514541626, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.737148642539978, + "num_tokens": 246401776.0, + "step": 9526 + }, + { + "epoch": 1.0462332528003515, + "grad_norm": 1.8831300735473633, + "learning_rate": 5e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7322466373443604, + "num_tokens": 246426583.0, + "step": 9527 + }, + { + "epoch": 1.046343070502965, + "grad_norm": 1.771786093711853, + "learning_rate": 5e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.714789628982544, + "num_tokens": 246453913.0, + "step": 9528 + }, + { + "epoch": 1.0464528882055788, + "grad_norm": 1.8970625400543213, + "learning_rate": 5e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7397514581680298, + "num_tokens": 246478388.0, + "step": 9529 + }, + { + "epoch": 1.0465627059081923, + "grad_norm": 1.9918138980865479, + "learning_rate": 5e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7343701124191284, + "num_tokens": 246501956.0, + "step": 9530 + }, + { + "epoch": 1.046672523610806, + "grad_norm": 1.783849835395813, + "learning_rate": 5e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7237159013748169, + "num_tokens": 246528722.0, + "step": 9531 + }, + { + "epoch": 1.0467823413134196, + "grad_norm": 2.0073130130767822, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7290111184120178, + "num_tokens": 246552561.0, + "step": 9532 + }, + { + "epoch": 1.0468921590160334, + "grad_norm": 1.8550238609313965, + "learning_rate": 5e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7507970333099365, + "num_tokens": 246578019.0, + "step": 9533 + }, + { + "epoch": 1.0470019767186471, + "grad_norm": 1.8196489810943604, + "learning_rate": 5e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7269114255905151, + "num_tokens": 246608684.0, + "step": 9534 + }, + { + "epoch": 1.0471117944212607, + "grad_norm": 1.6618937253952026, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7162563800811768, + "num_tokens": 246642204.0, + "step": 9535 + }, + { + "epoch": 1.0472216121238744, + "grad_norm": 1.675252914428711, + "learning_rate": 5e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7276305556297302, + "num_tokens": 246673479.0, + "step": 9536 + }, + { + "epoch": 1.047331429826488, + "grad_norm": 1.6953197717666626, + "learning_rate": 5e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7083760499954224, + "num_tokens": 246704900.0, + "step": 9537 + }, + { + "epoch": 1.0474412475291017, + "grad_norm": 1.7986376285552979, + "learning_rate": 5e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7180109024047852, + "num_tokens": 246732016.0, + "step": 9538 + }, + { + "epoch": 1.0475510652317153, + "grad_norm": 1.8722659349441528, + "learning_rate": 5e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7218937873840332, + "num_tokens": 246759042.0, + "step": 9539 + }, + { + "epoch": 1.047660882934329, + "grad_norm": 1.7202301025390625, + "learning_rate": 5e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7517589330673218, + "num_tokens": 246787241.0, + "step": 9540 + }, + { + "epoch": 1.0477707006369428, + "grad_norm": 2.0793750286102295, + "learning_rate": 5e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.7489513158798218, + "num_tokens": 246808622.0, + "step": 9541 + }, + { + "epoch": 1.0478805183395563, + "grad_norm": 1.8926513195037842, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7218140363693237, + "num_tokens": 246837220.0, + "step": 9542 + }, + { + "epoch": 1.04799033604217, + "grad_norm": 1.9518475532531738, + "learning_rate": 5e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7288002967834473, + "num_tokens": 246859112.0, + "step": 9543 + }, + { + "epoch": 1.0481001537447836, + "grad_norm": 1.943264126777649, + "learning_rate": 5e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7311688661575317, + "num_tokens": 246885153.0, + "step": 9544 + }, + { + "epoch": 1.0482099714473974, + "grad_norm": 1.7603462934494019, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7403683662414551, + "num_tokens": 246909971.0, + "step": 9545 + }, + { + "epoch": 1.048319789150011, + "grad_norm": 3.454648494720459, + "learning_rate": 5e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7209593057632446, + "num_tokens": 246936922.0, + "step": 9546 + }, + { + "epoch": 1.0484296068526247, + "grad_norm": 1.79478919506073, + "learning_rate": 5e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7061583399772644, + "num_tokens": 246967336.0, + "step": 9547 + }, + { + "epoch": 1.0485394245552384, + "grad_norm": 2.0511415004730225, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7409847974777222, + "num_tokens": 246989049.0, + "step": 9548 + }, + { + "epoch": 1.048649242257852, + "grad_norm": 1.8648781776428223, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7760775089263916, + "num_tokens": 247012005.0, + "step": 9549 + }, + { + "epoch": 1.0487590599604657, + "grad_norm": 1.8437354564666748, + "learning_rate": 5e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7204256057739258, + "num_tokens": 247039576.0, + "step": 9550 + }, + { + "epoch": 1.0488688776630792, + "grad_norm": 1.9076088666915894, + "learning_rate": 5e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7231737375259399, + "num_tokens": 247065903.0, + "step": 9551 + }, + { + "epoch": 1.048978695365693, + "grad_norm": 1.7033427953720093, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7442142367362976, + "num_tokens": 247095045.0, + "step": 9552 + }, + { + "epoch": 1.0490885130683065, + "grad_norm": 1.869805097579956, + "learning_rate": 5e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7209591865539551, + "num_tokens": 247120136.0, + "step": 9553 + }, + { + "epoch": 1.0491983307709203, + "grad_norm": 1.9727842807769775, + "learning_rate": 5e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7443845272064209, + "num_tokens": 247143780.0, + "step": 9554 + }, + { + "epoch": 1.0493081484735338, + "grad_norm": 1.9012049436569214, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7443851828575134, + "num_tokens": 247172088.0, + "step": 9555 + }, + { + "epoch": 1.0494179661761476, + "grad_norm": 1.7381469011306763, + "learning_rate": 5e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7216634750366211, + "num_tokens": 247202405.0, + "step": 9556 + }, + { + "epoch": 1.0495277838787613, + "grad_norm": 1.9361512660980225, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.763519287109375, + "num_tokens": 247225371.0, + "step": 9557 + }, + { + "epoch": 1.0496376015813749, + "grad_norm": 1.8607550859451294, + "learning_rate": 5e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7268432378768921, + "num_tokens": 247253823.0, + "step": 9558 + }, + { + "epoch": 1.0497474192839886, + "grad_norm": 2.1017096042633057, + "learning_rate": 5e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.765724778175354, + "num_tokens": 247273478.0, + "step": 9559 + }, + { + "epoch": 1.0498572369866022, + "grad_norm": 1.8603049516677856, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7270472645759583, + "num_tokens": 247298667.0, + "step": 9560 + }, + { + "epoch": 1.049967054689216, + "grad_norm": 1.762690782546997, + "learning_rate": 5e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7366982102394104, + "num_tokens": 247326399.0, + "step": 9561 + }, + { + "epoch": 1.0500768723918297, + "grad_norm": 1.7593483924865723, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7323259115219116, + "num_tokens": 247354996.0, + "step": 9562 + }, + { + "epoch": 1.0501866900944432, + "grad_norm": 1.8354878425598145, + "learning_rate": 5e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7353074550628662, + "num_tokens": 247380752.0, + "step": 9563 + }, + { + "epoch": 1.050296507797057, + "grad_norm": 2.08316969871521, + "learning_rate": 5e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7426092028617859, + "num_tokens": 247401895.0, + "step": 9564 + }, + { + "epoch": 1.0504063254996705, + "grad_norm": 2.245347499847412, + "learning_rate": 5e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7352527379989624, + "num_tokens": 247423505.0, + "step": 9565 + }, + { + "epoch": 1.0505161432022843, + "grad_norm": 1.9620858430862427, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7342866659164429, + "num_tokens": 247447215.0, + "step": 9566 + }, + { + "epoch": 1.0506259609048978, + "grad_norm": 1.6403459310531616, + "learning_rate": 5e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7261413335800171, + "num_tokens": 247478869.0, + "step": 9567 + }, + { + "epoch": 1.0507357786075116, + "grad_norm": 2.068460702896118, + "learning_rate": 5e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7417712807655334, + "num_tokens": 247501856.0, + "step": 9568 + }, + { + "epoch": 1.050845596310125, + "grad_norm": 2.0340535640716553, + "learning_rate": 5e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7304215431213379, + "num_tokens": 247525599.0, + "step": 9569 + }, + { + "epoch": 1.0509554140127388, + "grad_norm": 1.9553625583648682, + "learning_rate": 5e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7418391108512878, + "num_tokens": 247549295.0, + "step": 9570 + }, + { + "epoch": 1.0510652317153526, + "grad_norm": 2.104902744293213, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7323088645935059, + "num_tokens": 247569105.0, + "step": 9571 + }, + { + "epoch": 1.0511750494179661, + "grad_norm": 1.9685267210006714, + "learning_rate": 5e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7161327600479126, + "num_tokens": 247595732.0, + "step": 9572 + }, + { + "epoch": 1.05128486712058, + "grad_norm": 1.7402325868606567, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.72263103723526, + "num_tokens": 247625361.0, + "step": 9573 + }, + { + "epoch": 1.0513946848231934, + "grad_norm": 2.0043656826019287, + "learning_rate": 5e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7335758209228516, + "num_tokens": 247650898.0, + "step": 9574 + }, + { + "epoch": 1.0515045025258072, + "grad_norm": 1.659716010093689, + "learning_rate": 5e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7199909687042236, + "num_tokens": 247684311.0, + "step": 9575 + }, + { + "epoch": 1.0516143202284207, + "grad_norm": 1.645753264427185, + "learning_rate": 5e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7261738181114197, + "num_tokens": 247717895.0, + "step": 9576 + }, + { + "epoch": 1.0517241379310345, + "grad_norm": 1.8768516778945923, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7297146320343018, + "num_tokens": 247745806.0, + "step": 9577 + }, + { + "epoch": 1.0518339556336482, + "grad_norm": 2.066899299621582, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7461532354354858, + "num_tokens": 247767273.0, + "step": 9578 + }, + { + "epoch": 1.0519437733362618, + "grad_norm": 1.7978004217147827, + "learning_rate": 5e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7502570748329163, + "num_tokens": 247793417.0, + "step": 9579 + }, + { + "epoch": 1.0520535910388755, + "grad_norm": 1.9119112491607666, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7329018712043762, + "num_tokens": 247820523.0, + "step": 9580 + }, + { + "epoch": 1.052163408741489, + "grad_norm": 1.8481022119522095, + "learning_rate": 5e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7634069919586182, + "num_tokens": 247845760.0, + "step": 9581 + }, + { + "epoch": 1.0522732264441028, + "grad_norm": 1.819146752357483, + "learning_rate": 5e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7537270784378052, + "num_tokens": 247873549.0, + "step": 9582 + }, + { + "epoch": 1.0523830441467164, + "grad_norm": 2.033823251724243, + "learning_rate": 5e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7309609055519104, + "num_tokens": 247899016.0, + "step": 9583 + }, + { + "epoch": 1.05249286184933, + "grad_norm": 2.034686326980591, + "learning_rate": 5e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7413649559020996, + "num_tokens": 247920973.0, + "step": 9584 + }, + { + "epoch": 1.0526026795519439, + "grad_norm": 2.024028778076172, + "learning_rate": 5e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7423419952392578, + "num_tokens": 247943546.0, + "step": 9585 + }, + { + "epoch": 1.0527124972545574, + "grad_norm": 2.134382486343384, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7347379922866821, + "num_tokens": 247966473.0, + "step": 9586 + }, + { + "epoch": 1.0528223149571712, + "grad_norm": 1.8497394323349, + "learning_rate": 5e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.738473653793335, + "num_tokens": 247993020.0, + "step": 9587 + }, + { + "epoch": 1.0529321326597847, + "grad_norm": 1.9243690967559814, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7505794763565063, + "num_tokens": 248016136.0, + "step": 9588 + }, + { + "epoch": 1.0530419503623984, + "grad_norm": 1.9081555604934692, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.759271502494812, + "num_tokens": 248041624.0, + "step": 9589 + }, + { + "epoch": 1.053151768065012, + "grad_norm": 1.8407589197158813, + "learning_rate": 5e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.726294994354248, + "num_tokens": 248069087.0, + "step": 9590 + }, + { + "epoch": 1.0532615857676257, + "grad_norm": 1.8607748746871948, + "learning_rate": 5e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7327328324317932, + "num_tokens": 248096481.0, + "step": 9591 + }, + { + "epoch": 1.0533714034702395, + "grad_norm": 1.792853832244873, + "learning_rate": 5e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7109569311141968, + "num_tokens": 248130500.0, + "step": 9592 + }, + { + "epoch": 1.053481221172853, + "grad_norm": 1.710489273071289, + "learning_rate": 5e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7554618120193481, + "num_tokens": 248157613.0, + "step": 9593 + }, + { + "epoch": 1.0535910388754668, + "grad_norm": 1.8754597902297974, + "learning_rate": 5e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7364981174468994, + "num_tokens": 248183137.0, + "step": 9594 + }, + { + "epoch": 1.0537008565780803, + "grad_norm": 1.9204579591751099, + "learning_rate": 5e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7292265295982361, + "num_tokens": 248207918.0, + "step": 9595 + }, + { + "epoch": 1.053810674280694, + "grad_norm": 2.143812417984009, + "learning_rate": 5e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.750244677066803, + "num_tokens": 248228127.0, + "step": 9596 + }, + { + "epoch": 1.0539204919833076, + "grad_norm": 1.8706649541854858, + "learning_rate": 5e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7369726896286011, + "num_tokens": 248252186.0, + "step": 9597 + }, + { + "epoch": 1.0540303096859214, + "grad_norm": 2.0697598457336426, + "learning_rate": 5e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7249369025230408, + "num_tokens": 248275728.0, + "step": 9598 + }, + { + "epoch": 1.0541401273885351, + "grad_norm": 2.2045390605926514, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7357565760612488, + "num_tokens": 248297425.0, + "step": 9599 + }, + { + "epoch": 1.0542499450911487, + "grad_norm": 2.007615566253662, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7375514507293701, + "num_tokens": 248320308.0, + "step": 9600 + }, + { + "epoch": 1.0543597627937624, + "grad_norm": 1.9544576406478882, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7287877798080444, + "num_tokens": 248346028.0, + "step": 9601 + }, + { + "epoch": 1.054469580496376, + "grad_norm": 1.663011908531189, + "learning_rate": 5e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.730372965335846, + "num_tokens": 248377588.0, + "step": 9602 + }, + { + "epoch": 1.0545793981989897, + "grad_norm": 1.8017796277999878, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7259304523468018, + "num_tokens": 248404480.0, + "step": 9603 + }, + { + "epoch": 1.0546892159016032, + "grad_norm": 1.9415887594223022, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7445715665817261, + "num_tokens": 248427582.0, + "step": 9604 + }, + { + "epoch": 1.054799033604217, + "grad_norm": 2.013077735900879, + "learning_rate": 5e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7245796322822571, + "num_tokens": 248450845.0, + "step": 9605 + }, + { + "epoch": 1.0549088513068308, + "grad_norm": 1.9432164430618286, + "learning_rate": 5e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7429836988449097, + "num_tokens": 248471761.0, + "step": 9606 + }, + { + "epoch": 1.0550186690094443, + "grad_norm": 1.7721843719482422, + "learning_rate": 5e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7221285104751587, + "num_tokens": 248501304.0, + "step": 9607 + }, + { + "epoch": 1.055128486712058, + "grad_norm": 1.7903566360473633, + "learning_rate": 5e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7494041323661804, + "num_tokens": 248525799.0, + "step": 9608 + }, + { + "epoch": 1.0552383044146716, + "grad_norm": 1.913650631904602, + "learning_rate": 5e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7484683990478516, + "num_tokens": 248549758.0, + "step": 9609 + }, + { + "epoch": 1.0553481221172853, + "grad_norm": 2.043016195297241, + "learning_rate": 5e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7380548715591431, + "num_tokens": 248573535.0, + "step": 9610 + }, + { + "epoch": 1.0554579398198989, + "grad_norm": 1.8531972169876099, + "learning_rate": 5e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7006852626800537, + "num_tokens": 248604852.0, + "step": 9611 + }, + { + "epoch": 1.0555677575225126, + "grad_norm": 2.022759437561035, + "learning_rate": 5e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7308138012886047, + "num_tokens": 248627944.0, + "step": 9612 + }, + { + "epoch": 1.0556775752251264, + "grad_norm": 1.986491322517395, + "learning_rate": 5e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.7451838254928589, + "num_tokens": 248651271.0, + "step": 9613 + }, + { + "epoch": 1.05578739292774, + "grad_norm": 1.93600594997406, + "learning_rate": 5e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7360952496528625, + "num_tokens": 248677040.0, + "step": 9614 + }, + { + "epoch": 1.0558972106303537, + "grad_norm": 2.0210928916931152, + "learning_rate": 5e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.727692186832428, + "num_tokens": 248701504.0, + "step": 9615 + }, + { + "epoch": 1.0560070283329672, + "grad_norm": 2.007915735244751, + "learning_rate": 5e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7190157175064087, + "num_tokens": 248728807.0, + "step": 9616 + }, + { + "epoch": 1.056116846035581, + "grad_norm": 1.8413140773773193, + "learning_rate": 5e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7505801916122437, + "num_tokens": 248754076.0, + "step": 9617 + }, + { + "epoch": 1.0562266637381945, + "grad_norm": 1.7990473508834839, + "learning_rate": 5e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.7500649690628052, + "num_tokens": 248781719.0, + "step": 9618 + }, + { + "epoch": 1.0563364814408083, + "grad_norm": 1.7687788009643555, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7561933994293213, + "num_tokens": 248808311.0, + "step": 9619 + }, + { + "epoch": 1.0564462991434218, + "grad_norm": 1.9031022787094116, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7297290563583374, + "num_tokens": 248837878.0, + "step": 9620 + }, + { + "epoch": 1.0565561168460356, + "grad_norm": 1.850801944732666, + "learning_rate": 5e-06, + "loss": 0.6834, + "mean_token_accuracy": 0.7730435132980347, + "num_tokens": 248863410.0, + "step": 9621 + }, + { + "epoch": 1.0566659345486493, + "grad_norm": 1.8396246433258057, + "learning_rate": 5e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7031973600387573, + "num_tokens": 248894320.0, + "step": 9622 + }, + { + "epoch": 1.0567757522512629, + "grad_norm": 1.6803547143936157, + "learning_rate": 5e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7214270234107971, + "num_tokens": 248928032.0, + "step": 9623 + }, + { + "epoch": 1.0568855699538766, + "grad_norm": 2.0387868881225586, + "learning_rate": 5e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7348932027816772, + "num_tokens": 248950737.0, + "step": 9624 + }, + { + "epoch": 1.0569953876564901, + "grad_norm": 1.790242075920105, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7264872193336487, + "num_tokens": 248977832.0, + "step": 9625 + }, + { + "epoch": 1.057105205359104, + "grad_norm": 2.0063345432281494, + "learning_rate": 5e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7444134950637817, + "num_tokens": 249003791.0, + "step": 9626 + }, + { + "epoch": 1.0572150230617177, + "grad_norm": 1.6619229316711426, + "learning_rate": 5e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.715192973613739, + "num_tokens": 249036695.0, + "step": 9627 + }, + { + "epoch": 1.0573248407643312, + "grad_norm": 2.028813600540161, + "learning_rate": 5e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.740312933921814, + "num_tokens": 249058591.0, + "step": 9628 + }, + { + "epoch": 1.057434658466945, + "grad_norm": 1.7494205236434937, + "learning_rate": 5e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7355812191963196, + "num_tokens": 249087295.0, + "step": 9629 + }, + { + "epoch": 1.0575444761695585, + "grad_norm": 1.7389979362487793, + "learning_rate": 5e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7304798364639282, + "num_tokens": 249115676.0, + "step": 9630 + }, + { + "epoch": 1.0576542938721722, + "grad_norm": 1.9788752794265747, + "learning_rate": 5e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7264776229858398, + "num_tokens": 249145190.0, + "step": 9631 + }, + { + "epoch": 1.0577641115747858, + "grad_norm": 2.02117657661438, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7278653979301453, + "num_tokens": 249169593.0, + "step": 9632 + }, + { + "epoch": 1.0578739292773995, + "grad_norm": 1.8931217193603516, + "learning_rate": 5e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7359027862548828, + "num_tokens": 249196046.0, + "step": 9633 + }, + { + "epoch": 1.057983746980013, + "grad_norm": 1.8999580144882202, + "learning_rate": 5e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.734565258026123, + "num_tokens": 249222517.0, + "step": 9634 + }, + { + "epoch": 1.0580935646826268, + "grad_norm": 1.743547797203064, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7245503664016724, + "num_tokens": 249252720.0, + "step": 9635 + }, + { + "epoch": 1.0582033823852406, + "grad_norm": 1.9889074563980103, + "learning_rate": 5e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7359681725502014, + "num_tokens": 249277385.0, + "step": 9636 + }, + { + "epoch": 1.0583132000878541, + "grad_norm": 2.010523557662964, + "learning_rate": 5e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7523189783096313, + "num_tokens": 249299945.0, + "step": 9637 + }, + { + "epoch": 1.0584230177904679, + "grad_norm": 2.0757453441619873, + "learning_rate": 5e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7529644966125488, + "num_tokens": 249319780.0, + "step": 9638 + }, + { + "epoch": 1.0585328354930814, + "grad_norm": 1.9387840032577515, + "learning_rate": 5e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7382583022117615, + "num_tokens": 249344262.0, + "step": 9639 + }, + { + "epoch": 1.0586426531956952, + "grad_norm": 2.015003204345703, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7265607118606567, + "num_tokens": 249366076.0, + "step": 9640 + }, + { + "epoch": 1.0587524708983087, + "grad_norm": 1.8226630687713623, + "learning_rate": 5e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7582852840423584, + "num_tokens": 249390173.0, + "step": 9641 + }, + { + "epoch": 1.0588622886009225, + "grad_norm": 1.8296502828598022, + "learning_rate": 5e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7332810163497925, + "num_tokens": 249415609.0, + "step": 9642 + }, + { + "epoch": 1.0589721063035362, + "grad_norm": 1.870154857635498, + "learning_rate": 5e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7476674318313599, + "num_tokens": 249439813.0, + "step": 9643 + }, + { + "epoch": 1.0590819240061498, + "grad_norm": 1.782639503479004, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7342035174369812, + "num_tokens": 249468374.0, + "step": 9644 + }, + { + "epoch": 1.0591917417087635, + "grad_norm": 2.137105941772461, + "learning_rate": 5e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7198778390884399, + "num_tokens": 249493840.0, + "step": 9645 + }, + { + "epoch": 1.059301559411377, + "grad_norm": 1.9790982007980347, + "learning_rate": 5e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7329909801483154, + "num_tokens": 249517355.0, + "step": 9646 + }, + { + "epoch": 1.0594113771139908, + "grad_norm": 2.032048463821411, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7379536032676697, + "num_tokens": 249541566.0, + "step": 9647 + }, + { + "epoch": 1.0595211948166043, + "grad_norm": 1.914945363998413, + "learning_rate": 5e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7242732644081116, + "num_tokens": 249569661.0, + "step": 9648 + }, + { + "epoch": 1.059631012519218, + "grad_norm": 1.986362338066101, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7334162592887878, + "num_tokens": 249594704.0, + "step": 9649 + }, + { + "epoch": 1.0597408302218319, + "grad_norm": 1.9028360843658447, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7433523535728455, + "num_tokens": 249619623.0, + "step": 9650 + }, + { + "epoch": 1.0598506479244454, + "grad_norm": 1.7759088277816772, + "learning_rate": 5e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.7534606456756592, + "num_tokens": 249645892.0, + "step": 9651 + }, + { + "epoch": 1.0599604656270591, + "grad_norm": 1.8141194581985474, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7483577728271484, + "num_tokens": 249671526.0, + "step": 9652 + }, + { + "epoch": 1.0600702833296727, + "grad_norm": 1.6009033918380737, + "learning_rate": 5e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7525736093521118, + "num_tokens": 249701820.0, + "step": 9653 + }, + { + "epoch": 1.0601801010322864, + "grad_norm": 1.8397607803344727, + "learning_rate": 5e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7341841459274292, + "num_tokens": 249726228.0, + "step": 9654 + }, + { + "epoch": 1.0602899187349, + "grad_norm": 1.9218099117279053, + "learning_rate": 5e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7285511493682861, + "num_tokens": 249751228.0, + "step": 9655 + }, + { + "epoch": 1.0603997364375137, + "grad_norm": 1.7344260215759277, + "learning_rate": 5e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7497562170028687, + "num_tokens": 249779502.0, + "step": 9656 + }, + { + "epoch": 1.0605095541401275, + "grad_norm": 1.8808835744857788, + "learning_rate": 5e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7158030867576599, + "num_tokens": 249808001.0, + "step": 9657 + }, + { + "epoch": 1.060619371842741, + "grad_norm": 1.8067008256912231, + "learning_rate": 5e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.728407621383667, + "num_tokens": 249837695.0, + "step": 9658 + }, + { + "epoch": 1.0607291895453548, + "grad_norm": 2.1726484298706055, + "learning_rate": 5e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7294760942459106, + "num_tokens": 249859710.0, + "step": 9659 + }, + { + "epoch": 1.0608390072479683, + "grad_norm": 2.024773359298706, + "learning_rate": 5e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7384084463119507, + "num_tokens": 249881732.0, + "step": 9660 + }, + { + "epoch": 1.060948824950582, + "grad_norm": 1.8454749584197998, + "learning_rate": 5e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7472811937332153, + "num_tokens": 249907565.0, + "step": 9661 + }, + { + "epoch": 1.0610586426531956, + "grad_norm": 1.8280744552612305, + "learning_rate": 5e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7173752784729004, + "num_tokens": 249933878.0, + "step": 9662 + }, + { + "epoch": 1.0611684603558094, + "grad_norm": 1.79008948802948, + "learning_rate": 5e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7363748550415039, + "num_tokens": 249964056.0, + "step": 9663 + }, + { + "epoch": 1.0612782780584231, + "grad_norm": 1.9944260120391846, + "learning_rate": 5e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7447094917297363, + "num_tokens": 249986554.0, + "step": 9664 + }, + { + "epoch": 1.0613880957610367, + "grad_norm": 2.0766780376434326, + "learning_rate": 5e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7319773435592651, + "num_tokens": 250009444.0, + "step": 9665 + }, + { + "epoch": 1.0614979134636504, + "grad_norm": 2.0590648651123047, + "learning_rate": 5e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7306873798370361, + "num_tokens": 250030501.0, + "step": 9666 + }, + { + "epoch": 1.061607731166264, + "grad_norm": 1.712112545967102, + "learning_rate": 5e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7196646332740784, + "num_tokens": 250062352.0, + "step": 9667 + }, + { + "epoch": 1.0617175488688777, + "grad_norm": 2.3836958408355713, + "learning_rate": 5e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7402186393737793, + "num_tokens": 250080350.0, + "step": 9668 + }, + { + "epoch": 1.0618273665714912, + "grad_norm": 1.9261982440948486, + "learning_rate": 5e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7222357392311096, + "num_tokens": 250105704.0, + "step": 9669 + }, + { + "epoch": 1.061937184274105, + "grad_norm": 1.6934947967529297, + "learning_rate": 5e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7084646821022034, + "num_tokens": 250135022.0, + "step": 9670 + }, + { + "epoch": 1.0620470019767188, + "grad_norm": 1.7175171375274658, + "learning_rate": 5e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7141350507736206, + "num_tokens": 250166396.0, + "step": 9671 + }, + { + "epoch": 1.0621568196793323, + "grad_norm": 1.8544871807098389, + "learning_rate": 5e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7303770184516907, + "num_tokens": 250192688.0, + "step": 9672 + }, + { + "epoch": 1.062266637381946, + "grad_norm": 2.0061466693878174, + "learning_rate": 5e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7215121984481812, + "num_tokens": 250219156.0, + "step": 9673 + }, + { + "epoch": 1.0623764550845596, + "grad_norm": 1.7443010807037354, + "learning_rate": 5e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7279849648475647, + "num_tokens": 250248730.0, + "step": 9674 + }, + { + "epoch": 1.0624862727871733, + "grad_norm": 2.0482804775238037, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.756668210029602, + "num_tokens": 250269567.0, + "step": 9675 + }, + { + "epoch": 1.0625960904897869, + "grad_norm": 1.8846486806869507, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7543094158172607, + "num_tokens": 250295177.0, + "step": 9676 + }, + { + "epoch": 1.0627059081924006, + "grad_norm": 1.8951927423477173, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7433649301528931, + "num_tokens": 250321136.0, + "step": 9677 + }, + { + "epoch": 1.0628157258950144, + "grad_norm": 2.1797397136688232, + "learning_rate": 5e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7407925128936768, + "num_tokens": 250342286.0, + "step": 9678 + }, + { + "epoch": 1.062925543597628, + "grad_norm": 1.9066916704177856, + "learning_rate": 5e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7302895784378052, + "num_tokens": 250368400.0, + "step": 9679 + }, + { + "epoch": 1.0630353613002417, + "grad_norm": 2.062925338745117, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7444618940353394, + "num_tokens": 250390402.0, + "step": 9680 + }, + { + "epoch": 1.0631451790028552, + "grad_norm": 2.0288195610046387, + "learning_rate": 5e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7481767535209656, + "num_tokens": 250414605.0, + "step": 9681 + }, + { + "epoch": 1.063254996705469, + "grad_norm": 1.7153364419937134, + "learning_rate": 5e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7379233837127686, + "num_tokens": 250445583.0, + "step": 9682 + }, + { + "epoch": 1.0633648144080825, + "grad_norm": 1.8822580575942993, + "learning_rate": 5e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7364789247512817, + "num_tokens": 250472053.0, + "step": 9683 + }, + { + "epoch": 1.0634746321106963, + "grad_norm": 1.5478147268295288, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7307244539260864, + "num_tokens": 250505371.0, + "step": 9684 + }, + { + "epoch": 1.0635844498133098, + "grad_norm": 1.9027042388916016, + "learning_rate": 5e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7360917329788208, + "num_tokens": 250529739.0, + "step": 9685 + }, + { + "epoch": 1.0636942675159236, + "grad_norm": 1.9799067974090576, + "learning_rate": 5e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7296532988548279, + "num_tokens": 250557217.0, + "step": 9686 + }, + { + "epoch": 1.0638040852185373, + "grad_norm": 1.8099831342697144, + "learning_rate": 5e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7127983570098877, + "num_tokens": 250587138.0, + "step": 9687 + }, + { + "epoch": 1.0639139029211508, + "grad_norm": 1.8108412027359009, + "learning_rate": 5e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7551333904266357, + "num_tokens": 250613932.0, + "step": 9688 + }, + { + "epoch": 1.0640237206237646, + "grad_norm": 2.0898146629333496, + "learning_rate": 5e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.755536675453186, + "num_tokens": 250634357.0, + "step": 9689 + }, + { + "epoch": 1.0641335383263781, + "grad_norm": 1.9771655797958374, + "learning_rate": 5e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7526335120201111, + "num_tokens": 250656097.0, + "step": 9690 + }, + { + "epoch": 1.064243356028992, + "grad_norm": 2.0804340839385986, + "learning_rate": 5e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7370323538780212, + "num_tokens": 250680567.0, + "step": 9691 + }, + { + "epoch": 1.0643531737316057, + "grad_norm": 2.0090227127075195, + "learning_rate": 5e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7409014701843262, + "num_tokens": 250704004.0, + "step": 9692 + }, + { + "epoch": 1.0644629914342192, + "grad_norm": 1.931706428527832, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7401966452598572, + "num_tokens": 250729475.0, + "step": 9693 + }, + { + "epoch": 1.064572809136833, + "grad_norm": 1.6595282554626465, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7274637222290039, + "num_tokens": 250762171.0, + "step": 9694 + }, + { + "epoch": 1.0646826268394465, + "grad_norm": 1.895440936088562, + "learning_rate": 5e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7308548092842102, + "num_tokens": 250785756.0, + "step": 9695 + }, + { + "epoch": 1.0647924445420602, + "grad_norm": 1.9341741800308228, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7381432056427002, + "num_tokens": 250809663.0, + "step": 9696 + }, + { + "epoch": 1.0649022622446738, + "grad_norm": 1.9144015312194824, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7260226011276245, + "num_tokens": 250835799.0, + "step": 9697 + }, + { + "epoch": 1.0650120799472875, + "grad_norm": 1.9706456661224365, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7390034198760986, + "num_tokens": 250858855.0, + "step": 9698 + }, + { + "epoch": 1.065121897649901, + "grad_norm": 1.9597586393356323, + "learning_rate": 5e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7412618398666382, + "num_tokens": 250882085.0, + "step": 9699 + }, + { + "epoch": 1.0652317153525148, + "grad_norm": 2.068765640258789, + "learning_rate": 5e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7223819494247437, + "num_tokens": 250906763.0, + "step": 9700 + }, + { + "epoch": 1.0653415330551286, + "grad_norm": 1.8528131246566772, + "learning_rate": 5e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7146442532539368, + "num_tokens": 250933398.0, + "step": 9701 + }, + { + "epoch": 1.065451350757742, + "grad_norm": 2.0600931644439697, + "learning_rate": 5e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7585142850875854, + "num_tokens": 250954472.0, + "step": 9702 + }, + { + "epoch": 1.0655611684603559, + "grad_norm": 1.9809831380844116, + "learning_rate": 5e-06, + "loss": 0.7863, + "mean_token_accuracy": 0.7460125684738159, + "num_tokens": 250979754.0, + "step": 9703 + }, + { + "epoch": 1.0656709861629694, + "grad_norm": 2.128387928009033, + "learning_rate": 5e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7329055070877075, + "num_tokens": 251001232.0, + "step": 9704 + }, + { + "epoch": 1.0657808038655832, + "grad_norm": 1.9295217990875244, + "learning_rate": 5e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7402812242507935, + "num_tokens": 251025625.0, + "step": 9705 + }, + { + "epoch": 1.0658906215681967, + "grad_norm": 2.0134639739990234, + "learning_rate": 5e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7251005172729492, + "num_tokens": 251050329.0, + "step": 9706 + }, + { + "epoch": 1.0660004392708105, + "grad_norm": 1.8486634492874146, + "learning_rate": 5e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7330152988433838, + "num_tokens": 251077093.0, + "step": 9707 + }, + { + "epoch": 1.0661102569734242, + "grad_norm": 2.047304391860962, + "learning_rate": 5e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7238545417785645, + "num_tokens": 251099782.0, + "step": 9708 + }, + { + "epoch": 1.0662200746760377, + "grad_norm": 1.8168754577636719, + "learning_rate": 5e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7325413227081299, + "num_tokens": 251126973.0, + "step": 9709 + }, + { + "epoch": 1.0663298923786515, + "grad_norm": 1.9399323463439941, + "learning_rate": 5e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7342435717582703, + "num_tokens": 251151389.0, + "step": 9710 + }, + { + "epoch": 1.066439710081265, + "grad_norm": 1.9877099990844727, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7280914783477783, + "num_tokens": 251173658.0, + "step": 9711 + }, + { + "epoch": 1.0665495277838788, + "grad_norm": 1.7706159353256226, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.751350998878479, + "num_tokens": 251200866.0, + "step": 9712 + }, + { + "epoch": 1.0666593454864923, + "grad_norm": 1.8363901376724243, + "learning_rate": 5e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7318488359451294, + "num_tokens": 251227676.0, + "step": 9713 + }, + { + "epoch": 1.066769163189106, + "grad_norm": 1.9847044944763184, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7299875617027283, + "num_tokens": 251252604.0, + "step": 9714 + }, + { + "epoch": 1.0668789808917198, + "grad_norm": 1.9573777914047241, + "learning_rate": 5e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7343653440475464, + "num_tokens": 251278092.0, + "step": 9715 + }, + { + "epoch": 1.0669887985943334, + "grad_norm": 1.841707468032837, + "learning_rate": 5e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.73762047290802, + "num_tokens": 251306406.0, + "step": 9716 + }, + { + "epoch": 1.0670986162969471, + "grad_norm": 1.912341833114624, + "learning_rate": 5e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7379717826843262, + "num_tokens": 251333173.0, + "step": 9717 + }, + { + "epoch": 1.0672084339995607, + "grad_norm": 1.8041554689407349, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7162148952484131, + "num_tokens": 251363154.0, + "step": 9718 + }, + { + "epoch": 1.0673182517021744, + "grad_norm": 1.8152693510055542, + "learning_rate": 5e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7290765643119812, + "num_tokens": 251391334.0, + "step": 9719 + }, + { + "epoch": 1.067428069404788, + "grad_norm": 2.1291873455047607, + "learning_rate": 5e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7272974252700806, + "num_tokens": 251413803.0, + "step": 9720 + }, + { + "epoch": 1.0675378871074017, + "grad_norm": 1.8417952060699463, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7350828647613525, + "num_tokens": 251440772.0, + "step": 9721 + }, + { + "epoch": 1.0676477048100155, + "grad_norm": 1.818721890449524, + "learning_rate": 5e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7295083999633789, + "num_tokens": 251470321.0, + "step": 9722 + }, + { + "epoch": 1.067757522512629, + "grad_norm": 1.9987536668777466, + "learning_rate": 5e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7326582074165344, + "num_tokens": 251494615.0, + "step": 9723 + }, + { + "epoch": 1.0678673402152428, + "grad_norm": 2.058393716812134, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7329834699630737, + "num_tokens": 251518242.0, + "step": 9724 + }, + { + "epoch": 1.0679771579178563, + "grad_norm": 1.5923558473587036, + "learning_rate": 5e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.709068238735199, + "num_tokens": 251554408.0, + "step": 9725 + }, + { + "epoch": 1.06808697562047, + "grad_norm": 1.9829086065292358, + "learning_rate": 5e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7438149452209473, + "num_tokens": 251577370.0, + "step": 9726 + }, + { + "epoch": 1.0681967933230836, + "grad_norm": 1.7280699014663696, + "learning_rate": 5e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7305665016174316, + "num_tokens": 251607542.0, + "step": 9727 + }, + { + "epoch": 1.0683066110256974, + "grad_norm": 2.1892080307006836, + "learning_rate": 5e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7220146656036377, + "num_tokens": 251628283.0, + "step": 9728 + }, + { + "epoch": 1.068416428728311, + "grad_norm": 1.958563208580017, + "learning_rate": 5e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7444779276847839, + "num_tokens": 251653754.0, + "step": 9729 + }, + { + "epoch": 1.0685262464309246, + "grad_norm": 2.47371506690979, + "learning_rate": 5e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7395431399345398, + "num_tokens": 251670786.0, + "step": 9730 + }, + { + "epoch": 1.0686360641335384, + "grad_norm": 1.9733346700668335, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7349691390991211, + "num_tokens": 251695750.0, + "step": 9731 + }, + { + "epoch": 1.068745881836152, + "grad_norm": 2.2559738159179688, + "learning_rate": 5e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7681474685668945, + "num_tokens": 251713659.0, + "step": 9732 + }, + { + "epoch": 1.0688556995387657, + "grad_norm": 1.8376328945159912, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7427129745483398, + "num_tokens": 251737788.0, + "step": 9733 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 1.8452385663986206, + "learning_rate": 5e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7297490835189819, + "num_tokens": 251763040.0, + "step": 9734 + }, + { + "epoch": 1.069075334943993, + "grad_norm": 1.9734054803848267, + "learning_rate": 5e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7482422590255737, + "num_tokens": 251786321.0, + "step": 9735 + }, + { + "epoch": 1.0691851526466065, + "grad_norm": 1.910578966140747, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7370814085006714, + "num_tokens": 251810527.0, + "step": 9736 + }, + { + "epoch": 1.0692949703492203, + "grad_norm": 1.9204679727554321, + "learning_rate": 5e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.739415168762207, + "num_tokens": 251835623.0, + "step": 9737 + }, + { + "epoch": 1.069404788051834, + "grad_norm": 1.8195444345474243, + "learning_rate": 5e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7088765501976013, + "num_tokens": 251861456.0, + "step": 9738 + }, + { + "epoch": 1.0695146057544476, + "grad_norm": 1.9082973003387451, + "learning_rate": 5e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7461408376693726, + "num_tokens": 251887344.0, + "step": 9739 + }, + { + "epoch": 1.0696244234570613, + "grad_norm": 2.106290578842163, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7378461360931396, + "num_tokens": 251911081.0, + "step": 9740 + }, + { + "epoch": 1.0697342411596749, + "grad_norm": 1.8865317106246948, + "learning_rate": 5e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7168129086494446, + "num_tokens": 251940549.0, + "step": 9741 + }, + { + "epoch": 1.0698440588622886, + "grad_norm": 1.811565637588501, + "learning_rate": 5e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.7567726373672485, + "num_tokens": 251965460.0, + "step": 9742 + }, + { + "epoch": 1.0699538765649024, + "grad_norm": 1.9611488580703735, + "learning_rate": 5e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7581453323364258, + "num_tokens": 251988196.0, + "step": 9743 + }, + { + "epoch": 1.070063694267516, + "grad_norm": 2.1147308349609375, + "learning_rate": 5e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7236525416374207, + "num_tokens": 252011909.0, + "step": 9744 + }, + { + "epoch": 1.0701735119701297, + "grad_norm": 2.2243106365203857, + "learning_rate": 5e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7701043486595154, + "num_tokens": 252029795.0, + "step": 9745 + }, + { + "epoch": 1.0702833296727432, + "grad_norm": 1.9649782180786133, + "learning_rate": 5e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7156833410263062, + "num_tokens": 252057012.0, + "step": 9746 + }, + { + "epoch": 1.070393147375357, + "grad_norm": 1.8383334875106812, + "learning_rate": 5e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7114096879959106, + "num_tokens": 252085312.0, + "step": 9747 + }, + { + "epoch": 1.0705029650779705, + "grad_norm": 2.006755828857422, + "learning_rate": 5e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7486366629600525, + "num_tokens": 252108547.0, + "step": 9748 + }, + { + "epoch": 1.0706127827805842, + "grad_norm": 1.6403582096099854, + "learning_rate": 5e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7399130463600159, + "num_tokens": 252142942.0, + "step": 9749 + }, + { + "epoch": 1.0707226004831978, + "grad_norm": 1.8119105100631714, + "learning_rate": 5e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7456307411193848, + "num_tokens": 252170295.0, + "step": 9750 + }, + { + "epoch": 1.0708324181858115, + "grad_norm": 1.7957500219345093, + "learning_rate": 5e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.717047929763794, + "num_tokens": 252200651.0, + "step": 9751 + }, + { + "epoch": 1.0709422358884253, + "grad_norm": 1.8149789571762085, + "learning_rate": 5e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.726970911026001, + "num_tokens": 252226153.0, + "step": 9752 + }, + { + "epoch": 1.0710520535910388, + "grad_norm": 1.9044299125671387, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7307573556900024, + "num_tokens": 252251215.0, + "step": 9753 + }, + { + "epoch": 1.0711618712936526, + "grad_norm": 1.8595836162567139, + "learning_rate": 5e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7138954401016235, + "num_tokens": 252280259.0, + "step": 9754 + }, + { + "epoch": 1.0712716889962661, + "grad_norm": 1.9052166938781738, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7265886068344116, + "num_tokens": 252306145.0, + "step": 9755 + }, + { + "epoch": 1.0713815066988799, + "grad_norm": 1.749887466430664, + "learning_rate": 5e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7540075778961182, + "num_tokens": 252334332.0, + "step": 9756 + }, + { + "epoch": 1.0714913244014934, + "grad_norm": 1.9517742395401, + "learning_rate": 5e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7486948370933533, + "num_tokens": 252357351.0, + "step": 9757 + }, + { + "epoch": 1.0716011421041072, + "grad_norm": 1.6486880779266357, + "learning_rate": 5e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.720046877861023, + "num_tokens": 252389272.0, + "step": 9758 + }, + { + "epoch": 1.071710959806721, + "grad_norm": 1.5871272087097168, + "learning_rate": 5e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7549439668655396, + "num_tokens": 252420339.0, + "step": 9759 + }, + { + "epoch": 1.0718207775093345, + "grad_norm": 1.7417895793914795, + "learning_rate": 5e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7146139740943909, + "num_tokens": 252449909.0, + "step": 9760 + }, + { + "epoch": 1.0719305952119482, + "grad_norm": 1.8951877355575562, + "learning_rate": 5e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7530053853988647, + "num_tokens": 252474349.0, + "step": 9761 + }, + { + "epoch": 1.0720404129145618, + "grad_norm": 1.9463458061218262, + "learning_rate": 5e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7380583882331848, + "num_tokens": 252496514.0, + "step": 9762 + }, + { + "epoch": 1.0721502306171755, + "grad_norm": 1.8510980606079102, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7349730730056763, + "num_tokens": 252521591.0, + "step": 9763 + }, + { + "epoch": 1.072260048319789, + "grad_norm": 1.7262581586837769, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7349586486816406, + "num_tokens": 252550658.0, + "step": 9764 + }, + { + "epoch": 1.0723698660224028, + "grad_norm": 1.9522303342819214, + "learning_rate": 5e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.7395788431167603, + "num_tokens": 252575058.0, + "step": 9765 + }, + { + "epoch": 1.0724796837250166, + "grad_norm": 1.8626652956008911, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.738701581954956, + "num_tokens": 252601482.0, + "step": 9766 + }, + { + "epoch": 1.07258950142763, + "grad_norm": 2.0759854316711426, + "learning_rate": 5e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7394990921020508, + "num_tokens": 252626671.0, + "step": 9767 + }, + { + "epoch": 1.0726993191302439, + "grad_norm": 2.3533220291137695, + "learning_rate": 5e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7247883081436157, + "num_tokens": 252648098.0, + "step": 9768 + }, + { + "epoch": 1.0728091368328574, + "grad_norm": 1.8577423095703125, + "learning_rate": 5e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7194093465805054, + "num_tokens": 252674309.0, + "step": 9769 + }, + { + "epoch": 1.0729189545354711, + "grad_norm": 1.9985626935958862, + "learning_rate": 5e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.739067554473877, + "num_tokens": 252696866.0, + "step": 9770 + }, + { + "epoch": 1.0730287722380847, + "grad_norm": 1.6876543760299683, + "learning_rate": 5e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7087982892990112, + "num_tokens": 252727980.0, + "step": 9771 + }, + { + "epoch": 1.0731385899406984, + "grad_norm": 1.869910478591919, + "learning_rate": 5e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7301663160324097, + "num_tokens": 252754532.0, + "step": 9772 + }, + { + "epoch": 1.0732484076433122, + "grad_norm": 1.7983993291854858, + "learning_rate": 5e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7609628438949585, + "num_tokens": 252782501.0, + "step": 9773 + }, + { + "epoch": 1.0733582253459257, + "grad_norm": 1.690298080444336, + "learning_rate": 5e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7182176113128662, + "num_tokens": 252815046.0, + "step": 9774 + }, + { + "epoch": 1.0734680430485395, + "grad_norm": 2.2760210037231445, + "learning_rate": 5e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.7474709749221802, + "num_tokens": 252833782.0, + "step": 9775 + }, + { + "epoch": 1.073577860751153, + "grad_norm": 1.8878393173217773, + "learning_rate": 5e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7574800252914429, + "num_tokens": 252857601.0, + "step": 9776 + }, + { + "epoch": 1.0736876784537668, + "grad_norm": 1.5486061573028564, + "learning_rate": 5e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7580997943878174, + "num_tokens": 252890154.0, + "step": 9777 + }, + { + "epoch": 1.0737974961563803, + "grad_norm": 1.8881415128707886, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7152551412582397, + "num_tokens": 252922145.0, + "step": 9778 + }, + { + "epoch": 1.073907313858994, + "grad_norm": 2.1474757194519043, + "learning_rate": 5e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7232751846313477, + "num_tokens": 252945377.0, + "step": 9779 + }, + { + "epoch": 1.0740171315616078, + "grad_norm": 1.8020137548446655, + "learning_rate": 5e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.725184440612793, + "num_tokens": 252975229.0, + "step": 9780 + }, + { + "epoch": 1.0741269492642214, + "grad_norm": 1.8498495817184448, + "learning_rate": 5e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.747583270072937, + "num_tokens": 253001095.0, + "step": 9781 + }, + { + "epoch": 1.0742367669668351, + "grad_norm": 1.757276177406311, + "learning_rate": 5e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7440033555030823, + "num_tokens": 253028834.0, + "step": 9782 + }, + { + "epoch": 1.0743465846694487, + "grad_norm": 1.8479710817337036, + "learning_rate": 5e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7395209074020386, + "num_tokens": 253056295.0, + "step": 9783 + }, + { + "epoch": 1.0744564023720624, + "grad_norm": 1.7855064868927002, + "learning_rate": 5e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7257206439971924, + "num_tokens": 253082959.0, + "step": 9784 + }, + { + "epoch": 1.074566220074676, + "grad_norm": 2.0310356616973877, + "learning_rate": 5e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7180153131484985, + "num_tokens": 253107416.0, + "step": 9785 + }, + { + "epoch": 1.0746760377772897, + "grad_norm": 1.9993242025375366, + "learning_rate": 5e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7139345407485962, + "num_tokens": 253131740.0, + "step": 9786 + }, + { + "epoch": 1.0747858554799035, + "grad_norm": 1.713544249534607, + "learning_rate": 5e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7186511158943176, + "num_tokens": 253160274.0, + "step": 9787 + }, + { + "epoch": 1.074895673182517, + "grad_norm": 1.7937201261520386, + "learning_rate": 5e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7093201875686646, + "num_tokens": 253187661.0, + "step": 9788 + }, + { + "epoch": 1.0750054908851308, + "grad_norm": 1.9103822708129883, + "learning_rate": 5e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.740541934967041, + "num_tokens": 253212915.0, + "step": 9789 + }, + { + "epoch": 1.0751153085877443, + "grad_norm": 1.8040111064910889, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7364490032196045, + "num_tokens": 253239676.0, + "step": 9790 + }, + { + "epoch": 1.075225126290358, + "grad_norm": 2.0825893878936768, + "learning_rate": 5e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7486938238143921, + "num_tokens": 253260082.0, + "step": 9791 + }, + { + "epoch": 1.0753349439929716, + "grad_norm": 1.7299448251724243, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7267139554023743, + "num_tokens": 253290187.0, + "step": 9792 + }, + { + "epoch": 1.0754447616955853, + "grad_norm": 1.920351505279541, + "learning_rate": 5e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7526984214782715, + "num_tokens": 253314366.0, + "step": 9793 + }, + { + "epoch": 1.075554579398199, + "grad_norm": 1.9490623474121094, + "learning_rate": 5e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7306059002876282, + "num_tokens": 253345368.0, + "step": 9794 + }, + { + "epoch": 1.0756643971008126, + "grad_norm": 1.8347762823104858, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7454853653907776, + "num_tokens": 253370602.0, + "step": 9795 + }, + { + "epoch": 1.0757742148034264, + "grad_norm": 1.9717024564743042, + "learning_rate": 5e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7279258370399475, + "num_tokens": 253396374.0, + "step": 9796 + }, + { + "epoch": 1.07588403250604, + "grad_norm": 2.0158441066741943, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.740168571472168, + "num_tokens": 253419434.0, + "step": 9797 + }, + { + "epoch": 1.0759938502086537, + "grad_norm": 2.1033971309661865, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7333023548126221, + "num_tokens": 253442232.0, + "step": 9798 + }, + { + "epoch": 1.0761036679112672, + "grad_norm": 1.9343540668487549, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7236998677253723, + "num_tokens": 253467675.0, + "step": 9799 + }, + { + "epoch": 1.076213485613881, + "grad_norm": 1.6601308584213257, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7519505620002747, + "num_tokens": 253496129.0, + "step": 9800 + }, + { + "epoch": 1.0763233033164945, + "grad_norm": 1.9406766891479492, + "learning_rate": 5e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7240542769432068, + "num_tokens": 253521369.0, + "step": 9801 + }, + { + "epoch": 1.0764331210191083, + "grad_norm": 2.0496044158935547, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7315534353256226, + "num_tokens": 253544864.0, + "step": 9802 + }, + { + "epoch": 1.076542938721722, + "grad_norm": 1.8448970317840576, + "learning_rate": 5e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7383485436439514, + "num_tokens": 253576106.0, + "step": 9803 + }, + { + "epoch": 1.0766527564243356, + "grad_norm": 2.018951654434204, + "learning_rate": 5e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7347794771194458, + "num_tokens": 253600204.0, + "step": 9804 + }, + { + "epoch": 1.0767625741269493, + "grad_norm": 1.7504327297210693, + "learning_rate": 5e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7518417239189148, + "num_tokens": 253628463.0, + "step": 9805 + }, + { + "epoch": 1.0768723918295628, + "grad_norm": 1.8352359533309937, + "learning_rate": 5e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7121567726135254, + "num_tokens": 253655806.0, + "step": 9806 + }, + { + "epoch": 1.0769822095321766, + "grad_norm": 1.9220761060714722, + "learning_rate": 5e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7315536737442017, + "num_tokens": 253680957.0, + "step": 9807 + }, + { + "epoch": 1.0770920272347904, + "grad_norm": 2.0162103176116943, + "learning_rate": 5e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7426799535751343, + "num_tokens": 253703697.0, + "step": 9808 + }, + { + "epoch": 1.077201844937404, + "grad_norm": 1.8767017126083374, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.748039960861206, + "num_tokens": 253730970.0, + "step": 9809 + }, + { + "epoch": 1.0773116626400177, + "grad_norm": 2.1415462493896484, + "learning_rate": 5e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7491637468338013, + "num_tokens": 253752678.0, + "step": 9810 + }, + { + "epoch": 1.0774214803426312, + "grad_norm": 1.8121260404586792, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7274986505508423, + "num_tokens": 253780797.0, + "step": 9811 + }, + { + "epoch": 1.077531298045245, + "grad_norm": 1.8815484046936035, + "learning_rate": 5e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7157381772994995, + "num_tokens": 253807100.0, + "step": 9812 + }, + { + "epoch": 1.0776411157478585, + "grad_norm": 1.9835749864578247, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7482632398605347, + "num_tokens": 253829721.0, + "step": 9813 + }, + { + "epoch": 1.0777509334504722, + "grad_norm": 1.9870240688323975, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7692121267318726, + "num_tokens": 253851996.0, + "step": 9814 + }, + { + "epoch": 1.0778607511530858, + "grad_norm": 2.076335906982422, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7366647720336914, + "num_tokens": 253873945.0, + "step": 9815 + }, + { + "epoch": 1.0779705688556995, + "grad_norm": 2.180272102355957, + "learning_rate": 5e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7218879461288452, + "num_tokens": 253896983.0, + "step": 9816 + }, + { + "epoch": 1.0780803865583133, + "grad_norm": 2.051330089569092, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7306185364723206, + "num_tokens": 253922257.0, + "step": 9817 + }, + { + "epoch": 1.0781902042609268, + "grad_norm": 1.7203233242034912, + "learning_rate": 5e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7323222160339355, + "num_tokens": 253950821.0, + "step": 9818 + }, + { + "epoch": 1.0783000219635406, + "grad_norm": 1.836642861366272, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7518383264541626, + "num_tokens": 253979001.0, + "step": 9819 + }, + { + "epoch": 1.0784098396661541, + "grad_norm": 1.9907116889953613, + "learning_rate": 5e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7392736673355103, + "num_tokens": 254003245.0, + "step": 9820 + }, + { + "epoch": 1.0785196573687679, + "grad_norm": 1.7593275308609009, + "learning_rate": 5e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7276009321212769, + "num_tokens": 254030282.0, + "step": 9821 + }, + { + "epoch": 1.0786294750713814, + "grad_norm": 1.7821226119995117, + "learning_rate": 5e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7222315073013306, + "num_tokens": 254058063.0, + "step": 9822 + }, + { + "epoch": 1.0787392927739952, + "grad_norm": 1.8056315183639526, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.744343638420105, + "num_tokens": 254084737.0, + "step": 9823 + }, + { + "epoch": 1.078849110476609, + "grad_norm": 1.8654497861862183, + "learning_rate": 5e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7355101108551025, + "num_tokens": 254111393.0, + "step": 9824 + }, + { + "epoch": 1.0789589281792225, + "grad_norm": 1.8421930074691772, + "learning_rate": 5e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7218688726425171, + "num_tokens": 254140962.0, + "step": 9825 + }, + { + "epoch": 1.0790687458818362, + "grad_norm": 2.1568617820739746, + "learning_rate": 5e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.744285523891449, + "num_tokens": 254161215.0, + "step": 9826 + }, + { + "epoch": 1.0791785635844497, + "grad_norm": 2.0542826652526855, + "learning_rate": 5e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.744150698184967, + "num_tokens": 254183570.0, + "step": 9827 + }, + { + "epoch": 1.0792883812870635, + "grad_norm": 2.0166165828704834, + "learning_rate": 5e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7567617893218994, + "num_tokens": 254205257.0, + "step": 9828 + }, + { + "epoch": 1.079398198989677, + "grad_norm": 1.8232433795928955, + "learning_rate": 5e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.739813506603241, + "num_tokens": 254234500.0, + "step": 9829 + }, + { + "epoch": 1.0795080166922908, + "grad_norm": 1.852347731590271, + "learning_rate": 5e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7381502389907837, + "num_tokens": 254262326.0, + "step": 9830 + }, + { + "epoch": 1.0796178343949046, + "grad_norm": 2.16565203666687, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7264444828033447, + "num_tokens": 254285912.0, + "step": 9831 + }, + { + "epoch": 1.079727652097518, + "grad_norm": 1.9906922578811646, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7511112689971924, + "num_tokens": 254307490.0, + "step": 9832 + }, + { + "epoch": 1.0798374698001318, + "grad_norm": 1.9468015432357788, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.730488121509552, + "num_tokens": 254334347.0, + "step": 9833 + }, + { + "epoch": 1.0799472875027454, + "grad_norm": 1.8817628622055054, + "learning_rate": 5e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7440696358680725, + "num_tokens": 254362321.0, + "step": 9834 + }, + { + "epoch": 1.0800571052053591, + "grad_norm": 1.9265245199203491, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7498998641967773, + "num_tokens": 254386432.0, + "step": 9835 + }, + { + "epoch": 1.0801669229079727, + "grad_norm": 1.9009995460510254, + "learning_rate": 5e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.735651969909668, + "num_tokens": 254412616.0, + "step": 9836 + }, + { + "epoch": 1.0802767406105864, + "grad_norm": 1.687566876411438, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7317600250244141, + "num_tokens": 254443963.0, + "step": 9837 + }, + { + "epoch": 1.0803865583132002, + "grad_norm": 1.885411262512207, + "learning_rate": 5e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7288133502006531, + "num_tokens": 254469409.0, + "step": 9838 + }, + { + "epoch": 1.0804963760158137, + "grad_norm": 1.7396060228347778, + "learning_rate": 5e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7430312037467957, + "num_tokens": 254496495.0, + "step": 9839 + }, + { + "epoch": 1.0806061937184275, + "grad_norm": 2.006633758544922, + "learning_rate": 5e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7218570709228516, + "num_tokens": 254519389.0, + "step": 9840 + }, + { + "epoch": 1.080716011421041, + "grad_norm": 1.8912729024887085, + "learning_rate": 5e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7253299951553345, + "num_tokens": 254544804.0, + "step": 9841 + }, + { + "epoch": 1.0808258291236548, + "grad_norm": 1.933397650718689, + "learning_rate": 5e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7515429258346558, + "num_tokens": 254567955.0, + "step": 9842 + }, + { + "epoch": 1.0809356468262683, + "grad_norm": 2.0055606365203857, + "learning_rate": 5e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7473125457763672, + "num_tokens": 254592158.0, + "step": 9843 + }, + { + "epoch": 1.081045464528882, + "grad_norm": 1.7387534379959106, + "learning_rate": 5e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7410566210746765, + "num_tokens": 254620227.0, + "step": 9844 + }, + { + "epoch": 1.0811552822314958, + "grad_norm": 1.9486229419708252, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7279184460639954, + "num_tokens": 254643846.0, + "step": 9845 + }, + { + "epoch": 1.0812650999341094, + "grad_norm": 1.7774016857147217, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7364920377731323, + "num_tokens": 254672142.0, + "step": 9846 + }, + { + "epoch": 1.081374917636723, + "grad_norm": 1.6808522939682007, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7346038818359375, + "num_tokens": 254703357.0, + "step": 9847 + }, + { + "epoch": 1.0814847353393366, + "grad_norm": 1.8346343040466309, + "learning_rate": 5e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7222301959991455, + "num_tokens": 254728553.0, + "step": 9848 + }, + { + "epoch": 1.0815945530419504, + "grad_norm": 1.9979791641235352, + "learning_rate": 5e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7295960783958435, + "num_tokens": 254752207.0, + "step": 9849 + }, + { + "epoch": 1.081704370744564, + "grad_norm": 1.8435778617858887, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7317599058151245, + "num_tokens": 254781904.0, + "step": 9850 + }, + { + "epoch": 1.0818141884471777, + "grad_norm": 1.8739725351333618, + "learning_rate": 5e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7499690651893616, + "num_tokens": 254808325.0, + "step": 9851 + }, + { + "epoch": 1.0819240061497912, + "grad_norm": 2.01023006439209, + "learning_rate": 5e-06, + "loss": 0.781, + "mean_token_accuracy": 0.757023811340332, + "num_tokens": 254832542.0, + "step": 9852 + }, + { + "epoch": 1.082033823852405, + "grad_norm": 1.794727087020874, + "learning_rate": 5e-06, + "loss": 0.7821, + "mean_token_accuracy": 0.7418665885925293, + "num_tokens": 254860662.0, + "step": 9853 + }, + { + "epoch": 1.0821436415550187, + "grad_norm": 2.1202778816223145, + "learning_rate": 5e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7626348733901978, + "num_tokens": 254880148.0, + "step": 9854 + }, + { + "epoch": 1.0822534592576323, + "grad_norm": 1.6676617860794067, + "learning_rate": 5e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7313050031661987, + "num_tokens": 254915277.0, + "step": 9855 + }, + { + "epoch": 1.082363276960246, + "grad_norm": 1.7332940101623535, + "learning_rate": 5e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7251080870628357, + "num_tokens": 254944661.0, + "step": 9856 + }, + { + "epoch": 1.0824730946628596, + "grad_norm": 2.198303699493408, + "learning_rate": 5e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7645448446273804, + "num_tokens": 254964660.0, + "step": 9857 + }, + { + "epoch": 1.0825829123654733, + "grad_norm": 1.9179311990737915, + "learning_rate": 5e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7301869988441467, + "num_tokens": 254993349.0, + "step": 9858 + }, + { + "epoch": 1.082692730068087, + "grad_norm": 2.1580874919891357, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7410552501678467, + "num_tokens": 255015586.0, + "step": 9859 + }, + { + "epoch": 1.0828025477707006, + "grad_norm": 2.0492405891418457, + "learning_rate": 5e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7429177761077881, + "num_tokens": 255036886.0, + "step": 9860 + }, + { + "epoch": 1.0829123654733144, + "grad_norm": 2.2035651206970215, + "learning_rate": 5e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7124979496002197, + "num_tokens": 255059680.0, + "step": 9861 + }, + { + "epoch": 1.083022183175928, + "grad_norm": 1.9506118297576904, + "learning_rate": 5e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.745927631855011, + "num_tokens": 255085461.0, + "step": 9862 + }, + { + "epoch": 1.0831320008785417, + "grad_norm": 2.2160520553588867, + "learning_rate": 5e-06, + "loss": 0.74, + "mean_token_accuracy": 0.75765061378479, + "num_tokens": 255104096.0, + "step": 9863 + }, + { + "epoch": 1.0832418185811552, + "grad_norm": 1.921954870223999, + "learning_rate": 5e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.7615333199501038, + "num_tokens": 255127598.0, + "step": 9864 + }, + { + "epoch": 1.083351636283769, + "grad_norm": 1.9697731733322144, + "learning_rate": 5e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7711706161499023, + "num_tokens": 255150707.0, + "step": 9865 + }, + { + "epoch": 1.0834614539863825, + "grad_norm": 2.0226995944976807, + "learning_rate": 5e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7415074706077576, + "num_tokens": 255173570.0, + "step": 9866 + }, + { + "epoch": 1.0835712716889963, + "grad_norm": 1.6345152854919434, + "learning_rate": 5e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7052301168441772, + "num_tokens": 255206045.0, + "step": 9867 + }, + { + "epoch": 1.08368108939161, + "grad_norm": 2.0713584423065186, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7620084881782532, + "num_tokens": 255226817.0, + "step": 9868 + }, + { + "epoch": 1.0837909070942235, + "grad_norm": 1.8368520736694336, + "learning_rate": 5e-06, + "loss": 0.6583, + "mean_token_accuracy": 0.7813522815704346, + "num_tokens": 255252237.0, + "step": 9869 + }, + { + "epoch": 1.0839007247968373, + "grad_norm": 1.9988155364990234, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7373420000076294, + "num_tokens": 255279142.0, + "step": 9870 + }, + { + "epoch": 1.0840105424994508, + "grad_norm": 2.2617154121398926, + "learning_rate": 5e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.754199206829071, + "num_tokens": 255299319.0, + "step": 9871 + }, + { + "epoch": 1.0841203602020646, + "grad_norm": 1.8741493225097656, + "learning_rate": 5e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7317650318145752, + "num_tokens": 255325614.0, + "step": 9872 + }, + { + "epoch": 1.0842301779046783, + "grad_norm": 1.8239459991455078, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.716687023639679, + "num_tokens": 255356204.0, + "step": 9873 + }, + { + "epoch": 1.0843399956072919, + "grad_norm": 2.041773557662964, + "learning_rate": 5e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7018195390701294, + "num_tokens": 255383569.0, + "step": 9874 + }, + { + "epoch": 1.0844498133099056, + "grad_norm": 2.105180263519287, + "learning_rate": 5e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7346576452255249, + "num_tokens": 255409007.0, + "step": 9875 + }, + { + "epoch": 1.0845596310125192, + "grad_norm": 2.0603671073913574, + "learning_rate": 5e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7435053586959839, + "num_tokens": 255430404.0, + "step": 9876 + }, + { + "epoch": 1.084669448715133, + "grad_norm": 1.8338780403137207, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7366913557052612, + "num_tokens": 255458347.0, + "step": 9877 + }, + { + "epoch": 1.0847792664177465, + "grad_norm": 2.0294735431671143, + "learning_rate": 5e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7338745594024658, + "num_tokens": 255482880.0, + "step": 9878 + }, + { + "epoch": 1.0848890841203602, + "grad_norm": 1.8958759307861328, + "learning_rate": 5e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7362586855888367, + "num_tokens": 255507865.0, + "step": 9879 + }, + { + "epoch": 1.0849989018229738, + "grad_norm": 1.9828232526779175, + "learning_rate": 5e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7333838939666748, + "num_tokens": 255532126.0, + "step": 9880 + }, + { + "epoch": 1.0851087195255875, + "grad_norm": 1.9473446607589722, + "learning_rate": 5e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7162883281707764, + "num_tokens": 255557704.0, + "step": 9881 + }, + { + "epoch": 1.0852185372282013, + "grad_norm": 2.196782112121582, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7574495077133179, + "num_tokens": 255579135.0, + "step": 9882 + }, + { + "epoch": 1.0853283549308148, + "grad_norm": 1.9927706718444824, + "learning_rate": 5e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7504799365997314, + "num_tokens": 255602236.0, + "step": 9883 + }, + { + "epoch": 1.0854381726334286, + "grad_norm": 1.768604040145874, + "learning_rate": 5e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7486521601676941, + "num_tokens": 255628965.0, + "step": 9884 + }, + { + "epoch": 1.085547990336042, + "grad_norm": 1.9031113386154175, + "learning_rate": 5e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7302125096321106, + "num_tokens": 255653616.0, + "step": 9885 + }, + { + "epoch": 1.0856578080386559, + "grad_norm": 1.7369965314865112, + "learning_rate": 5e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.7412782907485962, + "num_tokens": 255682813.0, + "step": 9886 + }, + { + "epoch": 1.0857676257412694, + "grad_norm": 2.030663013458252, + "learning_rate": 5e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.745426595211029, + "num_tokens": 255707379.0, + "step": 9887 + }, + { + "epoch": 1.0858774434438832, + "grad_norm": 1.7906124591827393, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7404267191886902, + "num_tokens": 255735189.0, + "step": 9888 + }, + { + "epoch": 1.085987261146497, + "grad_norm": 1.6538383960723877, + "learning_rate": 5e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7404569983482361, + "num_tokens": 255764738.0, + "step": 9889 + }, + { + "epoch": 1.0860970788491104, + "grad_norm": 1.7578778266906738, + "learning_rate": 5e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7249863147735596, + "num_tokens": 255794345.0, + "step": 9890 + }, + { + "epoch": 1.0862068965517242, + "grad_norm": 1.7853734493255615, + "learning_rate": 5e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7429513931274414, + "num_tokens": 255821773.0, + "step": 9891 + }, + { + "epoch": 1.0863167142543377, + "grad_norm": 2.1276724338531494, + "learning_rate": 5e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7030235528945923, + "num_tokens": 255846793.0, + "step": 9892 + }, + { + "epoch": 1.0864265319569515, + "grad_norm": 1.8490608930587769, + "learning_rate": 5e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7053309679031372, + "num_tokens": 255876044.0, + "step": 9893 + }, + { + "epoch": 1.086536349659565, + "grad_norm": 2.1224751472473145, + "learning_rate": 5e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7464535236358643, + "num_tokens": 255897493.0, + "step": 9894 + }, + { + "epoch": 1.0866461673621788, + "grad_norm": 1.7476515769958496, + "learning_rate": 5e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.728529691696167, + "num_tokens": 255926627.0, + "step": 9895 + }, + { + "epoch": 1.0867559850647925, + "grad_norm": 1.8438395261764526, + "learning_rate": 5e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7080302238464355, + "num_tokens": 255954244.0, + "step": 9896 + }, + { + "epoch": 1.086865802767406, + "grad_norm": 1.8602206707000732, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7373858690261841, + "num_tokens": 255981261.0, + "step": 9897 + }, + { + "epoch": 1.0869756204700198, + "grad_norm": 1.801985740661621, + "learning_rate": 5e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7209020853042603, + "num_tokens": 256009069.0, + "step": 9898 + }, + { + "epoch": 1.0870854381726334, + "grad_norm": 1.8048510551452637, + "learning_rate": 5e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7251028418540955, + "num_tokens": 256036587.0, + "step": 9899 + }, + { + "epoch": 1.0871952558752471, + "grad_norm": 1.8487217426300049, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7475926876068115, + "num_tokens": 256060834.0, + "step": 9900 + }, + { + "epoch": 1.0873050735778607, + "grad_norm": 1.992250919342041, + "learning_rate": 5e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7141463756561279, + "num_tokens": 256086843.0, + "step": 9901 + }, + { + "epoch": 1.0874148912804744, + "grad_norm": 2.139084577560425, + "learning_rate": 5e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7625527381896973, + "num_tokens": 256105530.0, + "step": 9902 + }, + { + "epoch": 1.0875247089830882, + "grad_norm": 1.7819477319717407, + "learning_rate": 5e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7274925708770752, + "num_tokens": 256132941.0, + "step": 9903 + }, + { + "epoch": 1.0876345266857017, + "grad_norm": 1.9636824131011963, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7339251041412354, + "num_tokens": 256159517.0, + "step": 9904 + }, + { + "epoch": 1.0877443443883155, + "grad_norm": 1.888944387435913, + "learning_rate": 5e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7366306781768799, + "num_tokens": 256185254.0, + "step": 9905 + }, + { + "epoch": 1.087854162090929, + "grad_norm": 1.7283803224563599, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7354992628097534, + "num_tokens": 256214636.0, + "step": 9906 + }, + { + "epoch": 1.0879639797935428, + "grad_norm": 1.767982006072998, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7290320992469788, + "num_tokens": 256242987.0, + "step": 9907 + }, + { + "epoch": 1.0880737974961563, + "grad_norm": 2.047313690185547, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7378942966461182, + "num_tokens": 256263645.0, + "step": 9908 + }, + { + "epoch": 1.08818361519877, + "grad_norm": 1.8367968797683716, + "learning_rate": 5e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7338577508926392, + "num_tokens": 256287222.0, + "step": 9909 + }, + { + "epoch": 1.0882934329013838, + "grad_norm": 2.09358811378479, + "learning_rate": 5e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7444074153900146, + "num_tokens": 256308896.0, + "step": 9910 + }, + { + "epoch": 1.0884032506039973, + "grad_norm": 1.7769646644592285, + "learning_rate": 5e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7147840261459351, + "num_tokens": 256340584.0, + "step": 9911 + }, + { + "epoch": 1.088513068306611, + "grad_norm": 2.040127992630005, + "learning_rate": 5e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7360789775848389, + "num_tokens": 256363236.0, + "step": 9912 + }, + { + "epoch": 1.0886228860092246, + "grad_norm": 1.6810657978057861, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.736172080039978, + "num_tokens": 256393962.0, + "step": 9913 + }, + { + "epoch": 1.0887327037118384, + "grad_norm": 1.9600168466567993, + "learning_rate": 5e-06, + "loss": 0.6771, + "mean_token_accuracy": 0.7742886543273926, + "num_tokens": 256416485.0, + "step": 9914 + }, + { + "epoch": 1.088842521414452, + "grad_norm": 1.825722336769104, + "learning_rate": 5e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7291470766067505, + "num_tokens": 256444152.0, + "step": 9915 + }, + { + "epoch": 1.0889523391170657, + "grad_norm": 2.2427713871002197, + "learning_rate": 5e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7567244172096252, + "num_tokens": 256464236.0, + "step": 9916 + }, + { + "epoch": 1.0890621568196792, + "grad_norm": 1.9276280403137207, + "learning_rate": 5e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7303515672683716, + "num_tokens": 256491148.0, + "step": 9917 + }, + { + "epoch": 1.089171974522293, + "grad_norm": 2.28233003616333, + "learning_rate": 5e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7358629703521729, + "num_tokens": 256513016.0, + "step": 9918 + }, + { + "epoch": 1.0892817922249067, + "grad_norm": 1.7994697093963623, + "learning_rate": 5e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7272060513496399, + "num_tokens": 256540999.0, + "step": 9919 + }, + { + "epoch": 1.0893916099275203, + "grad_norm": 2.015011787414551, + "learning_rate": 5e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7407384514808655, + "num_tokens": 256565022.0, + "step": 9920 + }, + { + "epoch": 1.089501427630134, + "grad_norm": 1.9719737768173218, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7313781976699829, + "num_tokens": 256589131.0, + "step": 9921 + }, + { + "epoch": 1.0896112453327476, + "grad_norm": 2.092092514038086, + "learning_rate": 5e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7530294060707092, + "num_tokens": 256610222.0, + "step": 9922 + }, + { + "epoch": 1.0897210630353613, + "grad_norm": 2.039905309677124, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7338081002235413, + "num_tokens": 256634651.0, + "step": 9923 + }, + { + "epoch": 1.089830880737975, + "grad_norm": 1.9245843887329102, + "learning_rate": 5e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7389681339263916, + "num_tokens": 256661095.0, + "step": 9924 + }, + { + "epoch": 1.0899406984405886, + "grad_norm": 1.9068206548690796, + "learning_rate": 5e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7094398736953735, + "num_tokens": 256687216.0, + "step": 9925 + }, + { + "epoch": 1.0900505161432024, + "grad_norm": 2.0139873027801514, + "learning_rate": 5e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.7416484355926514, + "num_tokens": 256708371.0, + "step": 9926 + }, + { + "epoch": 1.090160333845816, + "grad_norm": 1.9121278524398804, + "learning_rate": 5e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7370654344558716, + "num_tokens": 256734115.0, + "step": 9927 + }, + { + "epoch": 1.0902701515484297, + "grad_norm": 2.0237538814544678, + "learning_rate": 5e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7439733743667603, + "num_tokens": 256757385.0, + "step": 9928 + }, + { + "epoch": 1.0903799692510432, + "grad_norm": 1.6512144804000854, + "learning_rate": 5e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7386239171028137, + "num_tokens": 256787605.0, + "step": 9929 + }, + { + "epoch": 1.090489786953657, + "grad_norm": 1.7070412635803223, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7134138345718384, + "num_tokens": 256818066.0, + "step": 9930 + }, + { + "epoch": 1.0905996046562705, + "grad_norm": 1.9798392057418823, + "learning_rate": 5e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7579235434532166, + "num_tokens": 256841980.0, + "step": 9931 + }, + { + "epoch": 1.0907094223588842, + "grad_norm": 2.065169095993042, + "learning_rate": 5e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7155071496963501, + "num_tokens": 256866833.0, + "step": 9932 + }, + { + "epoch": 1.090819240061498, + "grad_norm": 1.8675898313522339, + "learning_rate": 5e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7282719612121582, + "num_tokens": 256895207.0, + "step": 9933 + }, + { + "epoch": 1.0909290577641115, + "grad_norm": 2.020695686340332, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7165213227272034, + "num_tokens": 256920376.0, + "step": 9934 + }, + { + "epoch": 1.0910388754667253, + "grad_norm": 1.81687331199646, + "learning_rate": 5e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7262239456176758, + "num_tokens": 256950186.0, + "step": 9935 + }, + { + "epoch": 1.0911486931693388, + "grad_norm": 1.7253044843673706, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7288777232170105, + "num_tokens": 256981444.0, + "step": 9936 + }, + { + "epoch": 1.0912585108719526, + "grad_norm": 1.8855406045913696, + "learning_rate": 5e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7490020990371704, + "num_tokens": 257005534.0, + "step": 9937 + }, + { + "epoch": 1.0913683285745663, + "grad_norm": 1.8904500007629395, + "learning_rate": 5e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7505152821540833, + "num_tokens": 257030725.0, + "step": 9938 + }, + { + "epoch": 1.0914781462771799, + "grad_norm": 2.160688638687134, + "learning_rate": 5e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7546414732933044, + "num_tokens": 257052274.0, + "step": 9939 + }, + { + "epoch": 1.0915879639797936, + "grad_norm": 1.8060437440872192, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7413746118545532, + "num_tokens": 257082227.0, + "step": 9940 + }, + { + "epoch": 1.0916977816824072, + "grad_norm": 1.8340317010879517, + "learning_rate": 5e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.72606360912323, + "num_tokens": 257109337.0, + "step": 9941 + }, + { + "epoch": 1.091807599385021, + "grad_norm": 1.7717409133911133, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7204878330230713, + "num_tokens": 257139401.0, + "step": 9942 + }, + { + "epoch": 1.0919174170876345, + "grad_norm": 1.878082513809204, + "learning_rate": 5e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7303429841995239, + "num_tokens": 257165338.0, + "step": 9943 + }, + { + "epoch": 1.0920272347902482, + "grad_norm": 2.145890235900879, + "learning_rate": 5e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7037808299064636, + "num_tokens": 257190795.0, + "step": 9944 + }, + { + "epoch": 1.0921370524928617, + "grad_norm": 1.759221076965332, + "learning_rate": 5e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7326885461807251, + "num_tokens": 257220081.0, + "step": 9945 + }, + { + "epoch": 1.0922468701954755, + "grad_norm": 2.034748077392578, + "learning_rate": 5e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7336667776107788, + "num_tokens": 257242469.0, + "step": 9946 + }, + { + "epoch": 1.0923566878980893, + "grad_norm": 1.8761171102523804, + "learning_rate": 5e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7339261770248413, + "num_tokens": 257270304.0, + "step": 9947 + }, + { + "epoch": 1.0924665056007028, + "grad_norm": 1.8640286922454834, + "learning_rate": 5e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7456907033920288, + "num_tokens": 257295381.0, + "step": 9948 + }, + { + "epoch": 1.0925763233033166, + "grad_norm": 2.1340396404266357, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7134653329849243, + "num_tokens": 257318688.0, + "step": 9949 + }, + { + "epoch": 1.09268614100593, + "grad_norm": 1.9888458251953125, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7358347177505493, + "num_tokens": 257344918.0, + "step": 9950 + }, + { + "epoch": 1.0927959587085438, + "grad_norm": 1.9100359678268433, + "learning_rate": 5e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.736684262752533, + "num_tokens": 257371243.0, + "step": 9951 + }, + { + "epoch": 1.0929057764111574, + "grad_norm": 1.7557868957519531, + "learning_rate": 5e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7229893207550049, + "num_tokens": 257402781.0, + "step": 9952 + }, + { + "epoch": 1.0930155941137711, + "grad_norm": 2.0517914295196533, + "learning_rate": 5e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.733504056930542, + "num_tokens": 257426134.0, + "step": 9953 + }, + { + "epoch": 1.093125411816385, + "grad_norm": 1.928551435470581, + "learning_rate": 5e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7199138402938843, + "num_tokens": 257451250.0, + "step": 9954 + }, + { + "epoch": 1.0932352295189984, + "grad_norm": 2.1019575595855713, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7232236862182617, + "num_tokens": 257472128.0, + "step": 9955 + }, + { + "epoch": 1.0933450472216122, + "grad_norm": 1.7728734016418457, + "learning_rate": 5e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.6994156837463379, + "num_tokens": 257501431.0, + "step": 9956 + }, + { + "epoch": 1.0934548649242257, + "grad_norm": 1.891109585762024, + "learning_rate": 5e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7540832757949829, + "num_tokens": 257525109.0, + "step": 9957 + }, + { + "epoch": 1.0935646826268395, + "grad_norm": 1.7838813066482544, + "learning_rate": 5e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7132807374000549, + "num_tokens": 257554522.0, + "step": 9958 + }, + { + "epoch": 1.093674500329453, + "grad_norm": 2.062825918197632, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7277822494506836, + "num_tokens": 257576719.0, + "step": 9959 + }, + { + "epoch": 1.0937843180320668, + "grad_norm": 1.9483333826065063, + "learning_rate": 5e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7231597900390625, + "num_tokens": 257604886.0, + "step": 9960 + }, + { + "epoch": 1.0938941357346805, + "grad_norm": 1.7678282260894775, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7405401468276978, + "num_tokens": 257631591.0, + "step": 9961 + }, + { + "epoch": 1.094003953437294, + "grad_norm": 1.6830635070800781, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7207826972007751, + "num_tokens": 257662470.0, + "step": 9962 + }, + { + "epoch": 1.0941137711399078, + "grad_norm": 1.8748582601547241, + "learning_rate": 5e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7624537944793701, + "num_tokens": 257685621.0, + "step": 9963 + }, + { + "epoch": 1.0942235888425214, + "grad_norm": 1.6986005306243896, + "learning_rate": 5e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7650065422058105, + "num_tokens": 257712206.0, + "step": 9964 + }, + { + "epoch": 1.0943334065451351, + "grad_norm": 2.0643177032470703, + "learning_rate": 5e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7345988750457764, + "num_tokens": 257734669.0, + "step": 9965 + }, + { + "epoch": 1.0944432242477486, + "grad_norm": 2.0970375537872314, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7435428500175476, + "num_tokens": 257756528.0, + "step": 9966 + }, + { + "epoch": 1.0945530419503624, + "grad_norm": 1.8683871030807495, + "learning_rate": 5e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7606998682022095, + "num_tokens": 257780055.0, + "step": 9967 + }, + { + "epoch": 1.0946628596529762, + "grad_norm": 1.6548349857330322, + "learning_rate": 5e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7346124649047852, + "num_tokens": 257812010.0, + "step": 9968 + }, + { + "epoch": 1.0947726773555897, + "grad_norm": 1.7921322584152222, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7255349159240723, + "num_tokens": 257839244.0, + "step": 9969 + }, + { + "epoch": 1.0948824950582035, + "grad_norm": 1.6406643390655518, + "learning_rate": 5e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7041832804679871, + "num_tokens": 257874247.0, + "step": 9970 + }, + { + "epoch": 1.094992312760817, + "grad_norm": 1.8221405744552612, + "learning_rate": 5e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7412683963775635, + "num_tokens": 257904364.0, + "step": 9971 + }, + { + "epoch": 1.0951021304634307, + "grad_norm": 1.9197192192077637, + "learning_rate": 5e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7349313497543335, + "num_tokens": 257929784.0, + "step": 9972 + }, + { + "epoch": 1.0952119481660443, + "grad_norm": 1.8323928117752075, + "learning_rate": 5e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7496806383132935, + "num_tokens": 257953280.0, + "step": 9973 + }, + { + "epoch": 1.095321765868658, + "grad_norm": 2.1166553497314453, + "learning_rate": 5e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7281814813613892, + "num_tokens": 257976768.0, + "step": 9974 + }, + { + "epoch": 1.0954315835712718, + "grad_norm": 1.8793295621871948, + "learning_rate": 5e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7360377311706543, + "num_tokens": 258004407.0, + "step": 9975 + }, + { + "epoch": 1.0955414012738853, + "grad_norm": 1.9213531017303467, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7541303634643555, + "num_tokens": 258031173.0, + "step": 9976 + }, + { + "epoch": 1.095651218976499, + "grad_norm": 1.796230673789978, + "learning_rate": 5e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7304386496543884, + "num_tokens": 258062271.0, + "step": 9977 + }, + { + "epoch": 1.0957610366791126, + "grad_norm": 1.9903321266174316, + "learning_rate": 5e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7366664409637451, + "num_tokens": 258086579.0, + "step": 9978 + }, + { + "epoch": 1.0958708543817264, + "grad_norm": 1.937167763710022, + "learning_rate": 5e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7129944562911987, + "num_tokens": 258111751.0, + "step": 9979 + }, + { + "epoch": 1.09598067208434, + "grad_norm": 2.0450518131256104, + "learning_rate": 5e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7394404411315918, + "num_tokens": 258134697.0, + "step": 9980 + }, + { + "epoch": 1.0960904897869537, + "grad_norm": 2.0266659259796143, + "learning_rate": 5e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7348504066467285, + "num_tokens": 258158239.0, + "step": 9981 + }, + { + "epoch": 1.0962003074895672, + "grad_norm": 1.8708319664001465, + "learning_rate": 5e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7327576875686646, + "num_tokens": 258184512.0, + "step": 9982 + }, + { + "epoch": 1.096310125192181, + "grad_norm": 2.018792152404785, + "learning_rate": 5e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.748053252696991, + "num_tokens": 258205919.0, + "step": 9983 + }, + { + "epoch": 1.0964199428947947, + "grad_norm": 2.1423087120056152, + "learning_rate": 5e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7420653700828552, + "num_tokens": 258227182.0, + "step": 9984 + }, + { + "epoch": 1.0965297605974083, + "grad_norm": 1.850553274154663, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7443346381187439, + "num_tokens": 258253916.0, + "step": 9985 + }, + { + "epoch": 1.096639578300022, + "grad_norm": 2.012436628341675, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.754300057888031, + "num_tokens": 258275112.0, + "step": 9986 + }, + { + "epoch": 1.0967493960026355, + "grad_norm": 1.8587114810943604, + "learning_rate": 5e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7379961609840393, + "num_tokens": 258301583.0, + "step": 9987 + }, + { + "epoch": 1.0968592137052493, + "grad_norm": 1.768954873085022, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7224642038345337, + "num_tokens": 258332160.0, + "step": 9988 + }, + { + "epoch": 1.096969031407863, + "grad_norm": 2.143217086791992, + "learning_rate": 5e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7465020418167114, + "num_tokens": 258353954.0, + "step": 9989 + }, + { + "epoch": 1.0970788491104766, + "grad_norm": 1.7766722440719604, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7283488512039185, + "num_tokens": 258381701.0, + "step": 9990 + }, + { + "epoch": 1.0971886668130904, + "grad_norm": 1.676884651184082, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7180397510528564, + "num_tokens": 258414609.0, + "step": 9991 + }, + { + "epoch": 1.0972984845157039, + "grad_norm": 2.054718255996704, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7374697923660278, + "num_tokens": 258436398.0, + "step": 9992 + }, + { + "epoch": 1.0974083022183176, + "grad_norm": 2.1082546710968018, + "learning_rate": 5e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7371853590011597, + "num_tokens": 258457615.0, + "step": 9993 + }, + { + "epoch": 1.0975181199209312, + "grad_norm": 1.730002522468567, + "learning_rate": 5e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7321401834487915, + "num_tokens": 258487080.0, + "step": 9994 + }, + { + "epoch": 1.097627937623545, + "grad_norm": 2.0902979373931885, + "learning_rate": 5e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7622309327125549, + "num_tokens": 258506298.0, + "step": 9995 + }, + { + "epoch": 1.0977377553261585, + "grad_norm": 1.8618541955947876, + "learning_rate": 5e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7444496750831604, + "num_tokens": 258530882.0, + "step": 9996 + }, + { + "epoch": 1.0978475730287722, + "grad_norm": 1.684438943862915, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.737563967704773, + "num_tokens": 258562186.0, + "step": 9997 + }, + { + "epoch": 1.097957390731386, + "grad_norm": 1.631803035736084, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7209159135818481, + "num_tokens": 258596445.0, + "step": 9998 + }, + { + "epoch": 1.0980672084339995, + "grad_norm": 1.7874001264572144, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7257193922996521, + "num_tokens": 258626586.0, + "step": 9999 + }, + { + "epoch": 1.0981770261366133, + "grad_norm": 2.0077290534973145, + "learning_rate": 5e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7396327257156372, + "num_tokens": 258649089.0, + "step": 10000 + }, + { + "epoch": 1.0982868438392268, + "grad_norm": 1.885599970817566, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7455943822860718, + "num_tokens": 258673463.0, + "step": 10001 + }, + { + "epoch": 1.0983966615418406, + "grad_norm": 1.8348859548568726, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.740241289138794, + "num_tokens": 258702274.0, + "step": 10002 + }, + { + "epoch": 1.098506479244454, + "grad_norm": 2.0012283325195312, + "learning_rate": 5e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7178682088851929, + "num_tokens": 258724975.0, + "step": 10003 + }, + { + "epoch": 1.0986162969470679, + "grad_norm": 1.8414678573608398, + "learning_rate": 5e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7194700837135315, + "num_tokens": 258751629.0, + "step": 10004 + }, + { + "epoch": 1.0987261146496816, + "grad_norm": 1.955845594406128, + "learning_rate": 5e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7274532914161682, + "num_tokens": 258777205.0, + "step": 10005 + }, + { + "epoch": 1.0988359323522952, + "grad_norm": 1.9070490598678589, + "learning_rate": 5e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7054725885391235, + "num_tokens": 258804095.0, + "step": 10006 + }, + { + "epoch": 1.098945750054909, + "grad_norm": 1.8624604940414429, + "learning_rate": 5e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7137026786804199, + "num_tokens": 258830963.0, + "step": 10007 + }, + { + "epoch": 1.0990555677575224, + "grad_norm": 2.105107545852661, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.711184561252594, + "num_tokens": 258854386.0, + "step": 10008 + }, + { + "epoch": 1.0991653854601362, + "grad_norm": 1.9021435976028442, + "learning_rate": 5e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.749172568321228, + "num_tokens": 258877774.0, + "step": 10009 + }, + { + "epoch": 1.0992752031627497, + "grad_norm": 1.8340381383895874, + "learning_rate": 5e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7364028692245483, + "num_tokens": 258903786.0, + "step": 10010 + }, + { + "epoch": 1.0993850208653635, + "grad_norm": 1.7598209381103516, + "learning_rate": 5e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7217974662780762, + "num_tokens": 258932168.0, + "step": 10011 + }, + { + "epoch": 1.0994948385679773, + "grad_norm": 2.2152888774871826, + "learning_rate": 5e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7329599857330322, + "num_tokens": 258952256.0, + "step": 10012 + }, + { + "epoch": 1.0996046562705908, + "grad_norm": 1.7183746099472046, + "learning_rate": 5e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7158088684082031, + "num_tokens": 258983795.0, + "step": 10013 + }, + { + "epoch": 1.0997144739732045, + "grad_norm": 1.861270785331726, + "learning_rate": 5e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7184914946556091, + "num_tokens": 259011736.0, + "step": 10014 + }, + { + "epoch": 1.099824291675818, + "grad_norm": 1.5493981838226318, + "learning_rate": 5e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7595243453979492, + "num_tokens": 259046834.0, + "step": 10015 + }, + { + "epoch": 1.0999341093784318, + "grad_norm": 1.8630828857421875, + "learning_rate": 5e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7221609354019165, + "num_tokens": 259073754.0, + "step": 10016 + }, + { + "epoch": 1.1000439270810454, + "grad_norm": 1.9580103158950806, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7550034523010254, + "num_tokens": 259097394.0, + "step": 10017 + }, + { + "epoch": 1.1001537447836591, + "grad_norm": 1.9960341453552246, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7497168779373169, + "num_tokens": 259120438.0, + "step": 10018 + }, + { + "epoch": 1.1002635624862729, + "grad_norm": 2.1534080505371094, + "learning_rate": 5e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7382606267929077, + "num_tokens": 259140015.0, + "step": 10019 + }, + { + "epoch": 1.1003733801888864, + "grad_norm": 1.6845736503601074, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7380809783935547, + "num_tokens": 259167700.0, + "step": 10020 + }, + { + "epoch": 1.1004831978915002, + "grad_norm": 1.8949289321899414, + "learning_rate": 5e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7458365559577942, + "num_tokens": 259191047.0, + "step": 10021 + }, + { + "epoch": 1.1005930155941137, + "grad_norm": 1.894028902053833, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7317686676979065, + "num_tokens": 259215179.0, + "step": 10022 + }, + { + "epoch": 1.1007028332967275, + "grad_norm": 1.7282658815383911, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7326001524925232, + "num_tokens": 259245244.0, + "step": 10023 + }, + { + "epoch": 1.100812650999341, + "grad_norm": 1.9087504148483276, + "learning_rate": 5e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.736688494682312, + "num_tokens": 259271593.0, + "step": 10024 + }, + { + "epoch": 1.1009224687019548, + "grad_norm": 1.9806736707687378, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7334156632423401, + "num_tokens": 259294902.0, + "step": 10025 + }, + { + "epoch": 1.1010322864045685, + "grad_norm": 1.920263409614563, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.729972779750824, + "num_tokens": 259320472.0, + "step": 10026 + }, + { + "epoch": 1.101142104107182, + "grad_norm": 1.8119672536849976, + "learning_rate": 5e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7268238663673401, + "num_tokens": 259352090.0, + "step": 10027 + }, + { + "epoch": 1.1012519218097958, + "grad_norm": 2.1536827087402344, + "learning_rate": 5e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7446435689926147, + "num_tokens": 259371962.0, + "step": 10028 + }, + { + "epoch": 1.1013617395124093, + "grad_norm": 1.9644147157669067, + "learning_rate": 5e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7379324436187744, + "num_tokens": 259394007.0, + "step": 10029 + }, + { + "epoch": 1.101471557215023, + "grad_norm": 1.8330193758010864, + "learning_rate": 5e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7302104830741882, + "num_tokens": 259421820.0, + "step": 10030 + }, + { + "epoch": 1.1015813749176366, + "grad_norm": 1.9074021577835083, + "learning_rate": 5e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7312241196632385, + "num_tokens": 259446864.0, + "step": 10031 + }, + { + "epoch": 1.1016911926202504, + "grad_norm": 1.7113162279129028, + "learning_rate": 5e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7312161922454834, + "num_tokens": 259473537.0, + "step": 10032 + }, + { + "epoch": 1.1018010103228641, + "grad_norm": 2.040009021759033, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7388920187950134, + "num_tokens": 259495296.0, + "step": 10033 + }, + { + "epoch": 1.1019108280254777, + "grad_norm": 1.7786451578140259, + "learning_rate": 5e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7009491920471191, + "num_tokens": 259528197.0, + "step": 10034 + }, + { + "epoch": 1.1020206457280914, + "grad_norm": 2.08827805519104, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7421302795410156, + "num_tokens": 259548997.0, + "step": 10035 + }, + { + "epoch": 1.102130463430705, + "grad_norm": 1.8137024641036987, + "learning_rate": 5e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7146781086921692, + "num_tokens": 259578890.0, + "step": 10036 + }, + { + "epoch": 1.1022402811333187, + "grad_norm": 1.9976208209991455, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.726619303226471, + "num_tokens": 259601242.0, + "step": 10037 + }, + { + "epoch": 1.1023500988359323, + "grad_norm": 2.0689289569854736, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7308677434921265, + "num_tokens": 259623203.0, + "step": 10038 + }, + { + "epoch": 1.102459916538546, + "grad_norm": 1.8588308095932007, + "learning_rate": 5e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7505358457565308, + "num_tokens": 259649148.0, + "step": 10039 + }, + { + "epoch": 1.1025697342411598, + "grad_norm": 1.790298342704773, + "learning_rate": 5e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7026394605636597, + "num_tokens": 259678220.0, + "step": 10040 + }, + { + "epoch": 1.1026795519437733, + "grad_norm": 1.68247389793396, + "learning_rate": 5e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7713341116905212, + "num_tokens": 259706763.0, + "step": 10041 + }, + { + "epoch": 1.102789369646387, + "grad_norm": 1.9016685485839844, + "learning_rate": 5e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7326483130455017, + "num_tokens": 259735938.0, + "step": 10042 + }, + { + "epoch": 1.1028991873490006, + "grad_norm": 1.843437910079956, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7300253510475159, + "num_tokens": 259762099.0, + "step": 10043 + }, + { + "epoch": 1.1030090050516144, + "grad_norm": 1.8463616371154785, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.731232225894928, + "num_tokens": 259790400.0, + "step": 10044 + }, + { + "epoch": 1.103118822754228, + "grad_norm": 1.8392255306243896, + "learning_rate": 5e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7212247848510742, + "num_tokens": 259817006.0, + "step": 10045 + }, + { + "epoch": 1.1032286404568417, + "grad_norm": 1.9112260341644287, + "learning_rate": 5e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7541900873184204, + "num_tokens": 259841122.0, + "step": 10046 + }, + { + "epoch": 1.1033384581594552, + "grad_norm": 1.8948675394058228, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7525257468223572, + "num_tokens": 259866859.0, + "step": 10047 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 1.8046212196350098, + "learning_rate": 5e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7438948154449463, + "num_tokens": 259892330.0, + "step": 10048 + }, + { + "epoch": 1.1035580935646827, + "grad_norm": 1.9682093858718872, + "learning_rate": 5e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7435891628265381, + "num_tokens": 259917088.0, + "step": 10049 + }, + { + "epoch": 1.1036679112672962, + "grad_norm": 1.7752790451049805, + "learning_rate": 5e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7180802822113037, + "num_tokens": 259946423.0, + "step": 10050 + }, + { + "epoch": 1.10377772896991, + "grad_norm": 1.8402676582336426, + "learning_rate": 5e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7293137311935425, + "num_tokens": 259973201.0, + "step": 10051 + }, + { + "epoch": 1.1038875466725235, + "grad_norm": 2.220020055770874, + "learning_rate": 5e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.74574214220047, + "num_tokens": 259992747.0, + "step": 10052 + }, + { + "epoch": 1.1039973643751373, + "grad_norm": 1.9579929113388062, + "learning_rate": 5e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7112480401992798, + "num_tokens": 260020338.0, + "step": 10053 + }, + { + "epoch": 1.104107182077751, + "grad_norm": 1.6530531644821167, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7295730113983154, + "num_tokens": 260052808.0, + "step": 10054 + }, + { + "epoch": 1.1042169997803646, + "grad_norm": 1.9339115619659424, + "learning_rate": 5e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7245041728019714, + "num_tokens": 260079509.0, + "step": 10055 + }, + { + "epoch": 1.1043268174829783, + "grad_norm": 2.0373129844665527, + "learning_rate": 5e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.731194019317627, + "num_tokens": 260102256.0, + "step": 10056 + }, + { + "epoch": 1.1044366351855919, + "grad_norm": 1.7118488550186157, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7378124594688416, + "num_tokens": 260130472.0, + "step": 10057 + }, + { + "epoch": 1.1045464528882056, + "grad_norm": 2.046933174133301, + "learning_rate": 5e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7420716881752014, + "num_tokens": 260154558.0, + "step": 10058 + }, + { + "epoch": 1.1046562705908192, + "grad_norm": 5.316543102264404, + "learning_rate": 5e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7420530319213867, + "num_tokens": 260181377.0, + "step": 10059 + }, + { + "epoch": 1.104766088293433, + "grad_norm": 2.0055365562438965, + "learning_rate": 5e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7286909222602844, + "num_tokens": 260205055.0, + "step": 10060 + }, + { + "epoch": 1.1048759059960465, + "grad_norm": 1.7079191207885742, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7161791324615479, + "num_tokens": 260235669.0, + "step": 10061 + }, + { + "epoch": 1.1049857236986602, + "grad_norm": 1.8390007019042969, + "learning_rate": 5e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7455826997756958, + "num_tokens": 260261112.0, + "step": 10062 + }, + { + "epoch": 1.105095541401274, + "grad_norm": 1.882178544998169, + "learning_rate": 5e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.731425404548645, + "num_tokens": 260286891.0, + "step": 10063 + }, + { + "epoch": 1.1052053591038875, + "grad_norm": 1.7760417461395264, + "learning_rate": 5e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7093765735626221, + "num_tokens": 260317270.0, + "step": 10064 + }, + { + "epoch": 1.1053151768065013, + "grad_norm": 1.903523564338684, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7170964479446411, + "num_tokens": 260346818.0, + "step": 10065 + }, + { + "epoch": 1.1054249945091148, + "grad_norm": 1.660288691520691, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7553712725639343, + "num_tokens": 260377810.0, + "step": 10066 + }, + { + "epoch": 1.1055348122117286, + "grad_norm": 1.890462875366211, + "learning_rate": 5e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7161290049552917, + "num_tokens": 260405481.0, + "step": 10067 + }, + { + "epoch": 1.105644629914342, + "grad_norm": 1.9108147621154785, + "learning_rate": 5e-06, + "loss": 0.865, + "mean_token_accuracy": 0.729691207408905, + "num_tokens": 260435494.0, + "step": 10068 + }, + { + "epoch": 1.1057544476169558, + "grad_norm": 1.7949577569961548, + "learning_rate": 5e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7427138686180115, + "num_tokens": 260465750.0, + "step": 10069 + }, + { + "epoch": 1.1058642653195696, + "grad_norm": 1.9369244575500488, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7540865540504456, + "num_tokens": 260490389.0, + "step": 10070 + }, + { + "epoch": 1.1059740830221831, + "grad_norm": 2.027644157409668, + "learning_rate": 5e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7207800149917603, + "num_tokens": 260515610.0, + "step": 10071 + }, + { + "epoch": 1.106083900724797, + "grad_norm": 1.7166165113449097, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7145861387252808, + "num_tokens": 260552649.0, + "step": 10072 + }, + { + "epoch": 1.1061937184274104, + "grad_norm": 1.9195685386657715, + "learning_rate": 5e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7333681583404541, + "num_tokens": 260577116.0, + "step": 10073 + }, + { + "epoch": 1.1063035361300242, + "grad_norm": 1.7268980741500854, + "learning_rate": 5e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7273974418640137, + "num_tokens": 260607415.0, + "step": 10074 + }, + { + "epoch": 1.1064133538326377, + "grad_norm": 1.990580677986145, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7440428137779236, + "num_tokens": 260630045.0, + "step": 10075 + }, + { + "epoch": 1.1065231715352515, + "grad_norm": 2.140727996826172, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7321630716323853, + "num_tokens": 260653313.0, + "step": 10076 + }, + { + "epoch": 1.1066329892378652, + "grad_norm": 1.6149715185165405, + "learning_rate": 5e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7251113653182983, + "num_tokens": 260685726.0, + "step": 10077 + }, + { + "epoch": 1.1067428069404788, + "grad_norm": 1.9303654432296753, + "learning_rate": 5e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.7522009611129761, + "num_tokens": 260709407.0, + "step": 10078 + }, + { + "epoch": 1.1068526246430925, + "grad_norm": 1.7026309967041016, + "learning_rate": 5e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7290521860122681, + "num_tokens": 260740550.0, + "step": 10079 + }, + { + "epoch": 1.106962442345706, + "grad_norm": 1.969678282737732, + "learning_rate": 5e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7383885383605957, + "num_tokens": 260764138.0, + "step": 10080 + }, + { + "epoch": 1.1070722600483198, + "grad_norm": 2.1660752296447754, + "learning_rate": 5e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7487531304359436, + "num_tokens": 260785865.0, + "step": 10081 + }, + { + "epoch": 1.1071820777509334, + "grad_norm": 1.910600185394287, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7322975397109985, + "num_tokens": 260811725.0, + "step": 10082 + }, + { + "epoch": 1.1072918954535471, + "grad_norm": 1.7448537349700928, + "learning_rate": 5e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7087820172309875, + "num_tokens": 260844735.0, + "step": 10083 + }, + { + "epoch": 1.1074017131561609, + "grad_norm": 2.0618646144866943, + "learning_rate": 5e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7285363674163818, + "num_tokens": 260867587.0, + "step": 10084 + }, + { + "epoch": 1.1075115308587744, + "grad_norm": 1.853317379951477, + "learning_rate": 5e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.74749755859375, + "num_tokens": 260890697.0, + "step": 10085 + }, + { + "epoch": 1.1076213485613882, + "grad_norm": 1.850303053855896, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7489360570907593, + "num_tokens": 260916042.0, + "step": 10086 + }, + { + "epoch": 1.1077311662640017, + "grad_norm": 2.206995725631714, + "learning_rate": 5e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7579473257064819, + "num_tokens": 260934174.0, + "step": 10087 + }, + { + "epoch": 1.1078409839666155, + "grad_norm": 1.8755382299423218, + "learning_rate": 5e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7115641832351685, + "num_tokens": 260962810.0, + "step": 10088 + }, + { + "epoch": 1.107950801669229, + "grad_norm": 1.6868269443511963, + "learning_rate": 5e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7376495599746704, + "num_tokens": 260992075.0, + "step": 10089 + }, + { + "epoch": 1.1080606193718427, + "grad_norm": 2.056049108505249, + "learning_rate": 5e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7428938150405884, + "num_tokens": 261015099.0, + "step": 10090 + }, + { + "epoch": 1.1081704370744565, + "grad_norm": 1.8625925779342651, + "learning_rate": 5e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7435132265090942, + "num_tokens": 261043278.0, + "step": 10091 + }, + { + "epoch": 1.10828025477707, + "grad_norm": 1.7928515672683716, + "learning_rate": 5e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7238218188285828, + "num_tokens": 261072008.0, + "step": 10092 + }, + { + "epoch": 1.1083900724796838, + "grad_norm": 1.9032199382781982, + "learning_rate": 5e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.76100754737854, + "num_tokens": 261096494.0, + "step": 10093 + }, + { + "epoch": 1.1084998901822973, + "grad_norm": 2.1280264854431152, + "learning_rate": 5e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7248611450195312, + "num_tokens": 261120661.0, + "step": 10094 + }, + { + "epoch": 1.108609707884911, + "grad_norm": 1.6941962242126465, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7353993058204651, + "num_tokens": 261154092.0, + "step": 10095 + }, + { + "epoch": 1.1087195255875246, + "grad_norm": 2.213963031768799, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7312914133071899, + "num_tokens": 261174986.0, + "step": 10096 + }, + { + "epoch": 1.1088293432901384, + "grad_norm": 1.947852373123169, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7327253818511963, + "num_tokens": 261202189.0, + "step": 10097 + }, + { + "epoch": 1.108939160992752, + "grad_norm": 2.2374494075775146, + "learning_rate": 5e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7669761180877686, + "num_tokens": 261220276.0, + "step": 10098 + }, + { + "epoch": 1.1090489786953657, + "grad_norm": 1.8305689096450806, + "learning_rate": 5e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.7670499086380005, + "num_tokens": 261244097.0, + "step": 10099 + }, + { + "epoch": 1.1091587963979794, + "grad_norm": 2.2613894939422607, + "learning_rate": 5e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7597293853759766, + "num_tokens": 261262229.0, + "step": 10100 + }, + { + "epoch": 1.109268614100593, + "grad_norm": 1.9978642463684082, + "learning_rate": 5e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7401443123817444, + "num_tokens": 261287047.0, + "step": 10101 + }, + { + "epoch": 1.1093784318032067, + "grad_norm": 1.7775323390960693, + "learning_rate": 5e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7206698656082153, + "num_tokens": 261316907.0, + "step": 10102 + }, + { + "epoch": 1.1094882495058203, + "grad_norm": 1.8367661237716675, + "learning_rate": 5e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7301174998283386, + "num_tokens": 261344336.0, + "step": 10103 + }, + { + "epoch": 1.109598067208434, + "grad_norm": 1.7805449962615967, + "learning_rate": 5e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.722282886505127, + "num_tokens": 261374001.0, + "step": 10104 + }, + { + "epoch": 1.1097078849110478, + "grad_norm": 1.851321816444397, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7340011596679688, + "num_tokens": 261403829.0, + "step": 10105 + }, + { + "epoch": 1.1098177026136613, + "grad_norm": 1.7793383598327637, + "learning_rate": 5e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7142305374145508, + "num_tokens": 261432033.0, + "step": 10106 + }, + { + "epoch": 1.109927520316275, + "grad_norm": 1.9296380281448364, + "learning_rate": 5e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.757240355014801, + "num_tokens": 261456606.0, + "step": 10107 + }, + { + "epoch": 1.1100373380188886, + "grad_norm": 2.113184690475464, + "learning_rate": 5e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7535545229911804, + "num_tokens": 261480664.0, + "step": 10108 + }, + { + "epoch": 1.1101471557215024, + "grad_norm": 1.7476325035095215, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7388993501663208, + "num_tokens": 261507335.0, + "step": 10109 + }, + { + "epoch": 1.110256973424116, + "grad_norm": 1.8349530696868896, + "learning_rate": 5e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7256944179534912, + "num_tokens": 261537435.0, + "step": 10110 + }, + { + "epoch": 1.1103667911267296, + "grad_norm": 2.2202365398406982, + "learning_rate": 5e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.74994957447052, + "num_tokens": 261557476.0, + "step": 10111 + }, + { + "epoch": 1.1104766088293432, + "grad_norm": 1.7847514152526855, + "learning_rate": 5e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7423701286315918, + "num_tokens": 261587814.0, + "step": 10112 + }, + { + "epoch": 1.110586426531957, + "grad_norm": 2.0579020977020264, + "learning_rate": 5e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7428350448608398, + "num_tokens": 261609708.0, + "step": 10113 + }, + { + "epoch": 1.1106962442345707, + "grad_norm": 1.8011876344680786, + "learning_rate": 5e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7311218976974487, + "num_tokens": 261633805.0, + "step": 10114 + }, + { + "epoch": 1.1108060619371842, + "grad_norm": 1.8965919017791748, + "learning_rate": 5e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7317016124725342, + "num_tokens": 261659108.0, + "step": 10115 + }, + { + "epoch": 1.110915879639798, + "grad_norm": 2.2010908126831055, + "learning_rate": 5e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7392762899398804, + "num_tokens": 261679244.0, + "step": 10116 + }, + { + "epoch": 1.1110256973424115, + "grad_norm": 1.7896184921264648, + "learning_rate": 5e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7379417419433594, + "num_tokens": 261705199.0, + "step": 10117 + }, + { + "epoch": 1.1111355150450253, + "grad_norm": 2.4648799896240234, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7189078330993652, + "num_tokens": 261729115.0, + "step": 10118 + }, + { + "epoch": 1.111245332747639, + "grad_norm": 1.9372775554656982, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7403066754341125, + "num_tokens": 261752326.0, + "step": 10119 + }, + { + "epoch": 1.1113551504502526, + "grad_norm": 1.8441959619522095, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7250928282737732, + "num_tokens": 261778076.0, + "step": 10120 + }, + { + "epoch": 1.1114649681528663, + "grad_norm": 1.8337013721466064, + "learning_rate": 5e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7316125631332397, + "num_tokens": 261803392.0, + "step": 10121 + }, + { + "epoch": 1.1115747858554799, + "grad_norm": 2.215886116027832, + "learning_rate": 5e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7563046216964722, + "num_tokens": 261822713.0, + "step": 10122 + }, + { + "epoch": 1.1116846035580936, + "grad_norm": 2.115647554397583, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7488385438919067, + "num_tokens": 261842384.0, + "step": 10123 + }, + { + "epoch": 1.1117944212607072, + "grad_norm": 2.070936918258667, + "learning_rate": 5e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7260081768035889, + "num_tokens": 261866334.0, + "step": 10124 + }, + { + "epoch": 1.111904238963321, + "grad_norm": 1.8826335668563843, + "learning_rate": 5e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7298402786254883, + "num_tokens": 261893474.0, + "step": 10125 + }, + { + "epoch": 1.1120140566659344, + "grad_norm": 1.8743419647216797, + "learning_rate": 5e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7266403436660767, + "num_tokens": 261919859.0, + "step": 10126 + }, + { + "epoch": 1.1121238743685482, + "grad_norm": 2.006347179412842, + "learning_rate": 5e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.7493436336517334, + "num_tokens": 261942760.0, + "step": 10127 + }, + { + "epoch": 1.112233692071162, + "grad_norm": 1.706795573234558, + "learning_rate": 5e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7318823337554932, + "num_tokens": 261972869.0, + "step": 10128 + }, + { + "epoch": 1.1123435097737755, + "grad_norm": 1.9506925344467163, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7453247904777527, + "num_tokens": 261996911.0, + "step": 10129 + }, + { + "epoch": 1.1124533274763893, + "grad_norm": 1.6305910348892212, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7197326421737671, + "num_tokens": 262027197.0, + "step": 10130 + }, + { + "epoch": 1.1125631451790028, + "grad_norm": 1.6499515771865845, + "learning_rate": 5e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7184159159660339, + "num_tokens": 262059960.0, + "step": 10131 + }, + { + "epoch": 1.1126729628816165, + "grad_norm": 2.0673575401306152, + "learning_rate": 5e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7518906593322754, + "num_tokens": 262079868.0, + "step": 10132 + }, + { + "epoch": 1.11278278058423, + "grad_norm": 1.842822790145874, + "learning_rate": 5e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7370392084121704, + "num_tokens": 262106669.0, + "step": 10133 + }, + { + "epoch": 1.1128925982868438, + "grad_norm": 1.9508733749389648, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7263308763504028, + "num_tokens": 262131071.0, + "step": 10134 + }, + { + "epoch": 1.1130024159894576, + "grad_norm": 1.7671141624450684, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7247836589813232, + "num_tokens": 262160925.0, + "step": 10135 + }, + { + "epoch": 1.1131122336920711, + "grad_norm": 1.9442088603973389, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7291996479034424, + "num_tokens": 262188448.0, + "step": 10136 + }, + { + "epoch": 1.1132220513946849, + "grad_norm": 1.947864055633545, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7341859936714172, + "num_tokens": 262214032.0, + "step": 10137 + }, + { + "epoch": 1.1133318690972984, + "grad_norm": 1.9848051071166992, + "learning_rate": 5e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7145438194274902, + "num_tokens": 262239664.0, + "step": 10138 + }, + { + "epoch": 1.1134416867999122, + "grad_norm": 1.9467166662216187, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7431278228759766, + "num_tokens": 262264561.0, + "step": 10139 + }, + { + "epoch": 1.1135515045025257, + "grad_norm": 1.7634093761444092, + "learning_rate": 5e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7186831831932068, + "num_tokens": 262294739.0, + "step": 10140 + }, + { + "epoch": 1.1136613222051395, + "grad_norm": 1.9118294715881348, + "learning_rate": 5e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7313272953033447, + "num_tokens": 262321885.0, + "step": 10141 + }, + { + "epoch": 1.1137711399077532, + "grad_norm": 1.8731560707092285, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7634514570236206, + "num_tokens": 262343294.0, + "step": 10142 + }, + { + "epoch": 1.1138809576103668, + "grad_norm": 2.022191047668457, + "learning_rate": 5e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7291205525398254, + "num_tokens": 262366148.0, + "step": 10143 + }, + { + "epoch": 1.1139907753129805, + "grad_norm": 1.9373011589050293, + "learning_rate": 5e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7583138942718506, + "num_tokens": 262386775.0, + "step": 10144 + }, + { + "epoch": 1.114100593015594, + "grad_norm": 1.8911826610565186, + "learning_rate": 5e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7402329444885254, + "num_tokens": 262413581.0, + "step": 10145 + }, + { + "epoch": 1.1142104107182078, + "grad_norm": 1.8237214088439941, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7432764768600464, + "num_tokens": 262438626.0, + "step": 10146 + }, + { + "epoch": 1.1143202284208213, + "grad_norm": 2.1755504608154297, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7550798654556274, + "num_tokens": 262457571.0, + "step": 10147 + }, + { + "epoch": 1.114430046123435, + "grad_norm": 1.9735442399978638, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7405608296394348, + "num_tokens": 262480857.0, + "step": 10148 + }, + { + "epoch": 1.1145398638260489, + "grad_norm": 1.9369451999664307, + "learning_rate": 5e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7334980368614197, + "num_tokens": 262505757.0, + "step": 10149 + }, + { + "epoch": 1.1146496815286624, + "grad_norm": 2.0180201530456543, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7204410433769226, + "num_tokens": 262532011.0, + "step": 10150 + }, + { + "epoch": 1.1147594992312762, + "grad_norm": 1.8451581001281738, + "learning_rate": 5e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7314293384552002, + "num_tokens": 262564004.0, + "step": 10151 + }, + { + "epoch": 1.1148693169338897, + "grad_norm": 1.952888011932373, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7318178415298462, + "num_tokens": 262587056.0, + "step": 10152 + }, + { + "epoch": 1.1149791346365034, + "grad_norm": 2.1139259338378906, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7236670851707458, + "num_tokens": 262609624.0, + "step": 10153 + }, + { + "epoch": 1.115088952339117, + "grad_norm": 1.8856101036071777, + "learning_rate": 5e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7305305600166321, + "num_tokens": 262636717.0, + "step": 10154 + }, + { + "epoch": 1.1151987700417307, + "grad_norm": 2.0384461879730225, + "learning_rate": 5e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7176697254180908, + "num_tokens": 262660411.0, + "step": 10155 + }, + { + "epoch": 1.1153085877443445, + "grad_norm": 1.7870333194732666, + "learning_rate": 5e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7525205612182617, + "num_tokens": 262687700.0, + "step": 10156 + }, + { + "epoch": 1.115418405446958, + "grad_norm": 1.6567318439483643, + "learning_rate": 5e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7469710111618042, + "num_tokens": 262719052.0, + "step": 10157 + }, + { + "epoch": 1.1155282231495718, + "grad_norm": 1.8639684915542603, + "learning_rate": 5e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7486323118209839, + "num_tokens": 262743514.0, + "step": 10158 + }, + { + "epoch": 1.1156380408521853, + "grad_norm": 1.8110146522521973, + "learning_rate": 5e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7380207777023315, + "num_tokens": 262768171.0, + "step": 10159 + }, + { + "epoch": 1.115747858554799, + "grad_norm": 1.6831960678100586, + "learning_rate": 5e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.750809371471405, + "num_tokens": 262798395.0, + "step": 10160 + }, + { + "epoch": 1.1158576762574126, + "grad_norm": 1.9663416147232056, + "learning_rate": 5e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7503122687339783, + "num_tokens": 262822272.0, + "step": 10161 + }, + { + "epoch": 1.1159674939600264, + "grad_norm": 1.966214656829834, + "learning_rate": 5e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7435542345046997, + "num_tokens": 262844884.0, + "step": 10162 + }, + { + "epoch": 1.11607731166264, + "grad_norm": 1.969612956047058, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7156373262405396, + "num_tokens": 262870575.0, + "step": 10163 + }, + { + "epoch": 1.1161871293652537, + "grad_norm": 1.9001809358596802, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7411950826644897, + "num_tokens": 262896680.0, + "step": 10164 + }, + { + "epoch": 1.1162969470678674, + "grad_norm": 1.9922338724136353, + "learning_rate": 5e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7215964198112488, + "num_tokens": 262922178.0, + "step": 10165 + }, + { + "epoch": 1.116406764770481, + "grad_norm": 1.9914344549179077, + "learning_rate": 5e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7590492963790894, + "num_tokens": 262944050.0, + "step": 10166 + }, + { + "epoch": 1.1165165824730947, + "grad_norm": 2.025766134262085, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7327525019645691, + "num_tokens": 262967315.0, + "step": 10167 + }, + { + "epoch": 1.1166264001757082, + "grad_norm": 1.725494623184204, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7237854599952698, + "num_tokens": 262997891.0, + "step": 10168 + }, + { + "epoch": 1.116736217878322, + "grad_norm": 1.9272873401641846, + "learning_rate": 5e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7293235063552856, + "num_tokens": 263021946.0, + "step": 10169 + }, + { + "epoch": 1.1168460355809358, + "grad_norm": 2.215081214904785, + "learning_rate": 5e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7364418506622314, + "num_tokens": 263042631.0, + "step": 10170 + }, + { + "epoch": 1.1169558532835493, + "grad_norm": 1.7615596055984497, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.751625657081604, + "num_tokens": 263071182.0, + "step": 10171 + }, + { + "epoch": 1.117065670986163, + "grad_norm": 2.0670368671417236, + "learning_rate": 5e-06, + "loss": 0.6768, + "mean_token_accuracy": 0.7716051340103149, + "num_tokens": 263092628.0, + "step": 10172 + }, + { + "epoch": 1.1171754886887766, + "grad_norm": 1.9954742193222046, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7173333168029785, + "num_tokens": 263117799.0, + "step": 10173 + }, + { + "epoch": 1.1172853063913903, + "grad_norm": 1.7776565551757812, + "learning_rate": 5e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7211311459541321, + "num_tokens": 263150094.0, + "step": 10174 + }, + { + "epoch": 1.1173951240940039, + "grad_norm": 1.7675760984420776, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7353770732879639, + "num_tokens": 263179266.0, + "step": 10175 + }, + { + "epoch": 1.1175049417966176, + "grad_norm": 1.940630555152893, + "learning_rate": 5e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7174394130706787, + "num_tokens": 263207127.0, + "step": 10176 + }, + { + "epoch": 1.1176147594992312, + "grad_norm": 1.8404080867767334, + "learning_rate": 5e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7230551838874817, + "num_tokens": 263234243.0, + "step": 10177 + }, + { + "epoch": 1.117724577201845, + "grad_norm": 1.793880581855774, + "learning_rate": 5e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7438350915908813, + "num_tokens": 263262384.0, + "step": 10178 + }, + { + "epoch": 1.1178343949044587, + "grad_norm": 1.7736716270446777, + "learning_rate": 5e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7093123197555542, + "num_tokens": 263290561.0, + "step": 10179 + }, + { + "epoch": 1.1179442126070722, + "grad_norm": 1.8309485912322998, + "learning_rate": 5e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7548836469650269, + "num_tokens": 263316399.0, + "step": 10180 + }, + { + "epoch": 1.118054030309686, + "grad_norm": 1.8499705791473389, + "learning_rate": 5e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7463854551315308, + "num_tokens": 263342874.0, + "step": 10181 + }, + { + "epoch": 1.1181638480122995, + "grad_norm": 1.892356514930725, + "learning_rate": 5e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7312178611755371, + "num_tokens": 263369651.0, + "step": 10182 + }, + { + "epoch": 1.1182736657149133, + "grad_norm": 1.6256730556488037, + "learning_rate": 5e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7214899063110352, + "num_tokens": 263403522.0, + "step": 10183 + }, + { + "epoch": 1.1183834834175268, + "grad_norm": 1.6875044107437134, + "learning_rate": 5e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7077978253364563, + "num_tokens": 263435966.0, + "step": 10184 + }, + { + "epoch": 1.1184933011201406, + "grad_norm": 1.8864165544509888, + "learning_rate": 5e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.738588809967041, + "num_tokens": 263460770.0, + "step": 10185 + }, + { + "epoch": 1.1186031188227543, + "grad_norm": 2.016591787338257, + "learning_rate": 5e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7264615893363953, + "num_tokens": 263484957.0, + "step": 10186 + }, + { + "epoch": 1.1187129365253679, + "grad_norm": 2.061506748199463, + "learning_rate": 5e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.718352198600769, + "num_tokens": 263508514.0, + "step": 10187 + }, + { + "epoch": 1.1188227542279816, + "grad_norm": 2.069483995437622, + "learning_rate": 5e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7379388809204102, + "num_tokens": 263531781.0, + "step": 10188 + }, + { + "epoch": 1.1189325719305951, + "grad_norm": 2.1982357501983643, + "learning_rate": 5e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7557041645050049, + "num_tokens": 263554172.0, + "step": 10189 + }, + { + "epoch": 1.119042389633209, + "grad_norm": 1.903093934059143, + "learning_rate": 5e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.738710880279541, + "num_tokens": 263579808.0, + "step": 10190 + }, + { + "epoch": 1.1191522073358224, + "grad_norm": 1.8121135234832764, + "learning_rate": 5e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.720868706703186, + "num_tokens": 263609475.0, + "step": 10191 + }, + { + "epoch": 1.1192620250384362, + "grad_norm": 1.8173081874847412, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7182905077934265, + "num_tokens": 263635526.0, + "step": 10192 + }, + { + "epoch": 1.11937184274105, + "grad_norm": 1.7822643518447876, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7402342557907104, + "num_tokens": 263661488.0, + "step": 10193 + }, + { + "epoch": 1.1194816604436635, + "grad_norm": 1.9203462600708008, + "learning_rate": 5e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7313605546951294, + "num_tokens": 263685952.0, + "step": 10194 + }, + { + "epoch": 1.1195914781462772, + "grad_norm": 1.892765760421753, + "learning_rate": 5e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7322652339935303, + "num_tokens": 263711614.0, + "step": 10195 + }, + { + "epoch": 1.1197012958488908, + "grad_norm": 2.0920135974884033, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7573747634887695, + "num_tokens": 263732693.0, + "step": 10196 + }, + { + "epoch": 1.1198111135515045, + "grad_norm": 1.8881261348724365, + "learning_rate": 5e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.726178765296936, + "num_tokens": 263758113.0, + "step": 10197 + }, + { + "epoch": 1.119920931254118, + "grad_norm": 1.8192781209945679, + "learning_rate": 5e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7344568967819214, + "num_tokens": 263784260.0, + "step": 10198 + }, + { + "epoch": 1.1200307489567318, + "grad_norm": 2.0906198024749756, + "learning_rate": 5e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7667924165725708, + "num_tokens": 263803412.0, + "step": 10199 + }, + { + "epoch": 1.1201405666593456, + "grad_norm": 1.7190743684768677, + "learning_rate": 5e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7211917638778687, + "num_tokens": 263832967.0, + "step": 10200 + }, + { + "epoch": 1.1202503843619591, + "grad_norm": 1.8478097915649414, + "learning_rate": 5e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7226690649986267, + "num_tokens": 263862409.0, + "step": 10201 + }, + { + "epoch": 1.1203602020645729, + "grad_norm": 1.780908465385437, + "learning_rate": 5e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7593374848365784, + "num_tokens": 263888280.0, + "step": 10202 + }, + { + "epoch": 1.1204700197671864, + "grad_norm": 1.9052751064300537, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7218836545944214, + "num_tokens": 263913885.0, + "step": 10203 + }, + { + "epoch": 1.1205798374698002, + "grad_norm": 1.8773123025894165, + "learning_rate": 5e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7328367829322815, + "num_tokens": 263942022.0, + "step": 10204 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 2.005662679672241, + "learning_rate": 5e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7417624592781067, + "num_tokens": 263964759.0, + "step": 10205 + }, + { + "epoch": 1.1207994728750275, + "grad_norm": 2.0621514320373535, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7297292351722717, + "num_tokens": 263987843.0, + "step": 10206 + }, + { + "epoch": 1.1209092905776412, + "grad_norm": 1.8781769275665283, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7569491267204285, + "num_tokens": 264012440.0, + "step": 10207 + }, + { + "epoch": 1.1210191082802548, + "grad_norm": 1.8914031982421875, + "learning_rate": 5e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.727009654045105, + "num_tokens": 264041143.0, + "step": 10208 + }, + { + "epoch": 1.1211289259828685, + "grad_norm": 2.023824691772461, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7609294056892395, + "num_tokens": 264063121.0, + "step": 10209 + }, + { + "epoch": 1.121238743685482, + "grad_norm": 1.900595784187317, + "learning_rate": 5e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7460047006607056, + "num_tokens": 264088001.0, + "step": 10210 + }, + { + "epoch": 1.1213485613880958, + "grad_norm": 2.0907201766967773, + "learning_rate": 5e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7302818298339844, + "num_tokens": 264111597.0, + "step": 10211 + }, + { + "epoch": 1.1214583790907093, + "grad_norm": 1.8183592557907104, + "learning_rate": 5e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7478235960006714, + "num_tokens": 264139593.0, + "step": 10212 + }, + { + "epoch": 1.121568196793323, + "grad_norm": 1.7050912380218506, + "learning_rate": 5e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7663565874099731, + "num_tokens": 264168275.0, + "step": 10213 + }, + { + "epoch": 1.1216780144959368, + "grad_norm": 1.9803887605667114, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7244411706924438, + "num_tokens": 264192153.0, + "step": 10214 + }, + { + "epoch": 1.1217878321985504, + "grad_norm": 1.9721742868423462, + "learning_rate": 5e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7430144548416138, + "num_tokens": 264213923.0, + "step": 10215 + }, + { + "epoch": 1.1218976499011641, + "grad_norm": 1.760022759437561, + "learning_rate": 5e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7232910394668579, + "num_tokens": 264245076.0, + "step": 10216 + }, + { + "epoch": 1.1220074676037777, + "grad_norm": 1.9398516416549683, + "learning_rate": 5e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.782492995262146, + "num_tokens": 264265765.0, + "step": 10217 + }, + { + "epoch": 1.1221172853063914, + "grad_norm": 2.0006563663482666, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7241556644439697, + "num_tokens": 264289796.0, + "step": 10218 + }, + { + "epoch": 1.122227103009005, + "grad_norm": 2.0568060874938965, + "learning_rate": 5e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7338138818740845, + "num_tokens": 264313095.0, + "step": 10219 + }, + { + "epoch": 1.1223369207116187, + "grad_norm": 1.7554138898849487, + "learning_rate": 5e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7354905605316162, + "num_tokens": 264342514.0, + "step": 10220 + }, + { + "epoch": 1.1224467384142325, + "grad_norm": 1.860268235206604, + "learning_rate": 5e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7478533983230591, + "num_tokens": 264367774.0, + "step": 10221 + }, + { + "epoch": 1.122556556116846, + "grad_norm": 1.8181672096252441, + "learning_rate": 5e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7191709280014038, + "num_tokens": 264395949.0, + "step": 10222 + }, + { + "epoch": 1.1226663738194598, + "grad_norm": 2.0727884769439697, + "learning_rate": 5e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7325257062911987, + "num_tokens": 264418524.0, + "step": 10223 + }, + { + "epoch": 1.1227761915220733, + "grad_norm": 1.9343494176864624, + "learning_rate": 5e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7521553039550781, + "num_tokens": 264443021.0, + "step": 10224 + }, + { + "epoch": 1.122886009224687, + "grad_norm": 1.9688836336135864, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7522903680801392, + "num_tokens": 264465929.0, + "step": 10225 + }, + { + "epoch": 1.1229958269273006, + "grad_norm": 1.8462861776351929, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7332007884979248, + "num_tokens": 264490195.0, + "step": 10226 + }, + { + "epoch": 1.1231056446299144, + "grad_norm": 1.6074903011322021, + "learning_rate": 5e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7308856844902039, + "num_tokens": 264523812.0, + "step": 10227 + }, + { + "epoch": 1.123215462332528, + "grad_norm": 2.153738021850586, + "learning_rate": 5e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7159836292266846, + "num_tokens": 264547251.0, + "step": 10228 + }, + { + "epoch": 1.1233252800351416, + "grad_norm": 1.753441333770752, + "learning_rate": 5e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7193621397018433, + "num_tokens": 264576179.0, + "step": 10229 + }, + { + "epoch": 1.1234350977377554, + "grad_norm": 1.775469183921814, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7206836342811584, + "num_tokens": 264604702.0, + "step": 10230 + }, + { + "epoch": 1.123544915440369, + "grad_norm": 2.096980333328247, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7318710684776306, + "num_tokens": 264626202.0, + "step": 10231 + }, + { + "epoch": 1.1236547331429827, + "grad_norm": 1.9641103744506836, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7272506356239319, + "num_tokens": 264650098.0, + "step": 10232 + }, + { + "epoch": 1.1237645508455962, + "grad_norm": 1.8568799495697021, + "learning_rate": 5e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7197264432907104, + "num_tokens": 264677869.0, + "step": 10233 + }, + { + "epoch": 1.12387436854821, + "grad_norm": 1.8400969505310059, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7454721927642822, + "num_tokens": 264703289.0, + "step": 10234 + }, + { + "epoch": 1.1239841862508237, + "grad_norm": 1.780097484588623, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7351139783859253, + "num_tokens": 264730850.0, + "step": 10235 + }, + { + "epoch": 1.1240940039534373, + "grad_norm": 1.806550145149231, + "learning_rate": 5e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7258809804916382, + "num_tokens": 264759794.0, + "step": 10236 + }, + { + "epoch": 1.124203821656051, + "grad_norm": 1.8249367475509644, + "learning_rate": 5e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7250359058380127, + "num_tokens": 264785325.0, + "step": 10237 + }, + { + "epoch": 1.1243136393586646, + "grad_norm": 1.9407496452331543, + "learning_rate": 5e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7402589321136475, + "num_tokens": 264809197.0, + "step": 10238 + }, + { + "epoch": 1.1244234570612783, + "grad_norm": 1.9596542119979858, + "learning_rate": 5e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7649304270744324, + "num_tokens": 264832253.0, + "step": 10239 + }, + { + "epoch": 1.1245332747638919, + "grad_norm": 2.1612603664398193, + "learning_rate": 5e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7369434237480164, + "num_tokens": 264852423.0, + "step": 10240 + }, + { + "epoch": 1.1246430924665056, + "grad_norm": 1.883066177368164, + "learning_rate": 5e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.729302704334259, + "num_tokens": 264880252.0, + "step": 10241 + }, + { + "epoch": 1.1247529101691192, + "grad_norm": 1.7140637636184692, + "learning_rate": 5e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7459770441055298, + "num_tokens": 264912115.0, + "step": 10242 + }, + { + "epoch": 1.124862727871733, + "grad_norm": 1.6875633001327515, + "learning_rate": 5e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7535902261734009, + "num_tokens": 264941800.0, + "step": 10243 + }, + { + "epoch": 1.1249725455743467, + "grad_norm": 1.8850651979446411, + "learning_rate": 5e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7306488156318665, + "num_tokens": 264969901.0, + "step": 10244 + }, + { + "epoch": 1.1250823632769602, + "grad_norm": 1.9764118194580078, + "learning_rate": 5e-06, + "loss": 0.81, + "mean_token_accuracy": 0.744531512260437, + "num_tokens": 264994381.0, + "step": 10245 + }, + { + "epoch": 1.125192180979574, + "grad_norm": 1.7926791906356812, + "learning_rate": 5e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7464680075645447, + "num_tokens": 265022868.0, + "step": 10246 + }, + { + "epoch": 1.1253019986821875, + "grad_norm": 2.3833515644073486, + "learning_rate": 5e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7342466115951538, + "num_tokens": 265042434.0, + "step": 10247 + }, + { + "epoch": 1.1254118163848013, + "grad_norm": 1.7717466354370117, + "learning_rate": 5e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.720777153968811, + "num_tokens": 265071780.0, + "step": 10248 + }, + { + "epoch": 1.125521634087415, + "grad_norm": 1.9935567378997803, + "learning_rate": 5e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7196347713470459, + "num_tokens": 265097974.0, + "step": 10249 + }, + { + "epoch": 1.1256314517900285, + "grad_norm": 1.7617950439453125, + "learning_rate": 5e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7359859943389893, + "num_tokens": 265127958.0, + "step": 10250 + }, + { + "epoch": 1.1257412694926423, + "grad_norm": 1.8974536657333374, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.726194441318512, + "num_tokens": 265155269.0, + "step": 10251 + }, + { + "epoch": 1.1258510871952558, + "grad_norm": 1.9120774269104004, + "learning_rate": 5e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7666449546813965, + "num_tokens": 265177732.0, + "step": 10252 + }, + { + "epoch": 1.1259609048978696, + "grad_norm": 2.0292060375213623, + "learning_rate": 5e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7362707257270813, + "num_tokens": 265201323.0, + "step": 10253 + }, + { + "epoch": 1.1260707226004831, + "grad_norm": 2.08905029296875, + "learning_rate": 5e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7216029167175293, + "num_tokens": 265224176.0, + "step": 10254 + }, + { + "epoch": 1.1261805403030969, + "grad_norm": 1.9083237648010254, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7331193685531616, + "num_tokens": 265250684.0, + "step": 10255 + }, + { + "epoch": 1.1262903580057104, + "grad_norm": 1.7980269193649292, + "learning_rate": 5e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7408175468444824, + "num_tokens": 265275155.0, + "step": 10256 + }, + { + "epoch": 1.1264001757083242, + "grad_norm": 1.66655695438385, + "learning_rate": 5e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7406854629516602, + "num_tokens": 265305693.0, + "step": 10257 + }, + { + "epoch": 1.126509993410938, + "grad_norm": 1.7503756284713745, + "learning_rate": 5e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.741978645324707, + "num_tokens": 265335704.0, + "step": 10258 + }, + { + "epoch": 1.1266198111135515, + "grad_norm": 1.8969578742980957, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7213344573974609, + "num_tokens": 265364575.0, + "step": 10259 + }, + { + "epoch": 1.1267296288161652, + "grad_norm": 1.9904403686523438, + "learning_rate": 5e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7534173727035522, + "num_tokens": 265387603.0, + "step": 10260 + }, + { + "epoch": 1.1268394465187788, + "grad_norm": 1.8275576829910278, + "learning_rate": 5e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.72349613904953, + "num_tokens": 265414319.0, + "step": 10261 + }, + { + "epoch": 1.1269492642213925, + "grad_norm": 1.8910410404205322, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7244935631752014, + "num_tokens": 265440801.0, + "step": 10262 + }, + { + "epoch": 1.127059081924006, + "grad_norm": 1.7978938817977905, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7280333042144775, + "num_tokens": 265467765.0, + "step": 10263 + }, + { + "epoch": 1.1271688996266198, + "grad_norm": 2.011383056640625, + "learning_rate": 5e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.737868070602417, + "num_tokens": 265491570.0, + "step": 10264 + }, + { + "epoch": 1.1272787173292333, + "grad_norm": 1.7643516063690186, + "learning_rate": 5e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.700294017791748, + "num_tokens": 265524665.0, + "step": 10265 + }, + { + "epoch": 1.127388535031847, + "grad_norm": 1.924208641052246, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7351354360580444, + "num_tokens": 265550260.0, + "step": 10266 + }, + { + "epoch": 1.1274983527344609, + "grad_norm": 1.777025818824768, + "learning_rate": 5e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7335687279701233, + "num_tokens": 265578521.0, + "step": 10267 + }, + { + "epoch": 1.1276081704370744, + "grad_norm": 1.6847541332244873, + "learning_rate": 5e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7205857038497925, + "num_tokens": 265609648.0, + "step": 10268 + }, + { + "epoch": 1.1277179881396882, + "grad_norm": 1.7749890089035034, + "learning_rate": 5e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7292776703834534, + "num_tokens": 265636625.0, + "step": 10269 + }, + { + "epoch": 1.1278278058423017, + "grad_norm": 1.8761262893676758, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7443939447402954, + "num_tokens": 265660917.0, + "step": 10270 + }, + { + "epoch": 1.1279376235449154, + "grad_norm": 2.035900354385376, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7277206182479858, + "num_tokens": 265683546.0, + "step": 10271 + }, + { + "epoch": 1.1280474412475292, + "grad_norm": 1.7574344873428345, + "learning_rate": 5e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7383304834365845, + "num_tokens": 265711590.0, + "step": 10272 + }, + { + "epoch": 1.1281572589501427, + "grad_norm": 2.063819646835327, + "learning_rate": 5e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7434083223342896, + "num_tokens": 265732602.0, + "step": 10273 + }, + { + "epoch": 1.1282670766527565, + "grad_norm": 1.7287085056304932, + "learning_rate": 5e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7406444549560547, + "num_tokens": 265764670.0, + "step": 10274 + }, + { + "epoch": 1.12837689435537, + "grad_norm": 1.834088921546936, + "learning_rate": 5e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7089998722076416, + "num_tokens": 265794913.0, + "step": 10275 + }, + { + "epoch": 1.1284867120579838, + "grad_norm": 1.7780150175094604, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7276158332824707, + "num_tokens": 265823140.0, + "step": 10276 + }, + { + "epoch": 1.1285965297605973, + "grad_norm": 2.0725276470184326, + "learning_rate": 5e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7390033602714539, + "num_tokens": 265844332.0, + "step": 10277 + }, + { + "epoch": 1.128706347463211, + "grad_norm": 1.9684889316558838, + "learning_rate": 5e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7050480842590332, + "num_tokens": 265870862.0, + "step": 10278 + }, + { + "epoch": 1.1288161651658246, + "grad_norm": 1.7850897312164307, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7396441698074341, + "num_tokens": 265897775.0, + "step": 10279 + }, + { + "epoch": 1.1289259828684384, + "grad_norm": 1.9052067995071411, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.742091178894043, + "num_tokens": 265920950.0, + "step": 10280 + }, + { + "epoch": 1.1290358005710521, + "grad_norm": 1.849336862564087, + "learning_rate": 5e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7417160272598267, + "num_tokens": 265947256.0, + "step": 10281 + }, + { + "epoch": 1.1291456182736657, + "grad_norm": 1.8160035610198975, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.717704176902771, + "num_tokens": 265976170.0, + "step": 10282 + }, + { + "epoch": 1.1292554359762794, + "grad_norm": 2.0022902488708496, + "learning_rate": 5e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7371826171875, + "num_tokens": 265999840.0, + "step": 10283 + }, + { + "epoch": 1.129365253678893, + "grad_norm": 1.8718106746673584, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7355935573577881, + "num_tokens": 266025528.0, + "step": 10284 + }, + { + "epoch": 1.1294750713815067, + "grad_norm": 1.9671586751937866, + "learning_rate": 5e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7539869546890259, + "num_tokens": 266049848.0, + "step": 10285 + }, + { + "epoch": 1.1295848890841205, + "grad_norm": 2.0276236534118652, + "learning_rate": 5e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7541923522949219, + "num_tokens": 266074935.0, + "step": 10286 + }, + { + "epoch": 1.129694706786734, + "grad_norm": 2.0355732440948486, + "learning_rate": 5e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.72797691822052, + "num_tokens": 266097539.0, + "step": 10287 + }, + { + "epoch": 1.1298045244893478, + "grad_norm": 1.8688247203826904, + "learning_rate": 5e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.709633469581604, + "num_tokens": 266123465.0, + "step": 10288 + }, + { + "epoch": 1.1299143421919613, + "grad_norm": 1.8923145532608032, + "learning_rate": 5e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7161394953727722, + "num_tokens": 266150277.0, + "step": 10289 + }, + { + "epoch": 1.130024159894575, + "grad_norm": 1.6432636976242065, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7294877171516418, + "num_tokens": 266181434.0, + "step": 10290 + }, + { + "epoch": 1.1301339775971886, + "grad_norm": 2.0608971118927, + "learning_rate": 5e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7651540040969849, + "num_tokens": 266201474.0, + "step": 10291 + }, + { + "epoch": 1.1302437952998023, + "grad_norm": 2.035576581954956, + "learning_rate": 5e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7391723394393921, + "num_tokens": 266226097.0, + "step": 10292 + }, + { + "epoch": 1.1303536130024159, + "grad_norm": 2.0462229251861572, + "learning_rate": 5e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7366153001785278, + "num_tokens": 266248239.0, + "step": 10293 + }, + { + "epoch": 1.1304634307050296, + "grad_norm": 1.972592830657959, + "learning_rate": 5e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7472706437110901, + "num_tokens": 266270400.0, + "step": 10294 + }, + { + "epoch": 1.1305732484076434, + "grad_norm": 2.331540107727051, + "learning_rate": 5e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7561732530593872, + "num_tokens": 266288271.0, + "step": 10295 + }, + { + "epoch": 1.130683066110257, + "grad_norm": 1.8323453664779663, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7302932739257812, + "num_tokens": 266317644.0, + "step": 10296 + }, + { + "epoch": 1.1307928838128707, + "grad_norm": 1.830150842666626, + "learning_rate": 5e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7043105363845825, + "num_tokens": 266348593.0, + "step": 10297 + }, + { + "epoch": 1.1309027015154842, + "grad_norm": 1.9981178045272827, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7575414180755615, + "num_tokens": 266371914.0, + "step": 10298 + }, + { + "epoch": 1.131012519218098, + "grad_norm": 2.014552116394043, + "learning_rate": 5e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7375268936157227, + "num_tokens": 266398397.0, + "step": 10299 + }, + { + "epoch": 1.1311223369207117, + "grad_norm": 2.1468863487243652, + "learning_rate": 5e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7228581309318542, + "num_tokens": 266420691.0, + "step": 10300 + }, + { + "epoch": 1.1312321546233253, + "grad_norm": 1.890356421470642, + "learning_rate": 5e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7473759651184082, + "num_tokens": 266445276.0, + "step": 10301 + }, + { + "epoch": 1.131341972325939, + "grad_norm": 1.989200234413147, + "learning_rate": 5e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7269383072853088, + "num_tokens": 266470694.0, + "step": 10302 + }, + { + "epoch": 1.1314517900285526, + "grad_norm": 2.067281484603882, + "learning_rate": 5e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7138645648956299, + "num_tokens": 266493836.0, + "step": 10303 + }, + { + "epoch": 1.1315616077311663, + "grad_norm": 1.8961471319198608, + "learning_rate": 5e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.7491726875305176, + "num_tokens": 266518601.0, + "step": 10304 + }, + { + "epoch": 1.1316714254337799, + "grad_norm": 2.016820192337036, + "learning_rate": 5e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.7453491687774658, + "num_tokens": 266541642.0, + "step": 10305 + }, + { + "epoch": 1.1317812431363936, + "grad_norm": 1.9001967906951904, + "learning_rate": 5e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7292273640632629, + "num_tokens": 266565772.0, + "step": 10306 + }, + { + "epoch": 1.1318910608390071, + "grad_norm": 2.033510684967041, + "learning_rate": 5e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7522232532501221, + "num_tokens": 266589402.0, + "step": 10307 + }, + { + "epoch": 1.132000878541621, + "grad_norm": 1.8334439992904663, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7412259578704834, + "num_tokens": 266616903.0, + "step": 10308 + }, + { + "epoch": 1.1321106962442347, + "grad_norm": 1.808637261390686, + "learning_rate": 5e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7296087145805359, + "num_tokens": 266644243.0, + "step": 10309 + }, + { + "epoch": 1.1322205139468482, + "grad_norm": 2.1754302978515625, + "learning_rate": 5e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7233944535255432, + "num_tokens": 266665280.0, + "step": 10310 + }, + { + "epoch": 1.132330331649462, + "grad_norm": 1.7942712306976318, + "learning_rate": 5e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.725243091583252, + "num_tokens": 266695890.0, + "step": 10311 + }, + { + "epoch": 1.1324401493520755, + "grad_norm": 1.8486618995666504, + "learning_rate": 5e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7305566072463989, + "num_tokens": 266721896.0, + "step": 10312 + }, + { + "epoch": 1.1325499670546892, + "grad_norm": 2.0045816898345947, + "learning_rate": 5e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.744996190071106, + "num_tokens": 266745382.0, + "step": 10313 + }, + { + "epoch": 1.132659784757303, + "grad_norm": 1.9568965435028076, + "learning_rate": 5e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7452257871627808, + "num_tokens": 266768597.0, + "step": 10314 + }, + { + "epoch": 1.1327696024599165, + "grad_norm": 1.7806276082992554, + "learning_rate": 5e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7245242595672607, + "num_tokens": 266794423.0, + "step": 10315 + }, + { + "epoch": 1.1328794201625303, + "grad_norm": 1.7759901285171509, + "learning_rate": 5e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7371845245361328, + "num_tokens": 266821907.0, + "step": 10316 + }, + { + "epoch": 1.1329892378651438, + "grad_norm": 1.988303542137146, + "learning_rate": 5e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.778351902961731, + "num_tokens": 266842914.0, + "step": 10317 + }, + { + "epoch": 1.1330990555677576, + "grad_norm": 1.892325520515442, + "learning_rate": 5e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7354663610458374, + "num_tokens": 266869793.0, + "step": 10318 + }, + { + "epoch": 1.1332088732703711, + "grad_norm": 2.059986114501953, + "learning_rate": 5e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.718569815158844, + "num_tokens": 266892390.0, + "step": 10319 + }, + { + "epoch": 1.1333186909729849, + "grad_norm": 2.0825397968292236, + "learning_rate": 5e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7308648824691772, + "num_tokens": 266915028.0, + "step": 10320 + }, + { + "epoch": 1.1334285086755984, + "grad_norm": 1.880938172340393, + "learning_rate": 5e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7616454362869263, + "num_tokens": 266938239.0, + "step": 10321 + }, + { + "epoch": 1.1335383263782122, + "grad_norm": 1.8394558429718018, + "learning_rate": 5e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7295336723327637, + "num_tokens": 266962770.0, + "step": 10322 + }, + { + "epoch": 1.133648144080826, + "grad_norm": 1.612240195274353, + "learning_rate": 5e-06, + "loss": 0.784, + "mean_token_accuracy": 0.7494621872901917, + "num_tokens": 266992789.0, + "step": 10323 + }, + { + "epoch": 1.1337579617834395, + "grad_norm": 2.009694814682007, + "learning_rate": 5e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7106114625930786, + "num_tokens": 267018576.0, + "step": 10324 + }, + { + "epoch": 1.1338677794860532, + "grad_norm": 1.9481608867645264, + "learning_rate": 5e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7263352870941162, + "num_tokens": 267046032.0, + "step": 10325 + }, + { + "epoch": 1.1339775971886668, + "grad_norm": 1.9160360097885132, + "learning_rate": 5e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7314930558204651, + "num_tokens": 267074742.0, + "step": 10326 + }, + { + "epoch": 1.1340874148912805, + "grad_norm": 2.2287795543670654, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7489765882492065, + "num_tokens": 267094691.0, + "step": 10327 + }, + { + "epoch": 1.134197232593894, + "grad_norm": 1.8333008289337158, + "learning_rate": 5e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7284404635429382, + "num_tokens": 267122695.0, + "step": 10328 + }, + { + "epoch": 1.1343070502965078, + "grad_norm": 2.066436767578125, + "learning_rate": 5e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7202847599983215, + "num_tokens": 267147553.0, + "step": 10329 + }, + { + "epoch": 1.1344168679991213, + "grad_norm": 1.70428466796875, + "learning_rate": 5e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7344731688499451, + "num_tokens": 267178044.0, + "step": 10330 + }, + { + "epoch": 1.134526685701735, + "grad_norm": 1.7112131118774414, + "learning_rate": 5e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7435646653175354, + "num_tokens": 267207029.0, + "step": 10331 + }, + { + "epoch": 1.1346365034043489, + "grad_norm": 1.8481857776641846, + "learning_rate": 5e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7107189297676086, + "num_tokens": 267235923.0, + "step": 10332 + }, + { + "epoch": 1.1347463211069624, + "grad_norm": 1.882333755493164, + "learning_rate": 5e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.715643584728241, + "num_tokens": 267264996.0, + "step": 10333 + }, + { + "epoch": 1.1348561388095761, + "grad_norm": 1.9215787649154663, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7410789728164673, + "num_tokens": 267290674.0, + "step": 10334 + }, + { + "epoch": 1.1349659565121897, + "grad_norm": 1.884810209274292, + "learning_rate": 5e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.729337751865387, + "num_tokens": 267318206.0, + "step": 10335 + }, + { + "epoch": 1.1350757742148034, + "grad_norm": 2.043541431427002, + "learning_rate": 5e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7511749267578125, + "num_tokens": 267341095.0, + "step": 10336 + }, + { + "epoch": 1.1351855919174172, + "grad_norm": 1.9891644716262817, + "learning_rate": 5e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7247568368911743, + "num_tokens": 267365496.0, + "step": 10337 + }, + { + "epoch": 1.1352954096200307, + "grad_norm": 1.9443222284317017, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7249606251716614, + "num_tokens": 267390711.0, + "step": 10338 + }, + { + "epoch": 1.1354052273226445, + "grad_norm": 1.7421208620071411, + "learning_rate": 5e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7485003471374512, + "num_tokens": 267416993.0, + "step": 10339 + }, + { + "epoch": 1.135515045025258, + "grad_norm": 1.908634901046753, + "learning_rate": 5e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.740164041519165, + "num_tokens": 267441750.0, + "step": 10340 + }, + { + "epoch": 1.1356248627278718, + "grad_norm": 2.0909907817840576, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7474629878997803, + "num_tokens": 267461957.0, + "step": 10341 + }, + { + "epoch": 1.1357346804304853, + "grad_norm": 1.997694730758667, + "learning_rate": 5e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7500133514404297, + "num_tokens": 267485477.0, + "step": 10342 + }, + { + "epoch": 1.135844498133099, + "grad_norm": 2.2435920238494873, + "learning_rate": 5e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.7729015946388245, + "num_tokens": 267505829.0, + "step": 10343 + }, + { + "epoch": 1.1359543158357126, + "grad_norm": 1.8618428707122803, + "learning_rate": 5e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7001669406890869, + "num_tokens": 267534981.0, + "step": 10344 + }, + { + "epoch": 1.1360641335383264, + "grad_norm": 1.9834790229797363, + "learning_rate": 5e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7513043880462646, + "num_tokens": 267559003.0, + "step": 10345 + }, + { + "epoch": 1.1361739512409401, + "grad_norm": 1.8846299648284912, + "learning_rate": 5e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7491528987884521, + "num_tokens": 267583765.0, + "step": 10346 + }, + { + "epoch": 1.1362837689435537, + "grad_norm": 1.8338956832885742, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7280063629150391, + "num_tokens": 267613361.0, + "step": 10347 + }, + { + "epoch": 1.1363935866461674, + "grad_norm": 2.0672693252563477, + "learning_rate": 5e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7291121482849121, + "num_tokens": 267636564.0, + "step": 10348 + }, + { + "epoch": 1.136503404348781, + "grad_norm": 1.8261139392852783, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7471848726272583, + "num_tokens": 267662172.0, + "step": 10349 + }, + { + "epoch": 1.1366132220513947, + "grad_norm": 1.7895863056182861, + "learning_rate": 5e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7549477815628052, + "num_tokens": 267689095.0, + "step": 10350 + }, + { + "epoch": 1.1367230397540085, + "grad_norm": 2.1052234172821045, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7262505888938904, + "num_tokens": 267711846.0, + "step": 10351 + }, + { + "epoch": 1.136832857456622, + "grad_norm": 1.864933967590332, + "learning_rate": 5e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7408164143562317, + "num_tokens": 267737603.0, + "step": 10352 + }, + { + "epoch": 1.1369426751592357, + "grad_norm": 1.632753610610962, + "learning_rate": 5e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7422206401824951, + "num_tokens": 267770535.0, + "step": 10353 + }, + { + "epoch": 1.1370524928618493, + "grad_norm": 1.8823291063308716, + "learning_rate": 5e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7445807456970215, + "num_tokens": 267796388.0, + "step": 10354 + }, + { + "epoch": 1.137162310564463, + "grad_norm": 2.0469071865081787, + "learning_rate": 5e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7243515849113464, + "num_tokens": 267818240.0, + "step": 10355 + }, + { + "epoch": 1.1372721282670766, + "grad_norm": 1.9605562686920166, + "learning_rate": 5e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7231755256652832, + "num_tokens": 267840531.0, + "step": 10356 + }, + { + "epoch": 1.1373819459696903, + "grad_norm": 2.1579926013946533, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7424322962760925, + "num_tokens": 267860513.0, + "step": 10357 + }, + { + "epoch": 1.1374917636723039, + "grad_norm": 1.7066749334335327, + "learning_rate": 5e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7156375050544739, + "num_tokens": 267892154.0, + "step": 10358 + }, + { + "epoch": 1.1376015813749176, + "grad_norm": 1.6473785638809204, + "learning_rate": 5e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7487548589706421, + "num_tokens": 267921737.0, + "step": 10359 + }, + { + "epoch": 1.1377113990775314, + "grad_norm": 2.2760109901428223, + "learning_rate": 5e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7451647520065308, + "num_tokens": 267940446.0, + "step": 10360 + }, + { + "epoch": 1.137821216780145, + "grad_norm": 1.8297638893127441, + "learning_rate": 5e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7530033588409424, + "num_tokens": 267963790.0, + "step": 10361 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 1.9260376691818237, + "learning_rate": 5e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7379517555236816, + "num_tokens": 267989058.0, + "step": 10362 + }, + { + "epoch": 1.1380408521853722, + "grad_norm": 1.9459753036499023, + "learning_rate": 5e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.741303563117981, + "num_tokens": 268013531.0, + "step": 10363 + }, + { + "epoch": 1.138150669887986, + "grad_norm": 1.7500802278518677, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7192306518554688, + "num_tokens": 268046624.0, + "step": 10364 + }, + { + "epoch": 1.1382604875905997, + "grad_norm": 2.1036033630371094, + "learning_rate": 5e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.7500562071800232, + "num_tokens": 268067006.0, + "step": 10365 + }, + { + "epoch": 1.1383703052932133, + "grad_norm": 1.7789968252182007, + "learning_rate": 5e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7188866138458252, + "num_tokens": 268094554.0, + "step": 10366 + }, + { + "epoch": 1.138480122995827, + "grad_norm": 1.7406896352767944, + "learning_rate": 5e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6962267160415649, + "num_tokens": 268127112.0, + "step": 10367 + }, + { + "epoch": 1.1385899406984406, + "grad_norm": 1.875299334526062, + "learning_rate": 5e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7514991164207458, + "num_tokens": 268151050.0, + "step": 10368 + }, + { + "epoch": 1.1386997584010543, + "grad_norm": 1.7893935441970825, + "learning_rate": 5e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7386608719825745, + "num_tokens": 268180391.0, + "step": 10369 + }, + { + "epoch": 1.1388095761036678, + "grad_norm": 1.8677583932876587, + "learning_rate": 5e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7272317409515381, + "num_tokens": 268207578.0, + "step": 10370 + }, + { + "epoch": 1.1389193938062816, + "grad_norm": 1.9520856142044067, + "learning_rate": 5e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7198251485824585, + "num_tokens": 268233249.0, + "step": 10371 + }, + { + "epoch": 1.1390292115088951, + "grad_norm": 1.5750887393951416, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7536946535110474, + "num_tokens": 268265746.0, + "step": 10372 + }, + { + "epoch": 1.139139029211509, + "grad_norm": 2.0325279235839844, + "learning_rate": 5e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7515823841094971, + "num_tokens": 268286765.0, + "step": 10373 + }, + { + "epoch": 1.1392488469141226, + "grad_norm": 1.9074088335037231, + "learning_rate": 5e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7408711314201355, + "num_tokens": 268311538.0, + "step": 10374 + }, + { + "epoch": 1.1393586646167362, + "grad_norm": 1.7170428037643433, + "learning_rate": 5e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7235994338989258, + "num_tokens": 268343128.0, + "step": 10375 + }, + { + "epoch": 1.13946848231935, + "grad_norm": 1.8648266792297363, + "learning_rate": 5e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7391703128814697, + "num_tokens": 268369830.0, + "step": 10376 + }, + { + "epoch": 1.1395783000219635, + "grad_norm": 1.777361512184143, + "learning_rate": 5e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7246614098548889, + "num_tokens": 268398898.0, + "step": 10377 + }, + { + "epoch": 1.1396881177245772, + "grad_norm": 1.9498106241226196, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7403606176376343, + "num_tokens": 268422066.0, + "step": 10378 + }, + { + "epoch": 1.1397979354271908, + "grad_norm": 1.7773492336273193, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7269535064697266, + "num_tokens": 268449640.0, + "step": 10379 + }, + { + "epoch": 1.1399077531298045, + "grad_norm": 2.1066083908081055, + "learning_rate": 5e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7602186799049377, + "num_tokens": 268469260.0, + "step": 10380 + }, + { + "epoch": 1.1400175708324183, + "grad_norm": 1.8591581583023071, + "learning_rate": 5e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7173137068748474, + "num_tokens": 268496664.0, + "step": 10381 + }, + { + "epoch": 1.1401273885350318, + "grad_norm": 1.8013849258422852, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7132686972618103, + "num_tokens": 268522740.0, + "step": 10382 + }, + { + "epoch": 1.1402372062376456, + "grad_norm": 2.018059253692627, + "learning_rate": 5e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7443875670433044, + "num_tokens": 268545900.0, + "step": 10383 + }, + { + "epoch": 1.140347023940259, + "grad_norm": 1.7262282371520996, + "learning_rate": 5e-06, + "loss": 0.7954, + "mean_token_accuracy": 0.744459331035614, + "num_tokens": 268576527.0, + "step": 10384 + }, + { + "epoch": 1.1404568416428729, + "grad_norm": 1.856674313545227, + "learning_rate": 5e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7423462867736816, + "num_tokens": 268605528.0, + "step": 10385 + }, + { + "epoch": 1.1405666593454864, + "grad_norm": 2.2024548053741455, + "learning_rate": 5e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7473546266555786, + "num_tokens": 268625254.0, + "step": 10386 + }, + { + "epoch": 1.1406764770481002, + "grad_norm": 1.777669072151184, + "learning_rate": 5e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7024568319320679, + "num_tokens": 268655231.0, + "step": 10387 + }, + { + "epoch": 1.140786294750714, + "grad_norm": 1.7672481536865234, + "learning_rate": 5e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7449864745140076, + "num_tokens": 268682805.0, + "step": 10388 + }, + { + "epoch": 1.1408961124533274, + "grad_norm": 1.906378984451294, + "learning_rate": 5e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7353002429008484, + "num_tokens": 268706802.0, + "step": 10389 + }, + { + "epoch": 1.1410059301559412, + "grad_norm": 2.1057844161987305, + "learning_rate": 5e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7532844543457031, + "num_tokens": 268727434.0, + "step": 10390 + }, + { + "epoch": 1.1411157478585547, + "grad_norm": 1.7948907613754272, + "learning_rate": 5e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7388625144958496, + "num_tokens": 268753884.0, + "step": 10391 + }, + { + "epoch": 1.1412255655611685, + "grad_norm": 1.6138111352920532, + "learning_rate": 5e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7301506996154785, + "num_tokens": 268786071.0, + "step": 10392 + }, + { + "epoch": 1.141335383263782, + "grad_norm": 1.9410591125488281, + "learning_rate": 5e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7345683574676514, + "num_tokens": 268809413.0, + "step": 10393 + }, + { + "epoch": 1.1414452009663958, + "grad_norm": 1.7779532670974731, + "learning_rate": 5e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7455506324768066, + "num_tokens": 268834952.0, + "step": 10394 + }, + { + "epoch": 1.1415550186690093, + "grad_norm": 1.7059935331344604, + "learning_rate": 5e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7216566801071167, + "num_tokens": 268863937.0, + "step": 10395 + }, + { + "epoch": 1.141664836371623, + "grad_norm": 2.0688607692718506, + "learning_rate": 5e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.733478844165802, + "num_tokens": 268888414.0, + "step": 10396 + }, + { + "epoch": 1.1417746540742368, + "grad_norm": 2.017838716506958, + "learning_rate": 5e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.70762038230896, + "num_tokens": 268913387.0, + "step": 10397 + }, + { + "epoch": 1.1418844717768504, + "grad_norm": 1.9246490001678467, + "learning_rate": 5e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7531334161758423, + "num_tokens": 268937133.0, + "step": 10398 + }, + { + "epoch": 1.1419942894794641, + "grad_norm": 2.0683553218841553, + "learning_rate": 5e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7329518795013428, + "num_tokens": 268960856.0, + "step": 10399 + }, + { + "epoch": 1.1421041071820777, + "grad_norm": 1.7793024778366089, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7436001300811768, + "num_tokens": 268987076.0, + "step": 10400 + }, + { + "epoch": 1.1422139248846914, + "grad_norm": 1.7891533374786377, + "learning_rate": 5e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7387568950653076, + "num_tokens": 269012429.0, + "step": 10401 + }, + { + "epoch": 1.1423237425873052, + "grad_norm": 1.7486191987991333, + "learning_rate": 5e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7416322231292725, + "num_tokens": 269042771.0, + "step": 10402 + }, + { + "epoch": 1.1424335602899187, + "grad_norm": 1.7301466464996338, + "learning_rate": 5e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7381272912025452, + "num_tokens": 269070653.0, + "step": 10403 + }, + { + "epoch": 1.1425433779925325, + "grad_norm": 2.160466194152832, + "learning_rate": 5e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7681272625923157, + "num_tokens": 269090686.0, + "step": 10404 + }, + { + "epoch": 1.142653195695146, + "grad_norm": 1.8374608755111694, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7300666570663452, + "num_tokens": 269119250.0, + "step": 10405 + }, + { + "epoch": 1.1427630133977598, + "grad_norm": 1.874393343925476, + "learning_rate": 5e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7213722467422485, + "num_tokens": 269143943.0, + "step": 10406 + }, + { + "epoch": 1.1428728311003733, + "grad_norm": 1.835020899772644, + "learning_rate": 5e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7425161600112915, + "num_tokens": 269170728.0, + "step": 10407 + }, + { + "epoch": 1.142982648802987, + "grad_norm": 2.11185359954834, + "learning_rate": 5e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7544800043106079, + "num_tokens": 269192810.0, + "step": 10408 + }, + { + "epoch": 1.1430924665056006, + "grad_norm": 1.9651286602020264, + "learning_rate": 5e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7163997888565063, + "num_tokens": 269219023.0, + "step": 10409 + }, + { + "epoch": 1.1432022842082143, + "grad_norm": 2.114426374435425, + "learning_rate": 5e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7217602729797363, + "num_tokens": 269242617.0, + "step": 10410 + }, + { + "epoch": 1.143312101910828, + "grad_norm": 2.017808675765991, + "learning_rate": 5e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7391676902770996, + "num_tokens": 269267016.0, + "step": 10411 + }, + { + "epoch": 1.1434219196134416, + "grad_norm": 1.761852502822876, + "learning_rate": 5e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7276136875152588, + "num_tokens": 269295880.0, + "step": 10412 + }, + { + "epoch": 1.1435317373160554, + "grad_norm": 1.9861019849777222, + "learning_rate": 5e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7322463989257812, + "num_tokens": 269320144.0, + "step": 10413 + }, + { + "epoch": 1.143641555018669, + "grad_norm": 2.012096881866455, + "learning_rate": 5e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7611371278762817, + "num_tokens": 269340747.0, + "step": 10414 + }, + { + "epoch": 1.1437513727212827, + "grad_norm": 1.8193857669830322, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7412242889404297, + "num_tokens": 269368093.0, + "step": 10415 + }, + { + "epoch": 1.1438611904238964, + "grad_norm": 1.855982780456543, + "learning_rate": 5e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.734115481376648, + "num_tokens": 269397063.0, + "step": 10416 + }, + { + "epoch": 1.14397100812651, + "grad_norm": 2.0621979236602783, + "learning_rate": 5e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7533285617828369, + "num_tokens": 269417752.0, + "step": 10417 + }, + { + "epoch": 1.1440808258291237, + "grad_norm": 1.9742603302001953, + "learning_rate": 5e-06, + "loss": 0.924, + "mean_token_accuracy": 0.717761218547821, + "num_tokens": 269443223.0, + "step": 10418 + }, + { + "epoch": 1.1441906435317373, + "grad_norm": 2.0239243507385254, + "learning_rate": 5e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7198473811149597, + "num_tokens": 269465286.0, + "step": 10419 + }, + { + "epoch": 1.144300461234351, + "grad_norm": 1.8842307329177856, + "learning_rate": 5e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.7306316494941711, + "num_tokens": 269491101.0, + "step": 10420 + }, + { + "epoch": 1.1444102789369646, + "grad_norm": 1.882619023323059, + "learning_rate": 5e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7116094827651978, + "num_tokens": 269518154.0, + "step": 10421 + }, + { + "epoch": 1.1445200966395783, + "grad_norm": 2.090517044067383, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7624760270118713, + "num_tokens": 269538682.0, + "step": 10422 + }, + { + "epoch": 1.1446299143421919, + "grad_norm": 1.8994054794311523, + "learning_rate": 5e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7262759208679199, + "num_tokens": 269567474.0, + "step": 10423 + }, + { + "epoch": 1.1447397320448056, + "grad_norm": 2.0031466484069824, + "learning_rate": 5e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7219498157501221, + "num_tokens": 269591427.0, + "step": 10424 + }, + { + "epoch": 1.1448495497474194, + "grad_norm": 1.705257534980774, + "learning_rate": 5e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7345134019851685, + "num_tokens": 269622293.0, + "step": 10425 + }, + { + "epoch": 1.144959367450033, + "grad_norm": 2.074483633041382, + "learning_rate": 5e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7510846853256226, + "num_tokens": 269642658.0, + "step": 10426 + }, + { + "epoch": 1.1450691851526467, + "grad_norm": 1.9350082874298096, + "learning_rate": 5e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7174965143203735, + "num_tokens": 269668999.0, + "step": 10427 + }, + { + "epoch": 1.1451790028552602, + "grad_norm": 1.8787434101104736, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7216917872428894, + "num_tokens": 269694964.0, + "step": 10428 + }, + { + "epoch": 1.145288820557874, + "grad_norm": 1.948797583580017, + "learning_rate": 5e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.721208393573761, + "num_tokens": 269721828.0, + "step": 10429 + }, + { + "epoch": 1.1453986382604877, + "grad_norm": 1.8889070749282837, + "learning_rate": 5e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7186406254768372, + "num_tokens": 269749425.0, + "step": 10430 + }, + { + "epoch": 1.1455084559631012, + "grad_norm": 1.7993355989456177, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7234551906585693, + "num_tokens": 269777762.0, + "step": 10431 + }, + { + "epoch": 1.145618273665715, + "grad_norm": 2.0697336196899414, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7615204453468323, + "num_tokens": 269797771.0, + "step": 10432 + }, + { + "epoch": 1.1457280913683285, + "grad_norm": 1.977924108505249, + "learning_rate": 5e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7321836948394775, + "num_tokens": 269821742.0, + "step": 10433 + }, + { + "epoch": 1.1458379090709423, + "grad_norm": 2.0927984714508057, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.754673182964325, + "num_tokens": 269846454.0, + "step": 10434 + }, + { + "epoch": 1.1459477267735558, + "grad_norm": 1.6417194604873657, + "learning_rate": 5e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.724513053894043, + "num_tokens": 269879900.0, + "step": 10435 + }, + { + "epoch": 1.1460575444761696, + "grad_norm": 1.7795846462249756, + "learning_rate": 5e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7325153350830078, + "num_tokens": 269909564.0, + "step": 10436 + }, + { + "epoch": 1.1461673621787831, + "grad_norm": 1.838525414466858, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.722925066947937, + "num_tokens": 269937472.0, + "step": 10437 + }, + { + "epoch": 1.1462771798813969, + "grad_norm": 1.8460294008255005, + "learning_rate": 5e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7457220554351807, + "num_tokens": 269965012.0, + "step": 10438 + }, + { + "epoch": 1.1463869975840106, + "grad_norm": 2.167630195617676, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7410435676574707, + "num_tokens": 269987167.0, + "step": 10439 + }, + { + "epoch": 1.1464968152866242, + "grad_norm": 2.0159473419189453, + "learning_rate": 5e-06, + "loss": 0.845, + "mean_token_accuracy": 0.725403904914856, + "num_tokens": 270010327.0, + "step": 10440 + }, + { + "epoch": 1.146606632989238, + "grad_norm": 1.7109346389770508, + "learning_rate": 5e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7520121335983276, + "num_tokens": 270039314.0, + "step": 10441 + }, + { + "epoch": 1.1467164506918515, + "grad_norm": 1.6166540384292603, + "learning_rate": 5e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7466473579406738, + "num_tokens": 270072316.0, + "step": 10442 + }, + { + "epoch": 1.1468262683944652, + "grad_norm": 1.9793612957000732, + "learning_rate": 5e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7316229343414307, + "num_tokens": 270095936.0, + "step": 10443 + }, + { + "epoch": 1.1469360860970788, + "grad_norm": 1.844164252281189, + "learning_rate": 5e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7113825082778931, + "num_tokens": 270121427.0, + "step": 10444 + }, + { + "epoch": 1.1470459037996925, + "grad_norm": 1.7600504159927368, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7525392770767212, + "num_tokens": 270148703.0, + "step": 10445 + }, + { + "epoch": 1.147155721502306, + "grad_norm": 1.8285273313522339, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7178180813789368, + "num_tokens": 270177071.0, + "step": 10446 + }, + { + "epoch": 1.1472655392049198, + "grad_norm": 1.991891860961914, + "learning_rate": 5e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7104576826095581, + "num_tokens": 270206453.0, + "step": 10447 + }, + { + "epoch": 1.1473753569075336, + "grad_norm": 1.8044657707214355, + "learning_rate": 5e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7393307685852051, + "num_tokens": 270233053.0, + "step": 10448 + }, + { + "epoch": 1.147485174610147, + "grad_norm": 1.8651912212371826, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7507714033126831, + "num_tokens": 270257094.0, + "step": 10449 + }, + { + "epoch": 1.1475949923127609, + "grad_norm": 1.8808598518371582, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7435379028320312, + "num_tokens": 270281182.0, + "step": 10450 + }, + { + "epoch": 1.1477048100153744, + "grad_norm": 1.8207553625106812, + "learning_rate": 5e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7399207353591919, + "num_tokens": 270311620.0, + "step": 10451 + }, + { + "epoch": 1.1478146277179881, + "grad_norm": 1.982848048210144, + "learning_rate": 5e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7450873851776123, + "num_tokens": 270336783.0, + "step": 10452 + }, + { + "epoch": 1.147924445420602, + "grad_norm": 2.059399366378784, + "learning_rate": 5e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7313081622123718, + "num_tokens": 270361190.0, + "step": 10453 + }, + { + "epoch": 1.1480342631232154, + "grad_norm": 1.955222487449646, + "learning_rate": 5e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7277141809463501, + "num_tokens": 270385253.0, + "step": 10454 + }, + { + "epoch": 1.1481440808258292, + "grad_norm": 1.7320414781570435, + "learning_rate": 5e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7399868965148926, + "num_tokens": 270415607.0, + "step": 10455 + }, + { + "epoch": 1.1482538985284427, + "grad_norm": 1.9856042861938477, + "learning_rate": 5e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.722756028175354, + "num_tokens": 270441640.0, + "step": 10456 + }, + { + "epoch": 1.1483637162310565, + "grad_norm": 2.1299657821655273, + "learning_rate": 5e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7501358389854431, + "num_tokens": 270463278.0, + "step": 10457 + }, + { + "epoch": 1.14847353393367, + "grad_norm": 1.8243199586868286, + "learning_rate": 5e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7300844788551331, + "num_tokens": 270490344.0, + "step": 10458 + }, + { + "epoch": 1.1485833516362838, + "grad_norm": 1.8663873672485352, + "learning_rate": 5e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7487396001815796, + "num_tokens": 270518373.0, + "step": 10459 + }, + { + "epoch": 1.1486931693388973, + "grad_norm": 1.842496395111084, + "learning_rate": 5e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7292937636375427, + "num_tokens": 270548653.0, + "step": 10460 + }, + { + "epoch": 1.148802987041511, + "grad_norm": 1.9966225624084473, + "learning_rate": 5e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.746546745300293, + "num_tokens": 270572591.0, + "step": 10461 + }, + { + "epoch": 1.1489128047441248, + "grad_norm": 1.8869739770889282, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7300652265548706, + "num_tokens": 270599464.0, + "step": 10462 + }, + { + "epoch": 1.1490226224467384, + "grad_norm": 1.7681068181991577, + "learning_rate": 5e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7532054781913757, + "num_tokens": 270628587.0, + "step": 10463 + }, + { + "epoch": 1.1491324401493521, + "grad_norm": 2.006748676300049, + "learning_rate": 5e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7500244379043579, + "num_tokens": 270651346.0, + "step": 10464 + }, + { + "epoch": 1.1492422578519657, + "grad_norm": 2.065138578414917, + "learning_rate": 5e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7505528926849365, + "num_tokens": 270672342.0, + "step": 10465 + }, + { + "epoch": 1.1493520755545794, + "grad_norm": 2.144911766052246, + "learning_rate": 5e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.731399655342102, + "num_tokens": 270694242.0, + "step": 10466 + }, + { + "epoch": 1.1494618932571932, + "grad_norm": 1.962894082069397, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7312489748001099, + "num_tokens": 270720014.0, + "step": 10467 + }, + { + "epoch": 1.1495717109598067, + "grad_norm": 1.738773226737976, + "learning_rate": 5e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7349012494087219, + "num_tokens": 270749302.0, + "step": 10468 + }, + { + "epoch": 1.1496815286624205, + "grad_norm": 1.8754334449768066, + "learning_rate": 5e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7342350482940674, + "num_tokens": 270778713.0, + "step": 10469 + }, + { + "epoch": 1.149791346365034, + "grad_norm": 2.0633983612060547, + "learning_rate": 5e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7327634692192078, + "num_tokens": 270803863.0, + "step": 10470 + }, + { + "epoch": 1.1499011640676478, + "grad_norm": 2.569892406463623, + "learning_rate": 5e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.731415867805481, + "num_tokens": 270823802.0, + "step": 10471 + }, + { + "epoch": 1.1500109817702613, + "grad_norm": 1.866256594657898, + "learning_rate": 5e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7251654267311096, + "num_tokens": 270851759.0, + "step": 10472 + }, + { + "epoch": 1.150120799472875, + "grad_norm": 1.9270623922348022, + "learning_rate": 5e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7160900831222534, + "num_tokens": 270878093.0, + "step": 10473 + }, + { + "epoch": 1.1502306171754886, + "grad_norm": 1.8799195289611816, + "learning_rate": 5e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7295328974723816, + "num_tokens": 270904610.0, + "step": 10474 + }, + { + "epoch": 1.1503404348781023, + "grad_norm": 1.804516315460205, + "learning_rate": 5e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7234776020050049, + "num_tokens": 270933048.0, + "step": 10475 + }, + { + "epoch": 1.150450252580716, + "grad_norm": 2.1357064247131348, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7599236369132996, + "num_tokens": 270954195.0, + "step": 10476 + }, + { + "epoch": 1.1505600702833296, + "grad_norm": 1.8408855199813843, + "learning_rate": 5e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.714543342590332, + "num_tokens": 270978926.0, + "step": 10477 + }, + { + "epoch": 1.1506698879859434, + "grad_norm": 2.00288462638855, + "learning_rate": 5e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7421910166740417, + "num_tokens": 271003086.0, + "step": 10478 + }, + { + "epoch": 1.150779705688557, + "grad_norm": 1.6542863845825195, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7447413206100464, + "num_tokens": 271031738.0, + "step": 10479 + }, + { + "epoch": 1.1508895233911707, + "grad_norm": 1.8376446962356567, + "learning_rate": 5e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.717188835144043, + "num_tokens": 271058880.0, + "step": 10480 + }, + { + "epoch": 1.1509993410937844, + "grad_norm": 1.829795241355896, + "learning_rate": 5e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7505124807357788, + "num_tokens": 271086024.0, + "step": 10481 + }, + { + "epoch": 1.151109158796398, + "grad_norm": 1.8327022790908813, + "learning_rate": 5e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7229794859886169, + "num_tokens": 271113132.0, + "step": 10482 + }, + { + "epoch": 1.1512189764990117, + "grad_norm": 2.0991697311401367, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7513648271560669, + "num_tokens": 271133211.0, + "step": 10483 + }, + { + "epoch": 1.1513287942016253, + "grad_norm": 1.758589744567871, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7258366346359253, + "num_tokens": 271160102.0, + "step": 10484 + }, + { + "epoch": 1.151438611904239, + "grad_norm": 1.9803184270858765, + "learning_rate": 5e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7264102101325989, + "num_tokens": 271185990.0, + "step": 10485 + }, + { + "epoch": 1.1515484296068526, + "grad_norm": 2.066784143447876, + "learning_rate": 5e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7589597702026367, + "num_tokens": 271206636.0, + "step": 10486 + }, + { + "epoch": 1.1516582473094663, + "grad_norm": 1.7357581853866577, + "learning_rate": 5e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7122900485992432, + "num_tokens": 271238930.0, + "step": 10487 + }, + { + "epoch": 1.1517680650120798, + "grad_norm": 2.024439811706543, + "learning_rate": 5e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7523385286331177, + "num_tokens": 271261221.0, + "step": 10488 + }, + { + "epoch": 1.1518778827146936, + "grad_norm": 1.8528743982315063, + "learning_rate": 5e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7369650602340698, + "num_tokens": 271287006.0, + "step": 10489 + }, + { + "epoch": 1.1519877004173074, + "grad_norm": 2.031174659729004, + "learning_rate": 5e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7315368056297302, + "num_tokens": 271308913.0, + "step": 10490 + }, + { + "epoch": 1.152097518119921, + "grad_norm": 1.944876790046692, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7362728118896484, + "num_tokens": 271333875.0, + "step": 10491 + }, + { + "epoch": 1.1522073358225347, + "grad_norm": 2.0009613037109375, + "learning_rate": 5e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7446075081825256, + "num_tokens": 271357211.0, + "step": 10492 + }, + { + "epoch": 1.1523171535251482, + "grad_norm": 1.6387696266174316, + "learning_rate": 5e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7157846689224243, + "num_tokens": 271390179.0, + "step": 10493 + }, + { + "epoch": 1.152426971227762, + "grad_norm": 1.8399739265441895, + "learning_rate": 5e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7068010568618774, + "num_tokens": 271417779.0, + "step": 10494 + }, + { + "epoch": 1.1525367889303757, + "grad_norm": 1.800068736076355, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7229565382003784, + "num_tokens": 271445906.0, + "step": 10495 + }, + { + "epoch": 1.1526466066329892, + "grad_norm": 1.6578608751296997, + "learning_rate": 5e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7513672113418579, + "num_tokens": 271474575.0, + "step": 10496 + }, + { + "epoch": 1.152756424335603, + "grad_norm": 2.129283905029297, + "learning_rate": 5e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7501605153083801, + "num_tokens": 271495021.0, + "step": 10497 + }, + { + "epoch": 1.1528662420382165, + "grad_norm": 1.9947905540466309, + "learning_rate": 5e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7438703775405884, + "num_tokens": 271520353.0, + "step": 10498 + }, + { + "epoch": 1.1529760597408303, + "grad_norm": 1.9719022512435913, + "learning_rate": 5e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7440582513809204, + "num_tokens": 271543432.0, + "step": 10499 + }, + { + "epoch": 1.1530858774434438, + "grad_norm": 1.8044407367706299, + "learning_rate": 5e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7560882568359375, + "num_tokens": 271567147.0, + "step": 10500 + }, + { + "epoch": 1.1531956951460576, + "grad_norm": 2.303424596786499, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7472966909408569, + "num_tokens": 271592392.0, + "step": 10501 + }, + { + "epoch": 1.153305512848671, + "grad_norm": 1.9621312618255615, + "learning_rate": 5e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7240993976593018, + "num_tokens": 271616161.0, + "step": 10502 + }, + { + "epoch": 1.1534153305512849, + "grad_norm": 1.806027889251709, + "learning_rate": 5e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7375186681747437, + "num_tokens": 271642295.0, + "step": 10503 + }, + { + "epoch": 1.1535251482538986, + "grad_norm": 2.2123522758483887, + "learning_rate": 5e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7219867706298828, + "num_tokens": 271665211.0, + "step": 10504 + }, + { + "epoch": 1.1536349659565122, + "grad_norm": 1.9805017709732056, + "learning_rate": 5e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7441322207450867, + "num_tokens": 271686687.0, + "step": 10505 + }, + { + "epoch": 1.153744783659126, + "grad_norm": 1.9539910554885864, + "learning_rate": 5e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7365840673446655, + "num_tokens": 271709432.0, + "step": 10506 + }, + { + "epoch": 1.1538546013617395, + "grad_norm": 2.10481858253479, + "learning_rate": 5e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7335145473480225, + "num_tokens": 271732236.0, + "step": 10507 + }, + { + "epoch": 1.1539644190643532, + "grad_norm": 2.0138649940490723, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7196059226989746, + "num_tokens": 271758648.0, + "step": 10508 + }, + { + "epoch": 1.1540742367669667, + "grad_norm": 2.215938091278076, + "learning_rate": 5e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7434849739074707, + "num_tokens": 271779550.0, + "step": 10509 + }, + { + "epoch": 1.1541840544695805, + "grad_norm": 2.0083348751068115, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7288289070129395, + "num_tokens": 271803927.0, + "step": 10510 + }, + { + "epoch": 1.154293872172194, + "grad_norm": 2.102440357208252, + "learning_rate": 5e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7312736511230469, + "num_tokens": 271826345.0, + "step": 10511 + }, + { + "epoch": 1.1544036898748078, + "grad_norm": 2.237417459487915, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7483797669410706, + "num_tokens": 271844606.0, + "step": 10512 + }, + { + "epoch": 1.1545135075774215, + "grad_norm": 2.1981022357940674, + "learning_rate": 5e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7464327812194824, + "num_tokens": 271865565.0, + "step": 10513 + }, + { + "epoch": 1.154623325280035, + "grad_norm": 1.8417696952819824, + "learning_rate": 5e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7195974588394165, + "num_tokens": 271893713.0, + "step": 10514 + }, + { + "epoch": 1.1547331429826488, + "grad_norm": 1.8443963527679443, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7245221138000488, + "num_tokens": 271920737.0, + "step": 10515 + }, + { + "epoch": 1.1548429606852624, + "grad_norm": 1.8931565284729004, + "learning_rate": 5e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7220391035079956, + "num_tokens": 271949603.0, + "step": 10516 + }, + { + "epoch": 1.1549527783878761, + "grad_norm": 2.123182773590088, + "learning_rate": 5e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7341517210006714, + "num_tokens": 271970337.0, + "step": 10517 + }, + { + "epoch": 1.15506259609049, + "grad_norm": 2.0276191234588623, + "learning_rate": 5e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7419345378875732, + "num_tokens": 271994338.0, + "step": 10518 + }, + { + "epoch": 1.1551724137931034, + "grad_norm": 2.219165086746216, + "learning_rate": 5e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7363380193710327, + "num_tokens": 272015769.0, + "step": 10519 + }, + { + "epoch": 1.1552822314957172, + "grad_norm": 1.7507864236831665, + "learning_rate": 5e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.719470202922821, + "num_tokens": 272047784.0, + "step": 10520 + }, + { + "epoch": 1.1553920491983307, + "grad_norm": 2.0741403102874756, + "learning_rate": 5e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7301833629608154, + "num_tokens": 272069083.0, + "step": 10521 + }, + { + "epoch": 1.1555018669009445, + "grad_norm": 1.8471298217773438, + "learning_rate": 5e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7525962591171265, + "num_tokens": 272093487.0, + "step": 10522 + }, + { + "epoch": 1.155611684603558, + "grad_norm": 2.261660099029541, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7264158725738525, + "num_tokens": 272115094.0, + "step": 10523 + }, + { + "epoch": 1.1557215023061718, + "grad_norm": 1.9995472431182861, + "learning_rate": 5e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.748679518699646, + "num_tokens": 272138438.0, + "step": 10524 + }, + { + "epoch": 1.1558313200087853, + "grad_norm": 1.8333935737609863, + "learning_rate": 5e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7322182655334473, + "num_tokens": 272165609.0, + "step": 10525 + }, + { + "epoch": 1.155941137711399, + "grad_norm": 1.8871397972106934, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.736233115196228, + "num_tokens": 272189996.0, + "step": 10526 + }, + { + "epoch": 1.1560509554140128, + "grad_norm": 1.7848390340805054, + "learning_rate": 5e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7642858028411865, + "num_tokens": 272214901.0, + "step": 10527 + }, + { + "epoch": 1.1561607731166264, + "grad_norm": 1.9387993812561035, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7193790674209595, + "num_tokens": 272239161.0, + "step": 10528 + }, + { + "epoch": 1.15627059081924, + "grad_norm": 1.991495132446289, + "learning_rate": 5e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7183820009231567, + "num_tokens": 272265446.0, + "step": 10529 + }, + { + "epoch": 1.1563804085218536, + "grad_norm": 1.9713945388793945, + "learning_rate": 5e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7408016324043274, + "num_tokens": 272290235.0, + "step": 10530 + }, + { + "epoch": 1.1564902262244674, + "grad_norm": 2.06085467338562, + "learning_rate": 5e-06, + "loss": 0.6688, + "mean_token_accuracy": 0.7805024981498718, + "num_tokens": 272309637.0, + "step": 10531 + }, + { + "epoch": 1.1566000439270812, + "grad_norm": 2.2195396423339844, + "learning_rate": 5e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7338880300521851, + "num_tokens": 272329630.0, + "step": 10532 + }, + { + "epoch": 1.1567098616296947, + "grad_norm": 2.023160219192505, + "learning_rate": 5e-06, + "loss": 0.6517, + "mean_token_accuracy": 0.785786509513855, + "num_tokens": 272350013.0, + "step": 10533 + }, + { + "epoch": 1.1568196793323084, + "grad_norm": 1.7987638711929321, + "learning_rate": 5e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7101873159408569, + "num_tokens": 272378585.0, + "step": 10534 + }, + { + "epoch": 1.156929497034922, + "grad_norm": 2.028938055038452, + "learning_rate": 5e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7467580437660217, + "num_tokens": 272400888.0, + "step": 10535 + }, + { + "epoch": 1.1570393147375357, + "grad_norm": 1.950857162475586, + "learning_rate": 5e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7529542446136475, + "num_tokens": 272424124.0, + "step": 10536 + }, + { + "epoch": 1.1571491324401493, + "grad_norm": 2.1334171295166016, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7419737577438354, + "num_tokens": 272445423.0, + "step": 10537 + }, + { + "epoch": 1.157258950142763, + "grad_norm": 1.8458003997802734, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7296811938285828, + "num_tokens": 272472199.0, + "step": 10538 + }, + { + "epoch": 1.1573687678453766, + "grad_norm": 2.057340621948242, + "learning_rate": 5e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7486196756362915, + "num_tokens": 272493938.0, + "step": 10539 + }, + { + "epoch": 1.1574785855479903, + "grad_norm": 2.080808401107788, + "learning_rate": 5e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7322027683258057, + "num_tokens": 272516552.0, + "step": 10540 + }, + { + "epoch": 1.157588403250604, + "grad_norm": 1.6322067975997925, + "learning_rate": 5e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.6976940035820007, + "num_tokens": 272557167.0, + "step": 10541 + }, + { + "epoch": 1.1576982209532176, + "grad_norm": 1.757501482963562, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7257646322250366, + "num_tokens": 272588700.0, + "step": 10542 + }, + { + "epoch": 1.1578080386558314, + "grad_norm": 2.0129733085632324, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.732933759689331, + "num_tokens": 272613723.0, + "step": 10543 + }, + { + "epoch": 1.157917856358445, + "grad_norm": 1.7463324069976807, + "learning_rate": 5e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7583907842636108, + "num_tokens": 272642143.0, + "step": 10544 + }, + { + "epoch": 1.1580276740610587, + "grad_norm": 2.0921342372894287, + "learning_rate": 5e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7261872887611389, + "num_tokens": 272666786.0, + "step": 10545 + }, + { + "epoch": 1.1581374917636724, + "grad_norm": 2.0322272777557373, + "learning_rate": 5e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7169973850250244, + "num_tokens": 272690478.0, + "step": 10546 + }, + { + "epoch": 1.158247309466286, + "grad_norm": 1.8172459602355957, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7238128185272217, + "num_tokens": 272718486.0, + "step": 10547 + }, + { + "epoch": 1.1583571271688997, + "grad_norm": 1.7420241832733154, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7294847965240479, + "num_tokens": 272745817.0, + "step": 10548 + }, + { + "epoch": 1.1584669448715132, + "grad_norm": 1.752456545829773, + "learning_rate": 5e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7129607796669006, + "num_tokens": 272775181.0, + "step": 10549 + }, + { + "epoch": 1.158576762574127, + "grad_norm": 1.7555310726165771, + "learning_rate": 5e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7129479646682739, + "num_tokens": 272803697.0, + "step": 10550 + }, + { + "epoch": 1.1586865802767405, + "grad_norm": 1.9931628704071045, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7287122011184692, + "num_tokens": 272826938.0, + "step": 10551 + }, + { + "epoch": 1.1587963979793543, + "grad_norm": 1.9625859260559082, + "learning_rate": 5e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7306732535362244, + "num_tokens": 272852118.0, + "step": 10552 + }, + { + "epoch": 1.1589062156819678, + "grad_norm": 1.882049322128296, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.727176308631897, + "num_tokens": 272879116.0, + "step": 10553 + }, + { + "epoch": 1.1590160333845816, + "grad_norm": 1.8842699527740479, + "learning_rate": 5e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7571164965629578, + "num_tokens": 272903442.0, + "step": 10554 + }, + { + "epoch": 1.1591258510871953, + "grad_norm": 1.693953275680542, + "learning_rate": 5e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7062090039253235, + "num_tokens": 272936470.0, + "step": 10555 + }, + { + "epoch": 1.1592356687898089, + "grad_norm": 1.7989956140518188, + "learning_rate": 5e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7419277429580688, + "num_tokens": 272963253.0, + "step": 10556 + }, + { + "epoch": 1.1593454864924226, + "grad_norm": 1.8768454790115356, + "learning_rate": 5e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7230695486068726, + "num_tokens": 272988450.0, + "step": 10557 + }, + { + "epoch": 1.1594553041950362, + "grad_norm": 1.7026479244232178, + "learning_rate": 5e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6970256567001343, + "num_tokens": 273023489.0, + "step": 10558 + }, + { + "epoch": 1.15956512189765, + "grad_norm": 2.049851655960083, + "learning_rate": 5e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.729634165763855, + "num_tokens": 273046915.0, + "step": 10559 + }, + { + "epoch": 1.1596749396002635, + "grad_norm": 1.709646463394165, + "learning_rate": 5e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.716311514377594, + "num_tokens": 273077424.0, + "step": 10560 + }, + { + "epoch": 1.1597847573028772, + "grad_norm": 1.7067323923110962, + "learning_rate": 5e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7623903155326843, + "num_tokens": 273106465.0, + "step": 10561 + }, + { + "epoch": 1.159894575005491, + "grad_norm": 1.858727216720581, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7294083833694458, + "num_tokens": 273133544.0, + "step": 10562 + }, + { + "epoch": 1.1600043927081045, + "grad_norm": 1.7536914348602295, + "learning_rate": 5e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7279942631721497, + "num_tokens": 273164226.0, + "step": 10563 + }, + { + "epoch": 1.1601142104107183, + "grad_norm": 2.078972101211548, + "learning_rate": 5e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7504535913467407, + "num_tokens": 273186565.0, + "step": 10564 + }, + { + "epoch": 1.1602240281133318, + "grad_norm": 1.8867685794830322, + "learning_rate": 5e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7299178838729858, + "num_tokens": 273212423.0, + "step": 10565 + }, + { + "epoch": 1.1603338458159456, + "grad_norm": 1.9153258800506592, + "learning_rate": 5e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7301157116889954, + "num_tokens": 273238461.0, + "step": 10566 + }, + { + "epoch": 1.160443663518559, + "grad_norm": 2.0490777492523193, + "learning_rate": 5e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7450621128082275, + "num_tokens": 273261005.0, + "step": 10567 + }, + { + "epoch": 1.1605534812211729, + "grad_norm": 2.10979962348938, + "learning_rate": 5e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7467595338821411, + "num_tokens": 273281984.0, + "step": 10568 + }, + { + "epoch": 1.1606632989237866, + "grad_norm": 1.7865581512451172, + "learning_rate": 5e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.712742805480957, + "num_tokens": 273310762.0, + "step": 10569 + }, + { + "epoch": 1.1607731166264001, + "grad_norm": 2.083752155303955, + "learning_rate": 5e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7437350749969482, + "num_tokens": 273331566.0, + "step": 10570 + }, + { + "epoch": 1.160882934329014, + "grad_norm": 1.9127297401428223, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7299920320510864, + "num_tokens": 273355944.0, + "step": 10571 + }, + { + "epoch": 1.1609927520316274, + "grad_norm": 1.8417232036590576, + "learning_rate": 5e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7322469353675842, + "num_tokens": 273382193.0, + "step": 10572 + }, + { + "epoch": 1.1611025697342412, + "grad_norm": 1.7452319860458374, + "learning_rate": 5e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7219883799552917, + "num_tokens": 273414137.0, + "step": 10573 + }, + { + "epoch": 1.1612123874368547, + "grad_norm": 1.9458717107772827, + "learning_rate": 5e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7279959917068481, + "num_tokens": 273438712.0, + "step": 10574 + }, + { + "epoch": 1.1613222051394685, + "grad_norm": 1.7607109546661377, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7408518195152283, + "num_tokens": 273466751.0, + "step": 10575 + }, + { + "epoch": 1.161432022842082, + "grad_norm": 1.6631884574890137, + "learning_rate": 5e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7395117282867432, + "num_tokens": 273495762.0, + "step": 10576 + }, + { + "epoch": 1.1615418405446958, + "grad_norm": 1.6653883457183838, + "learning_rate": 5e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7134019136428833, + "num_tokens": 273526807.0, + "step": 10577 + }, + { + "epoch": 1.1616516582473095, + "grad_norm": 2.2790582180023193, + "learning_rate": 5e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7363066077232361, + "num_tokens": 273545766.0, + "step": 10578 + }, + { + "epoch": 1.161761475949923, + "grad_norm": 1.7761307954788208, + "learning_rate": 5e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7353755235671997, + "num_tokens": 273572405.0, + "step": 10579 + }, + { + "epoch": 1.1618712936525368, + "grad_norm": 1.835791826248169, + "learning_rate": 5e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7111315131187439, + "num_tokens": 273599984.0, + "step": 10580 + }, + { + "epoch": 1.1619811113551504, + "grad_norm": 1.8599706888198853, + "learning_rate": 5e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7145674824714661, + "num_tokens": 273627781.0, + "step": 10581 + }, + { + "epoch": 1.1620909290577641, + "grad_norm": 1.6900049448013306, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7336826920509338, + "num_tokens": 273661182.0, + "step": 10582 + }, + { + "epoch": 1.1622007467603779, + "grad_norm": 1.9986051321029663, + "learning_rate": 5e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7111016511917114, + "num_tokens": 273686542.0, + "step": 10583 + }, + { + "epoch": 1.1623105644629914, + "grad_norm": 2.0185532569885254, + "learning_rate": 5e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7408128380775452, + "num_tokens": 273708113.0, + "step": 10584 + }, + { + "epoch": 1.1624203821656052, + "grad_norm": 1.9272359609603882, + "learning_rate": 5e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7367233037948608, + "num_tokens": 273734490.0, + "step": 10585 + }, + { + "epoch": 1.1625301998682187, + "grad_norm": 2.0306694507598877, + "learning_rate": 5e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7432908415794373, + "num_tokens": 273755997.0, + "step": 10586 + }, + { + "epoch": 1.1626400175708325, + "grad_norm": 1.991093397140503, + "learning_rate": 5e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7279704809188843, + "num_tokens": 273780492.0, + "step": 10587 + }, + { + "epoch": 1.162749835273446, + "grad_norm": 1.8658077716827393, + "learning_rate": 5e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7526097297668457, + "num_tokens": 273804562.0, + "step": 10588 + }, + { + "epoch": 1.1628596529760598, + "grad_norm": 1.9244072437286377, + "learning_rate": 5e-06, + "loss": 0.6536, + "mean_token_accuracy": 0.7797623872756958, + "num_tokens": 273825454.0, + "step": 10589 + }, + { + "epoch": 1.1629694706786733, + "grad_norm": 1.8859624862670898, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.730202317237854, + "num_tokens": 273852225.0, + "step": 10590 + }, + { + "epoch": 1.163079288381287, + "grad_norm": 2.022481679916382, + "learning_rate": 5e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7476174831390381, + "num_tokens": 273874135.0, + "step": 10591 + }, + { + "epoch": 1.1631891060839008, + "grad_norm": 2.1417124271392822, + "learning_rate": 5e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7209609746932983, + "num_tokens": 273899342.0, + "step": 10592 + }, + { + "epoch": 1.1632989237865143, + "grad_norm": 1.8035006523132324, + "learning_rate": 5e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7514302730560303, + "num_tokens": 273925522.0, + "step": 10593 + }, + { + "epoch": 1.163408741489128, + "grad_norm": 1.8488514423370361, + "learning_rate": 5e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7487296462059021, + "num_tokens": 273948550.0, + "step": 10594 + }, + { + "epoch": 1.1635185591917416, + "grad_norm": 2.270972728729248, + "learning_rate": 5e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7455736398696899, + "num_tokens": 273967462.0, + "step": 10595 + }, + { + "epoch": 1.1636283768943554, + "grad_norm": 1.9377213716506958, + "learning_rate": 5e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7518852949142456, + "num_tokens": 273994178.0, + "step": 10596 + }, + { + "epoch": 1.1637381945969691, + "grad_norm": 1.968873381614685, + "learning_rate": 5e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7242188453674316, + "num_tokens": 274018939.0, + "step": 10597 + }, + { + "epoch": 1.1638480122995827, + "grad_norm": 2.1419060230255127, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7344908714294434, + "num_tokens": 274040623.0, + "step": 10598 + }, + { + "epoch": 1.1639578300021964, + "grad_norm": 1.9515005350112915, + "learning_rate": 5e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7537168264389038, + "num_tokens": 274063931.0, + "step": 10599 + }, + { + "epoch": 1.16406764770481, + "grad_norm": 1.7854080200195312, + "learning_rate": 5e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.746955394744873, + "num_tokens": 274090656.0, + "step": 10600 + }, + { + "epoch": 1.1641774654074237, + "grad_norm": 1.8115476369857788, + "learning_rate": 5e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7062753438949585, + "num_tokens": 274120218.0, + "step": 10601 + }, + { + "epoch": 1.1642872831100373, + "grad_norm": 2.0426688194274902, + "learning_rate": 5e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7441262006759644, + "num_tokens": 274142430.0, + "step": 10602 + }, + { + "epoch": 1.164397100812651, + "grad_norm": 1.9851396083831787, + "learning_rate": 5e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7248865365982056, + "num_tokens": 274167637.0, + "step": 10603 + }, + { + "epoch": 1.1645069185152646, + "grad_norm": 1.7377632856369019, + "learning_rate": 5e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7033340930938721, + "num_tokens": 274198069.0, + "step": 10604 + }, + { + "epoch": 1.1646167362178783, + "grad_norm": 1.7513182163238525, + "learning_rate": 5e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7348959445953369, + "num_tokens": 274228840.0, + "step": 10605 + }, + { + "epoch": 1.164726553920492, + "grad_norm": 1.740086317062378, + "learning_rate": 5e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.748742401599884, + "num_tokens": 274257841.0, + "step": 10606 + }, + { + "epoch": 1.1648363716231056, + "grad_norm": 1.7639561891555786, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7527008652687073, + "num_tokens": 274283573.0, + "step": 10607 + }, + { + "epoch": 1.1649461893257194, + "grad_norm": 1.978610634803772, + "learning_rate": 5e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7240497469902039, + "num_tokens": 274309001.0, + "step": 10608 + }, + { + "epoch": 1.165056007028333, + "grad_norm": 2.0626933574676514, + "learning_rate": 5e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7541956901550293, + "num_tokens": 274330574.0, + "step": 10609 + }, + { + "epoch": 1.1651658247309467, + "grad_norm": 1.735432505607605, + "learning_rate": 5e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7560186386108398, + "num_tokens": 274359075.0, + "step": 10610 + }, + { + "epoch": 1.1652756424335604, + "grad_norm": 1.7039835453033447, + "learning_rate": 5e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7174086570739746, + "num_tokens": 274390065.0, + "step": 10611 + }, + { + "epoch": 1.165385460136174, + "grad_norm": 1.8531817197799683, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7281880974769592, + "num_tokens": 274416837.0, + "step": 10612 + }, + { + "epoch": 1.1654952778387877, + "grad_norm": 1.8514858484268188, + "learning_rate": 5e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7507219314575195, + "num_tokens": 274440420.0, + "step": 10613 + }, + { + "epoch": 1.1656050955414012, + "grad_norm": 1.9512975215911865, + "learning_rate": 5e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7156111598014832, + "num_tokens": 274466446.0, + "step": 10614 + }, + { + "epoch": 1.165714913244015, + "grad_norm": 1.8585524559020996, + "learning_rate": 5e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7165307998657227, + "num_tokens": 274493566.0, + "step": 10615 + }, + { + "epoch": 1.1658247309466285, + "grad_norm": 2.0382063388824463, + "learning_rate": 5e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7397465705871582, + "num_tokens": 274517064.0, + "step": 10616 + }, + { + "epoch": 1.1659345486492423, + "grad_norm": 1.899644136428833, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7230477333068848, + "num_tokens": 274542364.0, + "step": 10617 + }, + { + "epoch": 1.1660443663518558, + "grad_norm": 1.9416357278823853, + "learning_rate": 5e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7077193856239319, + "num_tokens": 274567731.0, + "step": 10618 + }, + { + "epoch": 1.1661541840544696, + "grad_norm": 1.8474526405334473, + "learning_rate": 5e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7515511512756348, + "num_tokens": 274593459.0, + "step": 10619 + }, + { + "epoch": 1.1662640017570833, + "grad_norm": 1.9961116313934326, + "learning_rate": 5e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.721912145614624, + "num_tokens": 274619547.0, + "step": 10620 + }, + { + "epoch": 1.1663738194596969, + "grad_norm": 1.7008256912231445, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7156109809875488, + "num_tokens": 274651879.0, + "step": 10621 + }, + { + "epoch": 1.1664836371623106, + "grad_norm": 1.923423409461975, + "learning_rate": 5e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7539176344871521, + "num_tokens": 274675864.0, + "step": 10622 + }, + { + "epoch": 1.1665934548649242, + "grad_norm": 1.9724215269088745, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7256979942321777, + "num_tokens": 274698897.0, + "step": 10623 + }, + { + "epoch": 1.166703272567538, + "grad_norm": 1.6429013013839722, + "learning_rate": 5e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7309561371803284, + "num_tokens": 274730552.0, + "step": 10624 + }, + { + "epoch": 1.1668130902701515, + "grad_norm": 1.8463616371154785, + "learning_rate": 5e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7307721972465515, + "num_tokens": 274759180.0, + "step": 10625 + }, + { + "epoch": 1.1669229079727652, + "grad_norm": 1.985592007637024, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7443746328353882, + "num_tokens": 274783473.0, + "step": 10626 + }, + { + "epoch": 1.167032725675379, + "grad_norm": 1.8199716806411743, + "learning_rate": 5e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7322493195533752, + "num_tokens": 274809167.0, + "step": 10627 + }, + { + "epoch": 1.1671425433779925, + "grad_norm": 2.0779361724853516, + "learning_rate": 5e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7564600706100464, + "num_tokens": 274828556.0, + "step": 10628 + }, + { + "epoch": 1.1672523610806063, + "grad_norm": 2.0094292163848877, + "learning_rate": 5e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7384135723114014, + "num_tokens": 274851369.0, + "step": 10629 + }, + { + "epoch": 1.1673621787832198, + "grad_norm": 1.8813763856887817, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7288031578063965, + "num_tokens": 274879070.0, + "step": 10630 + }, + { + "epoch": 1.1674719964858336, + "grad_norm": 1.8124630451202393, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7351080179214478, + "num_tokens": 274907049.0, + "step": 10631 + }, + { + "epoch": 1.167581814188447, + "grad_norm": 2.0490822792053223, + "learning_rate": 5e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7345620393753052, + "num_tokens": 274930473.0, + "step": 10632 + }, + { + "epoch": 1.1676916318910608, + "grad_norm": 1.872280478477478, + "learning_rate": 5e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.758979320526123, + "num_tokens": 274953878.0, + "step": 10633 + }, + { + "epoch": 1.1678014495936746, + "grad_norm": 2.025054693222046, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7365533113479614, + "num_tokens": 274976419.0, + "step": 10634 + }, + { + "epoch": 1.1679112672962881, + "grad_norm": 1.8700963258743286, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7286176681518555, + "num_tokens": 275001325.0, + "step": 10635 + }, + { + "epoch": 1.168021084998902, + "grad_norm": 1.9237200021743774, + "learning_rate": 5e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.740427553653717, + "num_tokens": 275025488.0, + "step": 10636 + }, + { + "epoch": 1.1681309027015154, + "grad_norm": 1.9263951778411865, + "learning_rate": 5e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7283794283866882, + "num_tokens": 275051534.0, + "step": 10637 + }, + { + "epoch": 1.1682407204041292, + "grad_norm": 1.7939975261688232, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7144770622253418, + "num_tokens": 275080818.0, + "step": 10638 + }, + { + "epoch": 1.1683505381067427, + "grad_norm": 1.855334997177124, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7505134344100952, + "num_tokens": 275105068.0, + "step": 10639 + }, + { + "epoch": 1.1684603558093565, + "grad_norm": 1.838838815689087, + "learning_rate": 5e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7210416793823242, + "num_tokens": 275134053.0, + "step": 10640 + }, + { + "epoch": 1.16857017351197, + "grad_norm": 1.9197865724563599, + "learning_rate": 5e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7686509490013123, + "num_tokens": 275158138.0, + "step": 10641 + }, + { + "epoch": 1.1686799912145838, + "grad_norm": 1.8626741170883179, + "learning_rate": 5e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7135630249977112, + "num_tokens": 275186952.0, + "step": 10642 + }, + { + "epoch": 1.1687898089171975, + "grad_norm": 1.7307711839675903, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7440376877784729, + "num_tokens": 275215448.0, + "step": 10643 + }, + { + "epoch": 1.168899626619811, + "grad_norm": 1.850476861000061, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7543242573738098, + "num_tokens": 275241781.0, + "step": 10644 + }, + { + "epoch": 1.1690094443224248, + "grad_norm": 1.8163524866104126, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7249722480773926, + "num_tokens": 275266749.0, + "step": 10645 + }, + { + "epoch": 1.1691192620250384, + "grad_norm": 2.0079410076141357, + "learning_rate": 5e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7325226068496704, + "num_tokens": 275290864.0, + "step": 10646 + }, + { + "epoch": 1.169229079727652, + "grad_norm": 1.960439682006836, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7254246473312378, + "num_tokens": 275315829.0, + "step": 10647 + }, + { + "epoch": 1.1693388974302659, + "grad_norm": 2.0957579612731934, + "learning_rate": 5e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7486317157745361, + "num_tokens": 275337598.0, + "step": 10648 + }, + { + "epoch": 1.1694487151328794, + "grad_norm": 1.8614861965179443, + "learning_rate": 5e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7293044328689575, + "num_tokens": 275365625.0, + "step": 10649 + }, + { + "epoch": 1.1695585328354932, + "grad_norm": 1.9366602897644043, + "learning_rate": 5e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7475544810295105, + "num_tokens": 275389278.0, + "step": 10650 + }, + { + "epoch": 1.1696683505381067, + "grad_norm": 1.7783092260360718, + "learning_rate": 5e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7114715576171875, + "num_tokens": 275420025.0, + "step": 10651 + }, + { + "epoch": 1.1697781682407205, + "grad_norm": 1.7029926776885986, + "learning_rate": 5e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7090944051742554, + "num_tokens": 275448953.0, + "step": 10652 + }, + { + "epoch": 1.169887985943334, + "grad_norm": 1.979049801826477, + "learning_rate": 5e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7084773778915405, + "num_tokens": 275473096.0, + "step": 10653 + }, + { + "epoch": 1.1699978036459477, + "grad_norm": 1.7575737237930298, + "learning_rate": 5e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7472624778747559, + "num_tokens": 275500930.0, + "step": 10654 + }, + { + "epoch": 1.1701076213485613, + "grad_norm": 1.909661889076233, + "learning_rate": 5e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7141997814178467, + "num_tokens": 275526464.0, + "step": 10655 + }, + { + "epoch": 1.170217439051175, + "grad_norm": 1.8410594463348389, + "learning_rate": 5e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7116118669509888, + "num_tokens": 275555583.0, + "step": 10656 + }, + { + "epoch": 1.1703272567537888, + "grad_norm": 2.0261971950531006, + "learning_rate": 5e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7433117032051086, + "num_tokens": 275577196.0, + "step": 10657 + }, + { + "epoch": 1.1704370744564023, + "grad_norm": 1.6124521493911743, + "learning_rate": 5e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.737311840057373, + "num_tokens": 275608920.0, + "step": 10658 + }, + { + "epoch": 1.170546892159016, + "grad_norm": 1.9926583766937256, + "learning_rate": 5e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7437450885772705, + "num_tokens": 275632776.0, + "step": 10659 + }, + { + "epoch": 1.1706567098616296, + "grad_norm": 2.0459535121917725, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7277336120605469, + "num_tokens": 275656431.0, + "step": 10660 + }, + { + "epoch": 1.1707665275642434, + "grad_norm": 1.7474123239517212, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7260426878929138, + "num_tokens": 275684669.0, + "step": 10661 + }, + { + "epoch": 1.1708763452668571, + "grad_norm": 1.8953335285186768, + "learning_rate": 5e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7282750606536865, + "num_tokens": 275710808.0, + "step": 10662 + }, + { + "epoch": 1.1709861629694707, + "grad_norm": 1.9293396472930908, + "learning_rate": 5e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7421209812164307, + "num_tokens": 275737596.0, + "step": 10663 + }, + { + "epoch": 1.1710959806720844, + "grad_norm": 2.178757429122925, + "learning_rate": 5e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7286769151687622, + "num_tokens": 275760234.0, + "step": 10664 + }, + { + "epoch": 1.171205798374698, + "grad_norm": 1.957894206047058, + "learning_rate": 5e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7542878985404968, + "num_tokens": 275782572.0, + "step": 10665 + }, + { + "epoch": 1.1713156160773117, + "grad_norm": 1.702562689781189, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7363376021385193, + "num_tokens": 275809774.0, + "step": 10666 + }, + { + "epoch": 1.1714254337799253, + "grad_norm": 1.923916220664978, + "learning_rate": 5e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7399958968162537, + "num_tokens": 275834696.0, + "step": 10667 + }, + { + "epoch": 1.171535251482539, + "grad_norm": 2.0042662620544434, + "learning_rate": 5e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7338384389877319, + "num_tokens": 275860232.0, + "step": 10668 + }, + { + "epoch": 1.1716450691851525, + "grad_norm": 2.2202565670013428, + "learning_rate": 5e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7369463443756104, + "num_tokens": 275880611.0, + "step": 10669 + }, + { + "epoch": 1.1717548868877663, + "grad_norm": 1.9373987913131714, + "learning_rate": 5e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.726506233215332, + "num_tokens": 275908871.0, + "step": 10670 + }, + { + "epoch": 1.17186470459038, + "grad_norm": 1.9118014574050903, + "learning_rate": 5e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7343786954879761, + "num_tokens": 275933271.0, + "step": 10671 + }, + { + "epoch": 1.1719745222929936, + "grad_norm": 2.0505728721618652, + "learning_rate": 5e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7396059036254883, + "num_tokens": 275955195.0, + "step": 10672 + }, + { + "epoch": 1.1720843399956073, + "grad_norm": 1.9256523847579956, + "learning_rate": 5e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7018441557884216, + "num_tokens": 275979249.0, + "step": 10673 + }, + { + "epoch": 1.1721941576982209, + "grad_norm": 1.8617173433303833, + "learning_rate": 5e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7381848096847534, + "num_tokens": 276003765.0, + "step": 10674 + }, + { + "epoch": 1.1723039754008346, + "grad_norm": 1.8356001377105713, + "learning_rate": 5e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7327255010604858, + "num_tokens": 276032987.0, + "step": 10675 + }, + { + "epoch": 1.1724137931034484, + "grad_norm": 1.7568325996398926, + "learning_rate": 5e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.750465989112854, + "num_tokens": 276060539.0, + "step": 10676 + }, + { + "epoch": 1.172523610806062, + "grad_norm": 1.6782150268554688, + "learning_rate": 5e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7433409690856934, + "num_tokens": 276090130.0, + "step": 10677 + }, + { + "epoch": 1.1726334285086757, + "grad_norm": 1.7343990802764893, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7168447971343994, + "num_tokens": 276118191.0, + "step": 10678 + }, + { + "epoch": 1.1727432462112892, + "grad_norm": 1.914072036743164, + "learning_rate": 5e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7155485153198242, + "num_tokens": 276144584.0, + "step": 10679 + }, + { + "epoch": 1.172853063913903, + "grad_norm": 1.8730192184448242, + "learning_rate": 5e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7249918580055237, + "num_tokens": 276170783.0, + "step": 10680 + }, + { + "epoch": 1.1729628816165165, + "grad_norm": 1.8878039121627808, + "learning_rate": 5e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7508386969566345, + "num_tokens": 276195969.0, + "step": 10681 + }, + { + "epoch": 1.1730726993191303, + "grad_norm": 2.049330949783325, + "learning_rate": 5e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.7362109422683716, + "num_tokens": 276218965.0, + "step": 10682 + }, + { + "epoch": 1.1731825170217438, + "grad_norm": 1.8816713094711304, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7308014631271362, + "num_tokens": 276245754.0, + "step": 10683 + }, + { + "epoch": 1.1732923347243576, + "grad_norm": 2.2899487018585205, + "learning_rate": 5e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7243395447731018, + "num_tokens": 276273109.0, + "step": 10684 + }, + { + "epoch": 1.1734021524269713, + "grad_norm": 2.0290451049804688, + "learning_rate": 5e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7053065299987793, + "num_tokens": 276298431.0, + "step": 10685 + }, + { + "epoch": 1.1735119701295849, + "grad_norm": 1.9689364433288574, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7447739243507385, + "num_tokens": 276322387.0, + "step": 10686 + }, + { + "epoch": 1.1736217878321986, + "grad_norm": 2.2289180755615234, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7496100664138794, + "num_tokens": 276341076.0, + "step": 10687 + }, + { + "epoch": 1.1737316055348122, + "grad_norm": 2.035083055496216, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7370812296867371, + "num_tokens": 276363551.0, + "step": 10688 + }, + { + "epoch": 1.173841423237426, + "grad_norm": 1.8330003023147583, + "learning_rate": 5e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7481220960617065, + "num_tokens": 276388066.0, + "step": 10689 + }, + { + "epoch": 1.1739512409400394, + "grad_norm": 1.994673252105713, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7287024855613708, + "num_tokens": 276413117.0, + "step": 10690 + }, + { + "epoch": 1.1740610586426532, + "grad_norm": 1.9293841123580933, + "learning_rate": 5e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7202211618423462, + "num_tokens": 276438918.0, + "step": 10691 + }, + { + "epoch": 1.1741708763452667, + "grad_norm": 1.8869237899780273, + "learning_rate": 5e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7085404992103577, + "num_tokens": 276464826.0, + "step": 10692 + }, + { + "epoch": 1.1742806940478805, + "grad_norm": 1.6382348537445068, + "learning_rate": 5e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7330753803253174, + "num_tokens": 276494814.0, + "step": 10693 + }, + { + "epoch": 1.1743905117504942, + "grad_norm": 1.9857017993927002, + "learning_rate": 5e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7406094074249268, + "num_tokens": 276517120.0, + "step": 10694 + }, + { + "epoch": 1.1745003294531078, + "grad_norm": 2.1325387954711914, + "learning_rate": 5e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7548563480377197, + "num_tokens": 276536971.0, + "step": 10695 + }, + { + "epoch": 1.1746101471557215, + "grad_norm": 2.0705697536468506, + "learning_rate": 5e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7391988039016724, + "num_tokens": 276560206.0, + "step": 10696 + }, + { + "epoch": 1.174719964858335, + "grad_norm": 1.9085346460342407, + "learning_rate": 5e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7252585887908936, + "num_tokens": 276585305.0, + "step": 10697 + }, + { + "epoch": 1.1748297825609488, + "grad_norm": 2.009052276611328, + "learning_rate": 5e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7500030398368835, + "num_tokens": 276609676.0, + "step": 10698 + }, + { + "epoch": 1.1749396002635626, + "grad_norm": 1.9694148302078247, + "learning_rate": 5e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7094662189483643, + "num_tokens": 276635502.0, + "step": 10699 + }, + { + "epoch": 1.1750494179661761, + "grad_norm": 2.0064775943756104, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7323685884475708, + "num_tokens": 276659520.0, + "step": 10700 + }, + { + "epoch": 1.1751592356687899, + "grad_norm": 1.8650494813919067, + "learning_rate": 5e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7239710688591003, + "num_tokens": 276686063.0, + "step": 10701 + }, + { + "epoch": 1.1752690533714034, + "grad_norm": 1.8220751285552979, + "learning_rate": 5e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7570140361785889, + "num_tokens": 276711708.0, + "step": 10702 + }, + { + "epoch": 1.1753788710740172, + "grad_norm": 1.7813631296157837, + "learning_rate": 5e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7427916526794434, + "num_tokens": 276738087.0, + "step": 10703 + }, + { + "epoch": 1.1754886887766307, + "grad_norm": 1.8890292644500732, + "learning_rate": 5e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7219062447547913, + "num_tokens": 276764004.0, + "step": 10704 + }, + { + "epoch": 1.1755985064792445, + "grad_norm": 1.918250322341919, + "learning_rate": 5e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.75035560131073, + "num_tokens": 276786577.0, + "step": 10705 + }, + { + "epoch": 1.175708324181858, + "grad_norm": 1.7958956956863403, + "learning_rate": 5e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7479116916656494, + "num_tokens": 276811687.0, + "step": 10706 + }, + { + "epoch": 1.1758181418844718, + "grad_norm": 1.7977436780929565, + "learning_rate": 5e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7288846373558044, + "num_tokens": 276840882.0, + "step": 10707 + }, + { + "epoch": 1.1759279595870855, + "grad_norm": 1.9424151182174683, + "learning_rate": 5e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7297995090484619, + "num_tokens": 276868398.0, + "step": 10708 + }, + { + "epoch": 1.176037777289699, + "grad_norm": 1.8282097578048706, + "learning_rate": 5e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7467092275619507, + "num_tokens": 276895524.0, + "step": 10709 + }, + { + "epoch": 1.1761475949923128, + "grad_norm": 1.8291678428649902, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7490002512931824, + "num_tokens": 276919809.0, + "step": 10710 + }, + { + "epoch": 1.1762574126949263, + "grad_norm": 2.1305503845214844, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7599872350692749, + "num_tokens": 276940724.0, + "step": 10711 + }, + { + "epoch": 1.17636723039754, + "grad_norm": 1.9172371625900269, + "learning_rate": 5e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7431274652481079, + "num_tokens": 276963657.0, + "step": 10712 + }, + { + "epoch": 1.1764770481001539, + "grad_norm": 2.0784285068511963, + "learning_rate": 5e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7336759567260742, + "num_tokens": 276988122.0, + "step": 10713 + }, + { + "epoch": 1.1765868658027674, + "grad_norm": 1.785135269165039, + "learning_rate": 5e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7153069972991943, + "num_tokens": 277018749.0, + "step": 10714 + }, + { + "epoch": 1.1766966835053811, + "grad_norm": 1.8963605165481567, + "learning_rate": 5e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7323964834213257, + "num_tokens": 277044290.0, + "step": 10715 + }, + { + "epoch": 1.1768065012079947, + "grad_norm": 1.8734716176986694, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7421841621398926, + "num_tokens": 277070189.0, + "step": 10716 + }, + { + "epoch": 1.1769163189106084, + "grad_norm": 1.851086139678955, + "learning_rate": 5e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7410173416137695, + "num_tokens": 277095709.0, + "step": 10717 + }, + { + "epoch": 1.177026136613222, + "grad_norm": 2.017012119293213, + "learning_rate": 5e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7069189548492432, + "num_tokens": 277120848.0, + "step": 10718 + }, + { + "epoch": 1.1771359543158357, + "grad_norm": 1.6879019737243652, + "learning_rate": 5e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7033519744873047, + "num_tokens": 277154862.0, + "step": 10719 + }, + { + "epoch": 1.1772457720184493, + "grad_norm": 1.8291313648223877, + "learning_rate": 5e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7331346869468689, + "num_tokens": 277182415.0, + "step": 10720 + }, + { + "epoch": 1.177355589721063, + "grad_norm": 2.0712952613830566, + "learning_rate": 5e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.7601956725120544, + "num_tokens": 277205591.0, + "step": 10721 + }, + { + "epoch": 1.1774654074236768, + "grad_norm": 1.7698882818222046, + "learning_rate": 5e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7308239936828613, + "num_tokens": 277232272.0, + "step": 10722 + }, + { + "epoch": 1.1775752251262903, + "grad_norm": 1.8563131093978882, + "learning_rate": 5e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7219369411468506, + "num_tokens": 277259433.0, + "step": 10723 + }, + { + "epoch": 1.177685042828904, + "grad_norm": 1.8851779699325562, + "learning_rate": 5e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.740935206413269, + "num_tokens": 277285840.0, + "step": 10724 + }, + { + "epoch": 1.1777948605315176, + "grad_norm": 1.9311683177947998, + "learning_rate": 5e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7281012535095215, + "num_tokens": 277310548.0, + "step": 10725 + }, + { + "epoch": 1.1779046782341314, + "grad_norm": 1.7959052324295044, + "learning_rate": 5e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7273092269897461, + "num_tokens": 277338268.0, + "step": 10726 + }, + { + "epoch": 1.1780144959367451, + "grad_norm": 2.050387382507324, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7316139340400696, + "num_tokens": 277360226.0, + "step": 10727 + }, + { + "epoch": 1.1781243136393587, + "grad_norm": 1.7726867198944092, + "learning_rate": 5e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.718366801738739, + "num_tokens": 277388993.0, + "step": 10728 + }, + { + "epoch": 1.1782341313419724, + "grad_norm": 1.882380485534668, + "learning_rate": 5e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7365455031394958, + "num_tokens": 277414350.0, + "step": 10729 + }, + { + "epoch": 1.178343949044586, + "grad_norm": 2.1769492626190186, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7251381278038025, + "num_tokens": 277436114.0, + "step": 10730 + }, + { + "epoch": 1.1784537667471997, + "grad_norm": 1.920684814453125, + "learning_rate": 5e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7263898253440857, + "num_tokens": 277461972.0, + "step": 10731 + }, + { + "epoch": 1.1785635844498132, + "grad_norm": 1.7195706367492676, + "learning_rate": 5e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7620097398757935, + "num_tokens": 277489246.0, + "step": 10732 + }, + { + "epoch": 1.178673402152427, + "grad_norm": 1.976577877998352, + "learning_rate": 5e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7118139863014221, + "num_tokens": 277514351.0, + "step": 10733 + }, + { + "epoch": 1.1787832198550405, + "grad_norm": 1.7454814910888672, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.715432345867157, + "num_tokens": 277546212.0, + "step": 10734 + }, + { + "epoch": 1.1788930375576543, + "grad_norm": 1.7875279188156128, + "learning_rate": 5e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7433212995529175, + "num_tokens": 277576255.0, + "step": 10735 + }, + { + "epoch": 1.179002855260268, + "grad_norm": 2.0279901027679443, + "learning_rate": 5e-06, + "loss": 0.832, + "mean_token_accuracy": 0.735561728477478, + "num_tokens": 277598523.0, + "step": 10736 + }, + { + "epoch": 1.1791126729628816, + "grad_norm": 1.9482907056808472, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7263429164886475, + "num_tokens": 277625068.0, + "step": 10737 + }, + { + "epoch": 1.1792224906654953, + "grad_norm": 1.831398606300354, + "learning_rate": 5e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7277688980102539, + "num_tokens": 277650789.0, + "step": 10738 + }, + { + "epoch": 1.1793323083681089, + "grad_norm": 2.208508014678955, + "learning_rate": 5e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7478874325752258, + "num_tokens": 277670997.0, + "step": 10739 + }, + { + "epoch": 1.1794421260707226, + "grad_norm": 1.747652292251587, + "learning_rate": 5e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7085641026496887, + "num_tokens": 277700469.0, + "step": 10740 + }, + { + "epoch": 1.1795519437733364, + "grad_norm": 2.254988670349121, + "learning_rate": 5e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7565789222717285, + "num_tokens": 277720591.0, + "step": 10741 + }, + { + "epoch": 1.17966176147595, + "grad_norm": 1.8222925662994385, + "learning_rate": 5e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.736682653427124, + "num_tokens": 277746250.0, + "step": 10742 + }, + { + "epoch": 1.1797715791785637, + "grad_norm": 2.083700180053711, + "learning_rate": 5e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7468277215957642, + "num_tokens": 277767739.0, + "step": 10743 + }, + { + "epoch": 1.1798813968811772, + "grad_norm": 1.8632701635360718, + "learning_rate": 5e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.733002781867981, + "num_tokens": 277795234.0, + "step": 10744 + }, + { + "epoch": 1.179991214583791, + "grad_norm": 1.8853951692581177, + "learning_rate": 5e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7409011125564575, + "num_tokens": 277819419.0, + "step": 10745 + }, + { + "epoch": 1.1801010322864045, + "grad_norm": 1.7115243673324585, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7447977066040039, + "num_tokens": 277846633.0, + "step": 10746 + }, + { + "epoch": 1.1802108499890183, + "grad_norm": 1.841388463973999, + "learning_rate": 5e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7451951503753662, + "num_tokens": 277872362.0, + "step": 10747 + }, + { + "epoch": 1.1803206676916318, + "grad_norm": 1.9161436557769775, + "learning_rate": 5e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7294502854347229, + "num_tokens": 277899214.0, + "step": 10748 + }, + { + "epoch": 1.1804304853942456, + "grad_norm": 1.934116244316101, + "learning_rate": 5e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7219198942184448, + "num_tokens": 277926346.0, + "step": 10749 + }, + { + "epoch": 1.1805403030968593, + "grad_norm": 2.1951537132263184, + "learning_rate": 5e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7398126125335693, + "num_tokens": 277947249.0, + "step": 10750 + }, + { + "epoch": 1.1806501207994728, + "grad_norm": 1.8637187480926514, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7627928256988525, + "num_tokens": 277970381.0, + "step": 10751 + }, + { + "epoch": 1.1807599385020866, + "grad_norm": 1.7204118967056274, + "learning_rate": 5e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7248528003692627, + "num_tokens": 278004730.0, + "step": 10752 + }, + { + "epoch": 1.1808697562047001, + "grad_norm": 1.8379669189453125, + "learning_rate": 5e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7610492706298828, + "num_tokens": 278029519.0, + "step": 10753 + }, + { + "epoch": 1.180979573907314, + "grad_norm": 1.9125999212265015, + "learning_rate": 5e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7437307834625244, + "num_tokens": 278053371.0, + "step": 10754 + }, + { + "epoch": 1.1810893916099274, + "grad_norm": 1.8684444427490234, + "learning_rate": 5e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7395724058151245, + "num_tokens": 278081428.0, + "step": 10755 + }, + { + "epoch": 1.1811992093125412, + "grad_norm": 2.1352486610412598, + "learning_rate": 5e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7300475239753723, + "num_tokens": 278101535.0, + "step": 10756 + }, + { + "epoch": 1.1813090270151547, + "grad_norm": 1.8970279693603516, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7317962646484375, + "num_tokens": 278125992.0, + "step": 10757 + }, + { + "epoch": 1.1814188447177685, + "grad_norm": 1.89361572265625, + "learning_rate": 5e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7410805821418762, + "num_tokens": 278151319.0, + "step": 10758 + }, + { + "epoch": 1.1815286624203822, + "grad_norm": 2.011068820953369, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7446601390838623, + "num_tokens": 278173414.0, + "step": 10759 + }, + { + "epoch": 1.1816384801229958, + "grad_norm": 2.1521236896514893, + "learning_rate": 5e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7357816696166992, + "num_tokens": 278196101.0, + "step": 10760 + }, + { + "epoch": 1.1817482978256095, + "grad_norm": 1.949453592300415, + "learning_rate": 5e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7298677563667297, + "num_tokens": 278221905.0, + "step": 10761 + }, + { + "epoch": 1.181858115528223, + "grad_norm": 1.872595191001892, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7317343950271606, + "num_tokens": 278246105.0, + "step": 10762 + }, + { + "epoch": 1.1819679332308368, + "grad_norm": 1.8002992868423462, + "learning_rate": 5e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7261359691619873, + "num_tokens": 278274721.0, + "step": 10763 + }, + { + "epoch": 1.1820777509334506, + "grad_norm": 1.7987005710601807, + "learning_rate": 5e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7289539575576782, + "num_tokens": 278302594.0, + "step": 10764 + }, + { + "epoch": 1.1821875686360641, + "grad_norm": 1.8254011869430542, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7193877100944519, + "num_tokens": 278330020.0, + "step": 10765 + }, + { + "epoch": 1.1822973863386779, + "grad_norm": 1.9357765913009644, + "learning_rate": 5e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7430037260055542, + "num_tokens": 278354703.0, + "step": 10766 + }, + { + "epoch": 1.1824072040412914, + "grad_norm": 1.9863522052764893, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7299737930297852, + "num_tokens": 278381301.0, + "step": 10767 + }, + { + "epoch": 1.1825170217439052, + "grad_norm": 1.803858757019043, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7260040640830994, + "num_tokens": 278409879.0, + "step": 10768 + }, + { + "epoch": 1.1826268394465187, + "grad_norm": 1.9482879638671875, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7329671382904053, + "num_tokens": 278433606.0, + "step": 10769 + }, + { + "epoch": 1.1827366571491325, + "grad_norm": 2.062307119369507, + "learning_rate": 5e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7707494497299194, + "num_tokens": 278451866.0, + "step": 10770 + }, + { + "epoch": 1.182846474851746, + "grad_norm": 1.8073246479034424, + "learning_rate": 5e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.738715648651123, + "num_tokens": 278479860.0, + "step": 10771 + }, + { + "epoch": 1.1829562925543597, + "grad_norm": 1.8569146394729614, + "learning_rate": 5e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7184673547744751, + "num_tokens": 278509210.0, + "step": 10772 + }, + { + "epoch": 1.1830661102569735, + "grad_norm": 1.8315390348434448, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7210741639137268, + "num_tokens": 278538170.0, + "step": 10773 + }, + { + "epoch": 1.183175927959587, + "grad_norm": 1.8206937313079834, + "learning_rate": 5e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7505936622619629, + "num_tokens": 278565622.0, + "step": 10774 + }, + { + "epoch": 1.1832857456622008, + "grad_norm": 1.7941381931304932, + "learning_rate": 5e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7074034214019775, + "num_tokens": 278596360.0, + "step": 10775 + }, + { + "epoch": 1.1833955633648143, + "grad_norm": 1.7595653533935547, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7103060483932495, + "num_tokens": 278627514.0, + "step": 10776 + }, + { + "epoch": 1.183505381067428, + "grad_norm": 1.673628568649292, + "learning_rate": 5e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7252933979034424, + "num_tokens": 278657788.0, + "step": 10777 + }, + { + "epoch": 1.1836151987700418, + "grad_norm": 1.9372365474700928, + "learning_rate": 5e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.740705668926239, + "num_tokens": 278680489.0, + "step": 10778 + }, + { + "epoch": 1.1837250164726554, + "grad_norm": 1.6318649053573608, + "learning_rate": 5e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7219996452331543, + "num_tokens": 278711392.0, + "step": 10779 + }, + { + "epoch": 1.1838348341752691, + "grad_norm": 1.7669183015823364, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7300045490264893, + "num_tokens": 278738490.0, + "step": 10780 + }, + { + "epoch": 1.1839446518778827, + "grad_norm": 2.1150963306427, + "learning_rate": 5e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7356202006340027, + "num_tokens": 278758692.0, + "step": 10781 + }, + { + "epoch": 1.1840544695804964, + "grad_norm": 1.812078833580017, + "learning_rate": 5e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7393797636032104, + "num_tokens": 278786060.0, + "step": 10782 + }, + { + "epoch": 1.18416428728311, + "grad_norm": 1.9299553632736206, + "learning_rate": 5e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7135113477706909, + "num_tokens": 278812927.0, + "step": 10783 + }, + { + "epoch": 1.1842741049857237, + "grad_norm": 1.967795729637146, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7317672967910767, + "num_tokens": 278834742.0, + "step": 10784 + }, + { + "epoch": 1.1843839226883373, + "grad_norm": 1.9212932586669922, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7198410630226135, + "num_tokens": 278859979.0, + "step": 10785 + }, + { + "epoch": 1.184493740390951, + "grad_norm": 1.8779494762420654, + "learning_rate": 5e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7160687446594238, + "num_tokens": 278885503.0, + "step": 10786 + }, + { + "epoch": 1.1846035580935648, + "grad_norm": 1.7930477857589722, + "learning_rate": 5e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7061269283294678, + "num_tokens": 278913773.0, + "step": 10787 + }, + { + "epoch": 1.1847133757961783, + "grad_norm": 1.9613745212554932, + "learning_rate": 5e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7227838039398193, + "num_tokens": 278937484.0, + "step": 10788 + }, + { + "epoch": 1.184823193498792, + "grad_norm": 1.8125364780426025, + "learning_rate": 5e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7139643430709839, + "num_tokens": 278964658.0, + "step": 10789 + }, + { + "epoch": 1.1849330112014056, + "grad_norm": 1.9159785509109497, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7541753649711609, + "num_tokens": 278986604.0, + "step": 10790 + }, + { + "epoch": 1.1850428289040194, + "grad_norm": 1.9605909585952759, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7342362403869629, + "num_tokens": 279009034.0, + "step": 10791 + }, + { + "epoch": 1.185152646606633, + "grad_norm": 1.9809945821762085, + "learning_rate": 5e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7562419176101685, + "num_tokens": 279030004.0, + "step": 10792 + }, + { + "epoch": 1.1852624643092466, + "grad_norm": 1.6853011846542358, + "learning_rate": 5e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7423487901687622, + "num_tokens": 279060236.0, + "step": 10793 + }, + { + "epoch": 1.1853722820118604, + "grad_norm": 1.8720901012420654, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7315594553947449, + "num_tokens": 279084648.0, + "step": 10794 + }, + { + "epoch": 1.185482099714474, + "grad_norm": 2.1504015922546387, + "learning_rate": 5e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7626712322235107, + "num_tokens": 279104700.0, + "step": 10795 + }, + { + "epoch": 1.1855919174170877, + "grad_norm": 1.7838207483291626, + "learning_rate": 5e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7423611879348755, + "num_tokens": 279133197.0, + "step": 10796 + }, + { + "epoch": 1.1857017351197012, + "grad_norm": 1.6213157176971436, + "learning_rate": 5e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7302750945091248, + "num_tokens": 279164523.0, + "step": 10797 + }, + { + "epoch": 1.185811552822315, + "grad_norm": 2.015256881713867, + "learning_rate": 5e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7327532172203064, + "num_tokens": 279189351.0, + "step": 10798 + }, + { + "epoch": 1.1859213705249285, + "grad_norm": 1.7036099433898926, + "learning_rate": 5e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7412135601043701, + "num_tokens": 279217977.0, + "step": 10799 + }, + { + "epoch": 1.1860311882275423, + "grad_norm": 2.0241098403930664, + "learning_rate": 5e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7638323903083801, + "num_tokens": 279242637.0, + "step": 10800 + }, + { + "epoch": 1.186141005930156, + "grad_norm": 1.9302446842193604, + "learning_rate": 5e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7211201786994934, + "num_tokens": 279270211.0, + "step": 10801 + }, + { + "epoch": 1.1862508236327696, + "grad_norm": 1.732767105102539, + "learning_rate": 5e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7253666520118713, + "num_tokens": 279300836.0, + "step": 10802 + }, + { + "epoch": 1.1863606413353833, + "grad_norm": 1.9151039123535156, + "learning_rate": 5e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7300184965133667, + "num_tokens": 279326615.0, + "step": 10803 + }, + { + "epoch": 1.1864704590379969, + "grad_norm": 1.9001046419143677, + "learning_rate": 5e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.732395350933075, + "num_tokens": 279352496.0, + "step": 10804 + }, + { + "epoch": 1.1865802767406106, + "grad_norm": 1.8307815790176392, + "learning_rate": 5e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.725989580154419, + "num_tokens": 279380451.0, + "step": 10805 + }, + { + "epoch": 1.1866900944432242, + "grad_norm": 2.09386944770813, + "learning_rate": 5e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7396687865257263, + "num_tokens": 279402023.0, + "step": 10806 + }, + { + "epoch": 1.186799912145838, + "grad_norm": 2.0943658351898193, + "learning_rate": 5e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7429898977279663, + "num_tokens": 279424807.0, + "step": 10807 + }, + { + "epoch": 1.1869097298484517, + "grad_norm": 1.809836506843567, + "learning_rate": 5e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7557387351989746, + "num_tokens": 279449078.0, + "step": 10808 + }, + { + "epoch": 1.1870195475510652, + "grad_norm": 1.9636484384536743, + "learning_rate": 5e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7396880388259888, + "num_tokens": 279471528.0, + "step": 10809 + }, + { + "epoch": 1.187129365253679, + "grad_norm": 2.059223175048828, + "learning_rate": 5e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7299385070800781, + "num_tokens": 279495498.0, + "step": 10810 + }, + { + "epoch": 1.1872391829562925, + "grad_norm": 1.9822564125061035, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7368236780166626, + "num_tokens": 279518425.0, + "step": 10811 + }, + { + "epoch": 1.1873490006589063, + "grad_norm": 1.7318841218948364, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.741136372089386, + "num_tokens": 279548168.0, + "step": 10812 + }, + { + "epoch": 1.1874588183615198, + "grad_norm": 1.8789458274841309, + "learning_rate": 5e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7386645078659058, + "num_tokens": 279574484.0, + "step": 10813 + }, + { + "epoch": 1.1875686360641335, + "grad_norm": 2.2561421394348145, + "learning_rate": 5e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7507284283638, + "num_tokens": 279594418.0, + "step": 10814 + }, + { + "epoch": 1.1876784537667473, + "grad_norm": 1.736094355583191, + "learning_rate": 5e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7281894087791443, + "num_tokens": 279626661.0, + "step": 10815 + }, + { + "epoch": 1.1877882714693608, + "grad_norm": 2.0550153255462646, + "learning_rate": 5e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7534313201904297, + "num_tokens": 279646812.0, + "step": 10816 + }, + { + "epoch": 1.1878980891719746, + "grad_norm": 1.8478195667266846, + "learning_rate": 5e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7285415530204773, + "num_tokens": 279674330.0, + "step": 10817 + }, + { + "epoch": 1.1880079068745881, + "grad_norm": 1.9575552940368652, + "learning_rate": 5e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7327451705932617, + "num_tokens": 279697608.0, + "step": 10818 + }, + { + "epoch": 1.1881177245772019, + "grad_norm": 1.7871296405792236, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7461297512054443, + "num_tokens": 279725090.0, + "step": 10819 + }, + { + "epoch": 1.1882275422798154, + "grad_norm": 1.7819231748580933, + "learning_rate": 5e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7155396938323975, + "num_tokens": 279755958.0, + "step": 10820 + }, + { + "epoch": 1.1883373599824292, + "grad_norm": 1.8627814054489136, + "learning_rate": 5e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7167384028434753, + "num_tokens": 279784286.0, + "step": 10821 + }, + { + "epoch": 1.1884471776850427, + "grad_norm": 1.6245747804641724, + "learning_rate": 5e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7287955284118652, + "num_tokens": 279819317.0, + "step": 10822 + }, + { + "epoch": 1.1885569953876565, + "grad_norm": 2.031583547592163, + "learning_rate": 5e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7269923090934753, + "num_tokens": 279841256.0, + "step": 10823 + }, + { + "epoch": 1.1886668130902702, + "grad_norm": 1.7200900316238403, + "learning_rate": 5e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7284231781959534, + "num_tokens": 279871187.0, + "step": 10824 + }, + { + "epoch": 1.1887766307928838, + "grad_norm": 1.7694038152694702, + "learning_rate": 5e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7120351791381836, + "num_tokens": 279899942.0, + "step": 10825 + }, + { + "epoch": 1.1888864484954975, + "grad_norm": 1.8690626621246338, + "learning_rate": 5e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7168325781822205, + "num_tokens": 279926386.0, + "step": 10826 + }, + { + "epoch": 1.188996266198111, + "grad_norm": 1.7809717655181885, + "learning_rate": 5e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7342256903648376, + "num_tokens": 279954975.0, + "step": 10827 + }, + { + "epoch": 1.1891060839007248, + "grad_norm": 1.9190760850906372, + "learning_rate": 5e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7365726232528687, + "num_tokens": 279980436.0, + "step": 10828 + }, + { + "epoch": 1.1892159016033386, + "grad_norm": 2.0381228923797607, + "learning_rate": 5e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.743416428565979, + "num_tokens": 280003502.0, + "step": 10829 + }, + { + "epoch": 1.189325719305952, + "grad_norm": 1.6809823513031006, + "learning_rate": 5e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7393487691879272, + "num_tokens": 280034073.0, + "step": 10830 + }, + { + "epoch": 1.1894355370085659, + "grad_norm": 1.8287888765335083, + "learning_rate": 5e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7320172786712646, + "num_tokens": 280061107.0, + "step": 10831 + }, + { + "epoch": 1.1895453547111794, + "grad_norm": 1.7119972705841064, + "learning_rate": 5e-06, + "loss": 0.7905, + "mean_token_accuracy": 0.747035562992096, + "num_tokens": 280091806.0, + "step": 10832 + }, + { + "epoch": 1.1896551724137931, + "grad_norm": 1.7680360078811646, + "learning_rate": 5e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7345401048660278, + "num_tokens": 280120458.0, + "step": 10833 + }, + { + "epoch": 1.1897649901164067, + "grad_norm": 2.0901362895965576, + "learning_rate": 5e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7508975267410278, + "num_tokens": 280141282.0, + "step": 10834 + }, + { + "epoch": 1.1898748078190204, + "grad_norm": 1.8679324388504028, + "learning_rate": 5e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7209286689758301, + "num_tokens": 280168745.0, + "step": 10835 + }, + { + "epoch": 1.189984625521634, + "grad_norm": 1.7919639348983765, + "learning_rate": 5e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7171198129653931, + "num_tokens": 280197820.0, + "step": 10836 + }, + { + "epoch": 1.1900944432242477, + "grad_norm": 1.913253664970398, + "learning_rate": 5e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7360422015190125, + "num_tokens": 280221432.0, + "step": 10837 + }, + { + "epoch": 1.1902042609268615, + "grad_norm": 1.792786717414856, + "learning_rate": 5e-06, + "loss": 0.794, + "mean_token_accuracy": 0.7444767951965332, + "num_tokens": 280248330.0, + "step": 10838 + }, + { + "epoch": 1.190314078629475, + "grad_norm": 1.8104437589645386, + "learning_rate": 5e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7491288185119629, + "num_tokens": 280273114.0, + "step": 10839 + }, + { + "epoch": 1.1904238963320888, + "grad_norm": 1.9136632680892944, + "learning_rate": 5e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7574259638786316, + "num_tokens": 280294356.0, + "step": 10840 + }, + { + "epoch": 1.1905337140347023, + "grad_norm": 1.9915798902511597, + "learning_rate": 5e-06, + "loss": 0.848, + "mean_token_accuracy": 0.732758641242981, + "num_tokens": 280319219.0, + "step": 10841 + }, + { + "epoch": 1.190643531737316, + "grad_norm": 1.924384355545044, + "learning_rate": 5e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7412746548652649, + "num_tokens": 280342622.0, + "step": 10842 + }, + { + "epoch": 1.1907533494399298, + "grad_norm": 1.738205909729004, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7369034886360168, + "num_tokens": 280373004.0, + "step": 10843 + }, + { + "epoch": 1.1908631671425434, + "grad_norm": 1.8346350193023682, + "learning_rate": 5e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7377537488937378, + "num_tokens": 280401683.0, + "step": 10844 + }, + { + "epoch": 1.1909729848451571, + "grad_norm": 2.089759588241577, + "learning_rate": 5e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7408417463302612, + "num_tokens": 280422663.0, + "step": 10845 + }, + { + "epoch": 1.1910828025477707, + "grad_norm": 2.0399842262268066, + "learning_rate": 5e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7570346593856812, + "num_tokens": 280445532.0, + "step": 10846 + }, + { + "epoch": 1.1911926202503844, + "grad_norm": 1.9579662084579468, + "learning_rate": 5e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.754584550857544, + "num_tokens": 280467855.0, + "step": 10847 + }, + { + "epoch": 1.191302437952998, + "grad_norm": 1.8418861627578735, + "learning_rate": 5e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7537712454795837, + "num_tokens": 280493691.0, + "step": 10848 + }, + { + "epoch": 1.1914122556556117, + "grad_norm": 1.9380916357040405, + "learning_rate": 5e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7377673387527466, + "num_tokens": 280517121.0, + "step": 10849 + }, + { + "epoch": 1.1915220733582252, + "grad_norm": 1.7680492401123047, + "learning_rate": 5e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7271478176116943, + "num_tokens": 280546054.0, + "step": 10850 + }, + { + "epoch": 1.191631891060839, + "grad_norm": 2.0656206607818604, + "learning_rate": 5e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.754440426826477, + "num_tokens": 280566501.0, + "step": 10851 + }, + { + "epoch": 1.1917417087634528, + "grad_norm": 1.816202163696289, + "learning_rate": 5e-06, + "loss": 0.904, + "mean_token_accuracy": 0.71131432056427, + "num_tokens": 280594476.0, + "step": 10852 + }, + { + "epoch": 1.1918515264660663, + "grad_norm": 1.8592236042022705, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7290560007095337, + "num_tokens": 280619493.0, + "step": 10853 + }, + { + "epoch": 1.19196134416868, + "grad_norm": 1.80643892288208, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7261530160903931, + "num_tokens": 280651446.0, + "step": 10854 + }, + { + "epoch": 1.1920711618712936, + "grad_norm": 1.9543498754501343, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7214287519454956, + "num_tokens": 280675958.0, + "step": 10855 + }, + { + "epoch": 1.1921809795739073, + "grad_norm": 1.915545105934143, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7257974147796631, + "num_tokens": 280699664.0, + "step": 10856 + }, + { + "epoch": 1.192290797276521, + "grad_norm": 1.7251832485198975, + "learning_rate": 5e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7190086245536804, + "num_tokens": 280730189.0, + "step": 10857 + }, + { + "epoch": 1.1924006149791346, + "grad_norm": 1.64875066280365, + "learning_rate": 5e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7304491400718689, + "num_tokens": 280758731.0, + "step": 10858 + }, + { + "epoch": 1.1925104326817484, + "grad_norm": 1.9363716840744019, + "learning_rate": 5e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7120661735534668, + "num_tokens": 280786605.0, + "step": 10859 + }, + { + "epoch": 1.192620250384362, + "grad_norm": 1.8597251176834106, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7248474955558777, + "num_tokens": 280811881.0, + "step": 10860 + }, + { + "epoch": 1.1927300680869757, + "grad_norm": 1.8247630596160889, + "learning_rate": 5e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7237664461135864, + "num_tokens": 280841129.0, + "step": 10861 + }, + { + "epoch": 1.1928398857895892, + "grad_norm": 1.8094522953033447, + "learning_rate": 5e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7167354822158813, + "num_tokens": 280870418.0, + "step": 10862 + }, + { + "epoch": 1.192949703492203, + "grad_norm": 1.8540480136871338, + "learning_rate": 5e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7172239422798157, + "num_tokens": 280895206.0, + "step": 10863 + }, + { + "epoch": 1.1930595211948165, + "grad_norm": 1.544836401939392, + "learning_rate": 5e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7304149866104126, + "num_tokens": 280930967.0, + "step": 10864 + }, + { + "epoch": 1.1931693388974303, + "grad_norm": 1.7689610719680786, + "learning_rate": 5e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7142437696456909, + "num_tokens": 280960023.0, + "step": 10865 + }, + { + "epoch": 1.193279156600044, + "grad_norm": 1.701166033744812, + "learning_rate": 5e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7033569812774658, + "num_tokens": 280990194.0, + "step": 10866 + }, + { + "epoch": 1.1933889743026576, + "grad_norm": 1.9193917512893677, + "learning_rate": 5e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7377184629440308, + "num_tokens": 281016850.0, + "step": 10867 + }, + { + "epoch": 1.1934987920052713, + "grad_norm": 1.745957612991333, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7100277543067932, + "num_tokens": 281046598.0, + "step": 10868 + }, + { + "epoch": 1.1936086097078848, + "grad_norm": 1.6989046335220337, + "learning_rate": 5e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7315539121627808, + "num_tokens": 281078623.0, + "step": 10869 + }, + { + "epoch": 1.1937184274104986, + "grad_norm": 2.1450798511505127, + "learning_rate": 5e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.71009361743927, + "num_tokens": 281102588.0, + "step": 10870 + }, + { + "epoch": 1.1938282451131121, + "grad_norm": 1.7055563926696777, + "learning_rate": 5e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7212318181991577, + "num_tokens": 281133143.0, + "step": 10871 + }, + { + "epoch": 1.193938062815726, + "grad_norm": 1.5113728046417236, + "learning_rate": 5e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7379264831542969, + "num_tokens": 281165395.0, + "step": 10872 + }, + { + "epoch": 1.1940478805183394, + "grad_norm": 1.6817272901535034, + "learning_rate": 5e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7200374603271484, + "num_tokens": 281195478.0, + "step": 10873 + }, + { + "epoch": 1.1941576982209532, + "grad_norm": 1.9611181020736694, + "learning_rate": 5e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7199325561523438, + "num_tokens": 281219685.0, + "step": 10874 + }, + { + "epoch": 1.194267515923567, + "grad_norm": 1.692325234413147, + "learning_rate": 5e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7227832674980164, + "num_tokens": 281249265.0, + "step": 10875 + }, + { + "epoch": 1.1943773336261805, + "grad_norm": 1.6596481800079346, + "learning_rate": 5e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7280098795890808, + "num_tokens": 281279784.0, + "step": 10876 + }, + { + "epoch": 1.1944871513287942, + "grad_norm": 1.816377878189087, + "learning_rate": 5e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7385408282279968, + "num_tokens": 281306762.0, + "step": 10877 + }, + { + "epoch": 1.1945969690314078, + "grad_norm": 1.654266357421875, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7187825441360474, + "num_tokens": 281338167.0, + "step": 10878 + }, + { + "epoch": 1.1947067867340215, + "grad_norm": 1.7683862447738647, + "learning_rate": 5e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7476211190223694, + "num_tokens": 281365770.0, + "step": 10879 + }, + { + "epoch": 1.1948166044366353, + "grad_norm": 1.7665092945098877, + "learning_rate": 5e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.732384443283081, + "num_tokens": 281393897.0, + "step": 10880 + }, + { + "epoch": 1.1949264221392488, + "grad_norm": 1.9180066585540771, + "learning_rate": 5e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7494949102401733, + "num_tokens": 281416865.0, + "step": 10881 + }, + { + "epoch": 1.1950362398418626, + "grad_norm": 1.732163667678833, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7268646955490112, + "num_tokens": 281447888.0, + "step": 10882 + }, + { + "epoch": 1.1951460575444761, + "grad_norm": 1.7271736860275269, + "learning_rate": 5e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7114508152008057, + "num_tokens": 281478551.0, + "step": 10883 + }, + { + "epoch": 1.1952558752470899, + "grad_norm": 1.7482914924621582, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.733553409576416, + "num_tokens": 281505132.0, + "step": 10884 + }, + { + "epoch": 1.1953656929497034, + "grad_norm": 1.901463270187378, + "learning_rate": 5e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7434593439102173, + "num_tokens": 281529694.0, + "step": 10885 + }, + { + "epoch": 1.1954755106523172, + "grad_norm": 1.8443708419799805, + "learning_rate": 5e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7483931183815002, + "num_tokens": 281554632.0, + "step": 10886 + }, + { + "epoch": 1.1955853283549307, + "grad_norm": 2.0772438049316406, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7504995465278625, + "num_tokens": 281577477.0, + "step": 10887 + }, + { + "epoch": 1.1956951460575445, + "grad_norm": 1.546974778175354, + "learning_rate": 5e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7227745056152344, + "num_tokens": 281612403.0, + "step": 10888 + }, + { + "epoch": 1.1958049637601582, + "grad_norm": 1.903909683227539, + "learning_rate": 5e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7268756628036499, + "num_tokens": 281637664.0, + "step": 10889 + }, + { + "epoch": 1.1959147814627717, + "grad_norm": 1.9317642450332642, + "learning_rate": 5e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7498893737792969, + "num_tokens": 281663452.0, + "step": 10890 + }, + { + "epoch": 1.1960245991653855, + "grad_norm": 1.9989734888076782, + "learning_rate": 5e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7268035411834717, + "num_tokens": 281686245.0, + "step": 10891 + }, + { + "epoch": 1.196134416867999, + "grad_norm": 1.941144347190857, + "learning_rate": 5e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.728316605091095, + "num_tokens": 281713953.0, + "step": 10892 + }, + { + "epoch": 1.1962442345706128, + "grad_norm": 1.851445198059082, + "learning_rate": 5e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.6994582414627075, + "num_tokens": 281742642.0, + "step": 10893 + }, + { + "epoch": 1.1963540522732266, + "grad_norm": 1.7646456956863403, + "learning_rate": 5e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7396621704101562, + "num_tokens": 281769863.0, + "step": 10894 + }, + { + "epoch": 1.19646386997584, + "grad_norm": 1.8711737394332886, + "learning_rate": 5e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7257587313652039, + "num_tokens": 281796728.0, + "step": 10895 + }, + { + "epoch": 1.1965736876784538, + "grad_norm": 1.9951210021972656, + "learning_rate": 5e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7531366348266602, + "num_tokens": 281818161.0, + "step": 10896 + }, + { + "epoch": 1.1966835053810674, + "grad_norm": 1.8958590030670166, + "learning_rate": 5e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7157715559005737, + "num_tokens": 281846513.0, + "step": 10897 + }, + { + "epoch": 1.1967933230836811, + "grad_norm": 2.0206289291381836, + "learning_rate": 5e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7246150970458984, + "num_tokens": 281871061.0, + "step": 10898 + }, + { + "epoch": 1.1969031407862947, + "grad_norm": 2.142408847808838, + "learning_rate": 5e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7120888829231262, + "num_tokens": 281894477.0, + "step": 10899 + }, + { + "epoch": 1.1970129584889084, + "grad_norm": 2.1745147705078125, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.739721417427063, + "num_tokens": 281915763.0, + "step": 10900 + }, + { + "epoch": 1.197122776191522, + "grad_norm": 1.9704383611679077, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7403850555419922, + "num_tokens": 281939030.0, + "step": 10901 + }, + { + "epoch": 1.1972325938941357, + "grad_norm": 1.8120036125183105, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7136470079421997, + "num_tokens": 281967225.0, + "step": 10902 + }, + { + "epoch": 1.1973424115967495, + "grad_norm": 1.7602301836013794, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7270352840423584, + "num_tokens": 281995641.0, + "step": 10903 + }, + { + "epoch": 1.197452229299363, + "grad_norm": 1.9614787101745605, + "learning_rate": 5e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7254763841629028, + "num_tokens": 282020405.0, + "step": 10904 + }, + { + "epoch": 1.1975620470019768, + "grad_norm": 1.7711056470870972, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7488569021224976, + "num_tokens": 282046954.0, + "step": 10905 + }, + { + "epoch": 1.1976718647045903, + "grad_norm": 1.8110017776489258, + "learning_rate": 5e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7169361114501953, + "num_tokens": 282075141.0, + "step": 10906 + }, + { + "epoch": 1.197781682407204, + "grad_norm": 1.721272587776184, + "learning_rate": 5e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7126749753952026, + "num_tokens": 282105323.0, + "step": 10907 + }, + { + "epoch": 1.1978915001098178, + "grad_norm": 2.0782129764556885, + "learning_rate": 5e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7344000935554504, + "num_tokens": 282129169.0, + "step": 10908 + }, + { + "epoch": 1.1980013178124314, + "grad_norm": 2.005905866622925, + "learning_rate": 5e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7248401641845703, + "num_tokens": 282154383.0, + "step": 10909 + }, + { + "epoch": 1.1981111355150451, + "grad_norm": 1.9094815254211426, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7401602268218994, + "num_tokens": 282180989.0, + "step": 10910 + }, + { + "epoch": 1.1982209532176586, + "grad_norm": 1.8099024295806885, + "learning_rate": 5e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7106248140335083, + "num_tokens": 282210464.0, + "step": 10911 + }, + { + "epoch": 1.1983307709202724, + "grad_norm": 1.7133188247680664, + "learning_rate": 5e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7515630125999451, + "num_tokens": 282238548.0, + "step": 10912 + }, + { + "epoch": 1.198440588622886, + "grad_norm": 1.8368932008743286, + "learning_rate": 5e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7344447374343872, + "num_tokens": 282266207.0, + "step": 10913 + }, + { + "epoch": 1.1985504063254997, + "grad_norm": 1.7962732315063477, + "learning_rate": 5e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7557489275932312, + "num_tokens": 282290091.0, + "step": 10914 + }, + { + "epoch": 1.1986602240281132, + "grad_norm": 1.7753219604492188, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7406975626945496, + "num_tokens": 282317144.0, + "step": 10915 + }, + { + "epoch": 1.198770041730727, + "grad_norm": 1.7700862884521484, + "learning_rate": 5e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.731843113899231, + "num_tokens": 282345967.0, + "step": 10916 + }, + { + "epoch": 1.1988798594333407, + "grad_norm": 1.9844725131988525, + "learning_rate": 5e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.750991702079773, + "num_tokens": 282370726.0, + "step": 10917 + }, + { + "epoch": 1.1989896771359543, + "grad_norm": 1.8629099130630493, + "learning_rate": 5e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7087584137916565, + "num_tokens": 282399886.0, + "step": 10918 + }, + { + "epoch": 1.199099494838568, + "grad_norm": 1.7351216077804565, + "learning_rate": 5e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7428805232048035, + "num_tokens": 282427202.0, + "step": 10919 + }, + { + "epoch": 1.1992093125411816, + "grad_norm": 2.035569667816162, + "learning_rate": 5e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7227116823196411, + "num_tokens": 282450479.0, + "step": 10920 + }, + { + "epoch": 1.1993191302437953, + "grad_norm": 1.7856248617172241, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.742023766040802, + "num_tokens": 282479090.0, + "step": 10921 + }, + { + "epoch": 1.199428947946409, + "grad_norm": 1.9418445825576782, + "learning_rate": 5e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7379015684127808, + "num_tokens": 282503840.0, + "step": 10922 + }, + { + "epoch": 1.1995387656490226, + "grad_norm": 1.773261547088623, + "learning_rate": 5e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7476972341537476, + "num_tokens": 282529998.0, + "step": 10923 + }, + { + "epoch": 1.1996485833516364, + "grad_norm": 1.8084765672683716, + "learning_rate": 5e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7389047145843506, + "num_tokens": 282557234.0, + "step": 10924 + }, + { + "epoch": 1.19975840105425, + "grad_norm": 2.146249294281006, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7255271673202515, + "num_tokens": 282578984.0, + "step": 10925 + }, + { + "epoch": 1.1998682187568637, + "grad_norm": 1.7160249948501587, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7223264575004578, + "num_tokens": 282605601.0, + "step": 10926 + }, + { + "epoch": 1.1999780364594772, + "grad_norm": 1.7894611358642578, + "learning_rate": 5e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7239570617675781, + "num_tokens": 282634098.0, + "step": 10927 + }, + { + "epoch": 1.200087854162091, + "grad_norm": 1.8514155149459839, + "learning_rate": 5e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7368156909942627, + "num_tokens": 282662370.0, + "step": 10928 + }, + { + "epoch": 1.2001976718647045, + "grad_norm": 1.885663628578186, + "learning_rate": 5e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.750044584274292, + "num_tokens": 282685332.0, + "step": 10929 + }, + { + "epoch": 1.2003074895673183, + "grad_norm": 1.8019688129425049, + "learning_rate": 5e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7486665844917297, + "num_tokens": 282712650.0, + "step": 10930 + }, + { + "epoch": 1.200417307269932, + "grad_norm": 1.7383517026901245, + "learning_rate": 5e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7214584350585938, + "num_tokens": 282743171.0, + "step": 10931 + }, + { + "epoch": 1.2005271249725455, + "grad_norm": 1.8976664543151855, + "learning_rate": 5e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7434730529785156, + "num_tokens": 282767247.0, + "step": 10932 + }, + { + "epoch": 1.2006369426751593, + "grad_norm": 1.9355237483978271, + "learning_rate": 5e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7222366333007812, + "num_tokens": 282792160.0, + "step": 10933 + }, + { + "epoch": 1.2007467603777728, + "grad_norm": 1.8378546237945557, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.741508960723877, + "num_tokens": 282816710.0, + "step": 10934 + }, + { + "epoch": 1.2008565780803866, + "grad_norm": 1.7251286506652832, + "learning_rate": 5e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7324658632278442, + "num_tokens": 282846358.0, + "step": 10935 + }, + { + "epoch": 1.2009663957830001, + "grad_norm": 1.6486483812332153, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.706080436706543, + "num_tokens": 282882330.0, + "step": 10936 + }, + { + "epoch": 1.2010762134856139, + "grad_norm": 1.9191666841506958, + "learning_rate": 5e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7374242544174194, + "num_tokens": 282906403.0, + "step": 10937 + }, + { + "epoch": 1.2011860311882274, + "grad_norm": 1.7902671098709106, + "learning_rate": 5e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7424769401550293, + "num_tokens": 282932412.0, + "step": 10938 + }, + { + "epoch": 1.2012958488908412, + "grad_norm": 1.88884699344635, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7362405061721802, + "num_tokens": 282957944.0, + "step": 10939 + }, + { + "epoch": 1.201405666593455, + "grad_norm": 2.0986921787261963, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7364519834518433, + "num_tokens": 282977685.0, + "step": 10940 + }, + { + "epoch": 1.2015154842960685, + "grad_norm": 1.820741057395935, + "learning_rate": 5e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7342792749404907, + "num_tokens": 283003288.0, + "step": 10941 + }, + { + "epoch": 1.2016253019986822, + "grad_norm": 1.8449887037277222, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7221387624740601, + "num_tokens": 283030756.0, + "step": 10942 + }, + { + "epoch": 1.2017351197012958, + "grad_norm": 1.8123486042022705, + "learning_rate": 5e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7180792689323425, + "num_tokens": 283057019.0, + "step": 10943 + }, + { + "epoch": 1.2018449374039095, + "grad_norm": 1.61445152759552, + "learning_rate": 5e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7315237522125244, + "num_tokens": 283090247.0, + "step": 10944 + }, + { + "epoch": 1.2019547551065233, + "grad_norm": 2.2480905055999756, + "learning_rate": 5e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7246825695037842, + "num_tokens": 283111108.0, + "step": 10945 + }, + { + "epoch": 1.2020645728091368, + "grad_norm": 1.82432222366333, + "learning_rate": 5e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7592717409133911, + "num_tokens": 283134977.0, + "step": 10946 + }, + { + "epoch": 1.2021743905117506, + "grad_norm": 1.7863938808441162, + "learning_rate": 5e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7146642208099365, + "num_tokens": 283163923.0, + "step": 10947 + }, + { + "epoch": 1.202284208214364, + "grad_norm": 1.7745983600616455, + "learning_rate": 5e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7371387481689453, + "num_tokens": 283193185.0, + "step": 10948 + }, + { + "epoch": 1.2023940259169779, + "grad_norm": 1.945923924446106, + "learning_rate": 5e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7202556729316711, + "num_tokens": 283218016.0, + "step": 10949 + }, + { + "epoch": 1.2025038436195914, + "grad_norm": 1.660637617111206, + "learning_rate": 5e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7378674745559692, + "num_tokens": 283248091.0, + "step": 10950 + }, + { + "epoch": 1.2026136613222052, + "grad_norm": 1.7972975969314575, + "learning_rate": 5e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7117047309875488, + "num_tokens": 283278027.0, + "step": 10951 + }, + { + "epoch": 1.2027234790248187, + "grad_norm": 1.8346012830734253, + "learning_rate": 5e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7183631658554077, + "num_tokens": 283306900.0, + "step": 10952 + }, + { + "epoch": 1.2028332967274324, + "grad_norm": 1.860695242881775, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7293380498886108, + "num_tokens": 283335795.0, + "step": 10953 + }, + { + "epoch": 1.2029431144300462, + "grad_norm": 1.7096374034881592, + "learning_rate": 5e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7362892627716064, + "num_tokens": 283363613.0, + "step": 10954 + }, + { + "epoch": 1.2030529321326597, + "grad_norm": 1.8631891012191772, + "learning_rate": 5e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7524941563606262, + "num_tokens": 283389388.0, + "step": 10955 + }, + { + "epoch": 1.2031627498352735, + "grad_norm": 1.8225345611572266, + "learning_rate": 5e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7222402095794678, + "num_tokens": 283417486.0, + "step": 10956 + }, + { + "epoch": 1.203272567537887, + "grad_norm": 1.9488707780838013, + "learning_rate": 5e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7376195192337036, + "num_tokens": 283441558.0, + "step": 10957 + }, + { + "epoch": 1.2033823852405008, + "grad_norm": 1.9677698612213135, + "learning_rate": 5e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7381010055541992, + "num_tokens": 283465565.0, + "step": 10958 + }, + { + "epoch": 1.2034922029431145, + "grad_norm": 1.6267706155776978, + "learning_rate": 5e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7224124670028687, + "num_tokens": 283498704.0, + "step": 10959 + }, + { + "epoch": 1.203602020645728, + "grad_norm": 1.891211986541748, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7416117191314697, + "num_tokens": 283524238.0, + "step": 10960 + }, + { + "epoch": 1.2037118383483418, + "grad_norm": 2.271613836288452, + "learning_rate": 5e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7580012083053589, + "num_tokens": 283543013.0, + "step": 10961 + }, + { + "epoch": 1.2038216560509554, + "grad_norm": 1.9103527069091797, + "learning_rate": 5e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7465945482254028, + "num_tokens": 283568651.0, + "step": 10962 + }, + { + "epoch": 1.2039314737535691, + "grad_norm": 1.8621338605880737, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7179539203643799, + "num_tokens": 283596019.0, + "step": 10963 + }, + { + "epoch": 1.2040412914561827, + "grad_norm": 1.8510217666625977, + "learning_rate": 5e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7196802496910095, + "num_tokens": 283623423.0, + "step": 10964 + }, + { + "epoch": 1.2041511091587964, + "grad_norm": 1.7856227159500122, + "learning_rate": 5e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7242423295974731, + "num_tokens": 283653330.0, + "step": 10965 + }, + { + "epoch": 1.20426092686141, + "grad_norm": 1.6772935390472412, + "learning_rate": 5e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7193082571029663, + "num_tokens": 283685420.0, + "step": 10966 + }, + { + "epoch": 1.2043707445640237, + "grad_norm": 1.64035964012146, + "learning_rate": 5e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7338719964027405, + "num_tokens": 283715005.0, + "step": 10967 + }, + { + "epoch": 1.2044805622666375, + "grad_norm": 2.0357959270477295, + "learning_rate": 5e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7370067238807678, + "num_tokens": 283738077.0, + "step": 10968 + }, + { + "epoch": 1.204590379969251, + "grad_norm": 1.8274471759796143, + "learning_rate": 5e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7383179664611816, + "num_tokens": 283766353.0, + "step": 10969 + }, + { + "epoch": 1.2047001976718648, + "grad_norm": 1.8089736700057983, + "learning_rate": 5e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7450354695320129, + "num_tokens": 283793852.0, + "step": 10970 + }, + { + "epoch": 1.2048100153744783, + "grad_norm": 2.1000125408172607, + "learning_rate": 5e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7462595701217651, + "num_tokens": 283817333.0, + "step": 10971 + }, + { + "epoch": 1.204919833077092, + "grad_norm": 1.938523292541504, + "learning_rate": 5e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7254632115364075, + "num_tokens": 283843240.0, + "step": 10972 + }, + { + "epoch": 1.2050296507797058, + "grad_norm": 1.9065362215042114, + "learning_rate": 5e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7341457605361938, + "num_tokens": 283867418.0, + "step": 10973 + }, + { + "epoch": 1.2051394684823193, + "grad_norm": 1.8821138143539429, + "learning_rate": 5e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7480846643447876, + "num_tokens": 283893258.0, + "step": 10974 + }, + { + "epoch": 1.205249286184933, + "grad_norm": 2.0594592094421387, + "learning_rate": 5e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7512564063072205, + "num_tokens": 283914844.0, + "step": 10975 + }, + { + "epoch": 1.2053591038875466, + "grad_norm": 2.1088693141937256, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7366843223571777, + "num_tokens": 283937936.0, + "step": 10976 + }, + { + "epoch": 1.2054689215901604, + "grad_norm": 1.8988364934921265, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7561849355697632, + "num_tokens": 283963749.0, + "step": 10977 + }, + { + "epoch": 1.205578739292774, + "grad_norm": 2.0310444831848145, + "learning_rate": 5e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7335491180419922, + "num_tokens": 283987019.0, + "step": 10978 + }, + { + "epoch": 1.2056885569953877, + "grad_norm": 2.0460405349731445, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7303628325462341, + "num_tokens": 284010337.0, + "step": 10979 + }, + { + "epoch": 1.2057983746980012, + "grad_norm": 1.8687893152236938, + "learning_rate": 5e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7102714776992798, + "num_tokens": 284036636.0, + "step": 10980 + }, + { + "epoch": 1.205908192400615, + "grad_norm": 1.5793805122375488, + "learning_rate": 5e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7229111194610596, + "num_tokens": 284076018.0, + "step": 10981 + }, + { + "epoch": 1.2060180101032287, + "grad_norm": 2.0351645946502686, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7218881845474243, + "num_tokens": 284100571.0, + "step": 10982 + }, + { + "epoch": 1.2061278278058423, + "grad_norm": 1.7418060302734375, + "learning_rate": 5e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7324317693710327, + "num_tokens": 284128163.0, + "step": 10983 + }, + { + "epoch": 1.206237645508456, + "grad_norm": 1.9810739755630493, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7766934633255005, + "num_tokens": 284151345.0, + "step": 10984 + }, + { + "epoch": 1.2063474632110696, + "grad_norm": 2.042330026626587, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7242215871810913, + "num_tokens": 284176484.0, + "step": 10985 + }, + { + "epoch": 1.2064572809136833, + "grad_norm": 2.069220542907715, + "learning_rate": 5e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7263906002044678, + "num_tokens": 284199357.0, + "step": 10986 + }, + { + "epoch": 1.2065670986162969, + "grad_norm": 1.8960037231445312, + "learning_rate": 5e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7448416948318481, + "num_tokens": 284223357.0, + "step": 10987 + }, + { + "epoch": 1.2066769163189106, + "grad_norm": 1.873282790184021, + "learning_rate": 5e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.755071222782135, + "num_tokens": 284248861.0, + "step": 10988 + }, + { + "epoch": 1.2067867340215244, + "grad_norm": 1.8603203296661377, + "learning_rate": 5e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7576768398284912, + "num_tokens": 284272641.0, + "step": 10989 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 1.9136908054351807, + "learning_rate": 5e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7117334008216858, + "num_tokens": 284298734.0, + "step": 10990 + }, + { + "epoch": 1.2070063694267517, + "grad_norm": 2.0923004150390625, + "learning_rate": 5e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7371886968612671, + "num_tokens": 284321361.0, + "step": 10991 + }, + { + "epoch": 1.2071161871293652, + "grad_norm": 1.901650071144104, + "learning_rate": 5e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7377990484237671, + "num_tokens": 284345810.0, + "step": 10992 + }, + { + "epoch": 1.207226004831979, + "grad_norm": 1.8667579889297485, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.754443883895874, + "num_tokens": 284370339.0, + "step": 10993 + }, + { + "epoch": 1.2073358225345925, + "grad_norm": 1.9728885889053345, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7322659492492676, + "num_tokens": 284393841.0, + "step": 10994 + }, + { + "epoch": 1.2074456402372062, + "grad_norm": 1.8464035987854004, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7215571403503418, + "num_tokens": 284419465.0, + "step": 10995 + }, + { + "epoch": 1.20755545793982, + "grad_norm": 1.9636601209640503, + "learning_rate": 5e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7414727210998535, + "num_tokens": 284441838.0, + "step": 10996 + }, + { + "epoch": 1.2076652756424335, + "grad_norm": 1.7042231559753418, + "learning_rate": 5e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7304506897926331, + "num_tokens": 284473029.0, + "step": 10997 + }, + { + "epoch": 1.2077750933450473, + "grad_norm": 2.040421485900879, + "learning_rate": 5e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7575942873954773, + "num_tokens": 284493271.0, + "step": 10998 + }, + { + "epoch": 1.2078849110476608, + "grad_norm": 1.7771931886672974, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7217074632644653, + "num_tokens": 284522054.0, + "step": 10999 + }, + { + "epoch": 1.2079947287502746, + "grad_norm": 1.8993443250656128, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7273427248001099, + "num_tokens": 284546684.0, + "step": 11000 + }, + { + "epoch": 1.2081045464528881, + "grad_norm": 1.743139386177063, + "learning_rate": 5e-06, + "loss": 0.959, + "mean_token_accuracy": 0.697834312915802, + "num_tokens": 284577312.0, + "step": 11001 + }, + { + "epoch": 1.2082143641555019, + "grad_norm": 1.9975757598876953, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7400569319725037, + "num_tokens": 284600384.0, + "step": 11002 + }, + { + "epoch": 1.2083241818581154, + "grad_norm": 1.8674418926239014, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7434400320053101, + "num_tokens": 284625764.0, + "step": 11003 + }, + { + "epoch": 1.2084339995607292, + "grad_norm": 1.755070686340332, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7451985478401184, + "num_tokens": 284653041.0, + "step": 11004 + }, + { + "epoch": 1.208543817263343, + "grad_norm": 1.7666505575180054, + "learning_rate": 5e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7381629347801208, + "num_tokens": 284681899.0, + "step": 11005 + }, + { + "epoch": 1.2086536349659565, + "grad_norm": 2.099245309829712, + "learning_rate": 5e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7368125915527344, + "num_tokens": 284702129.0, + "step": 11006 + }, + { + "epoch": 1.2087634526685702, + "grad_norm": 2.029771566390991, + "learning_rate": 5e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7378661632537842, + "num_tokens": 284725975.0, + "step": 11007 + }, + { + "epoch": 1.2088732703711838, + "grad_norm": 1.7630712985992432, + "learning_rate": 5e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7298598289489746, + "num_tokens": 284754697.0, + "step": 11008 + }, + { + "epoch": 1.2089830880737975, + "grad_norm": 2.1373534202575684, + "learning_rate": 5e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7421355843544006, + "num_tokens": 284775173.0, + "step": 11009 + }, + { + "epoch": 1.2090929057764113, + "grad_norm": 1.8497165441513062, + "learning_rate": 5e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7173856496810913, + "num_tokens": 284802810.0, + "step": 11010 + }, + { + "epoch": 1.2092027234790248, + "grad_norm": 1.935357928276062, + "learning_rate": 5e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7421313524246216, + "num_tokens": 284827564.0, + "step": 11011 + }, + { + "epoch": 1.2093125411816386, + "grad_norm": 1.8076932430267334, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7366425395011902, + "num_tokens": 284854101.0, + "step": 11012 + }, + { + "epoch": 1.209422358884252, + "grad_norm": 1.7950711250305176, + "learning_rate": 5e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7188160419464111, + "num_tokens": 284881468.0, + "step": 11013 + }, + { + "epoch": 1.2095321765868658, + "grad_norm": 2.0060853958129883, + "learning_rate": 5e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7362595796585083, + "num_tokens": 284904533.0, + "step": 11014 + }, + { + "epoch": 1.2096419942894794, + "grad_norm": 1.9612566232681274, + "learning_rate": 5e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7249550223350525, + "num_tokens": 284930646.0, + "step": 11015 + }, + { + "epoch": 1.2097518119920931, + "grad_norm": 1.6421092748641968, + "learning_rate": 5e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7332807183265686, + "num_tokens": 284963977.0, + "step": 11016 + }, + { + "epoch": 1.2098616296947067, + "grad_norm": 1.6985008716583252, + "learning_rate": 5e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7278977632522583, + "num_tokens": 284993725.0, + "step": 11017 + }, + { + "epoch": 1.2099714473973204, + "grad_norm": 1.933305025100708, + "learning_rate": 5e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7101448774337769, + "num_tokens": 285019160.0, + "step": 11018 + }, + { + "epoch": 1.2100812650999342, + "grad_norm": 2.23642897605896, + "learning_rate": 5e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7300871014595032, + "num_tokens": 285038760.0, + "step": 11019 + }, + { + "epoch": 1.2101910828025477, + "grad_norm": 2.082602024078369, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7478498816490173, + "num_tokens": 285061302.0, + "step": 11020 + }, + { + "epoch": 1.2103009005051615, + "grad_norm": 1.8644548654556274, + "learning_rate": 5e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7317306995391846, + "num_tokens": 285087194.0, + "step": 11021 + }, + { + "epoch": 1.210410718207775, + "grad_norm": 2.088469982147217, + "learning_rate": 5e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7404012680053711, + "num_tokens": 285108114.0, + "step": 11022 + }, + { + "epoch": 1.2105205359103888, + "grad_norm": 2.0601916313171387, + "learning_rate": 5e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7255721092224121, + "num_tokens": 285132230.0, + "step": 11023 + }, + { + "epoch": 1.2106303536130025, + "grad_norm": 1.90890371799469, + "learning_rate": 5e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7543668150901794, + "num_tokens": 285157733.0, + "step": 11024 + }, + { + "epoch": 1.210740171315616, + "grad_norm": 1.893776535987854, + "learning_rate": 5e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7227444648742676, + "num_tokens": 285182273.0, + "step": 11025 + }, + { + "epoch": 1.2108499890182298, + "grad_norm": 1.9979099035263062, + "learning_rate": 5e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7556375861167908, + "num_tokens": 285205053.0, + "step": 11026 + }, + { + "epoch": 1.2109598067208434, + "grad_norm": 1.7115697860717773, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7239019870758057, + "num_tokens": 285236277.0, + "step": 11027 + }, + { + "epoch": 1.2110696244234571, + "grad_norm": 2.1426196098327637, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7439641952514648, + "num_tokens": 285258264.0, + "step": 11028 + }, + { + "epoch": 1.2111794421260706, + "grad_norm": 1.7667254209518433, + "learning_rate": 5e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7064365148544312, + "num_tokens": 285289537.0, + "step": 11029 + }, + { + "epoch": 1.2112892598286844, + "grad_norm": 1.728842854499817, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7318530678749084, + "num_tokens": 285316783.0, + "step": 11030 + }, + { + "epoch": 1.211399077531298, + "grad_norm": 1.7396013736724854, + "learning_rate": 5e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7387372255325317, + "num_tokens": 285346597.0, + "step": 11031 + }, + { + "epoch": 1.2115088952339117, + "grad_norm": 1.856852412223816, + "learning_rate": 5e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7400635480880737, + "num_tokens": 285373835.0, + "step": 11032 + }, + { + "epoch": 1.2116187129365255, + "grad_norm": 1.843252182006836, + "learning_rate": 5e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.749703049659729, + "num_tokens": 285398656.0, + "step": 11033 + }, + { + "epoch": 1.211728530639139, + "grad_norm": 1.9329482316970825, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7351099252700806, + "num_tokens": 285422586.0, + "step": 11034 + }, + { + "epoch": 1.2118383483417527, + "grad_norm": 2.0128870010375977, + "learning_rate": 5e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7372356653213501, + "num_tokens": 285447302.0, + "step": 11035 + }, + { + "epoch": 1.2119481660443663, + "grad_norm": 2.148933172225952, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7509631514549255, + "num_tokens": 285466313.0, + "step": 11036 + }, + { + "epoch": 1.21205798374698, + "grad_norm": 1.7873040437698364, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.731613039970398, + "num_tokens": 285493219.0, + "step": 11037 + }, + { + "epoch": 1.2121678014495938, + "grad_norm": 1.926334023475647, + "learning_rate": 5e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7322732210159302, + "num_tokens": 285517559.0, + "step": 11038 + }, + { + "epoch": 1.2122776191522073, + "grad_norm": 2.040827989578247, + "learning_rate": 5e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7412317395210266, + "num_tokens": 285538473.0, + "step": 11039 + }, + { + "epoch": 1.212387436854821, + "grad_norm": 2.0367648601531982, + "learning_rate": 5e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.7537339329719543, + "num_tokens": 285559518.0, + "step": 11040 + }, + { + "epoch": 1.2124972545574346, + "grad_norm": 1.659674048423767, + "learning_rate": 5e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7337161302566528, + "num_tokens": 285592089.0, + "step": 11041 + }, + { + "epoch": 1.2126070722600484, + "grad_norm": 1.9628921747207642, + "learning_rate": 5e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7464921474456787, + "num_tokens": 285616016.0, + "step": 11042 + }, + { + "epoch": 1.212716889962662, + "grad_norm": 1.902463674545288, + "learning_rate": 5e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7222337126731873, + "num_tokens": 285645639.0, + "step": 11043 + }, + { + "epoch": 1.2128267076652757, + "grad_norm": 1.8674581050872803, + "learning_rate": 5e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7606324553489685, + "num_tokens": 285669698.0, + "step": 11044 + }, + { + "epoch": 1.2129365253678892, + "grad_norm": 2.060063600540161, + "learning_rate": 5e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7424197196960449, + "num_tokens": 285693478.0, + "step": 11045 + }, + { + "epoch": 1.213046343070503, + "grad_norm": 1.7448383569717407, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7470163106918335, + "num_tokens": 285721572.0, + "step": 11046 + }, + { + "epoch": 1.2131561607731167, + "grad_norm": 1.806087851524353, + "learning_rate": 5e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.7560005187988281, + "num_tokens": 285748270.0, + "step": 11047 + }, + { + "epoch": 1.2132659784757303, + "grad_norm": 1.902211308479309, + "learning_rate": 5e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7489015460014343, + "num_tokens": 285773981.0, + "step": 11048 + }, + { + "epoch": 1.213375796178344, + "grad_norm": 1.7789877653121948, + "learning_rate": 5e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7112435102462769, + "num_tokens": 285801498.0, + "step": 11049 + }, + { + "epoch": 1.2134856138809575, + "grad_norm": 1.8785898685455322, + "learning_rate": 5e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7230662703514099, + "num_tokens": 285827101.0, + "step": 11050 + }, + { + "epoch": 1.2135954315835713, + "grad_norm": 1.8917337656021118, + "learning_rate": 5e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7164183259010315, + "num_tokens": 285855220.0, + "step": 11051 + }, + { + "epoch": 1.2137052492861848, + "grad_norm": 1.7920036315917969, + "learning_rate": 5e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7587572336196899, + "num_tokens": 285881740.0, + "step": 11052 + }, + { + "epoch": 1.2138150669887986, + "grad_norm": 2.0089175701141357, + "learning_rate": 5e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7399141788482666, + "num_tokens": 285904387.0, + "step": 11053 + }, + { + "epoch": 1.2139248846914121, + "grad_norm": 2.073823928833008, + "learning_rate": 5e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7355374693870544, + "num_tokens": 285927706.0, + "step": 11054 + }, + { + "epoch": 1.2140347023940259, + "grad_norm": 1.9301906824111938, + "learning_rate": 5e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7107201218605042, + "num_tokens": 285954704.0, + "step": 11055 + }, + { + "epoch": 1.2141445200966396, + "grad_norm": 1.851922869682312, + "learning_rate": 5e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7386852502822876, + "num_tokens": 285983649.0, + "step": 11056 + }, + { + "epoch": 1.2142543377992532, + "grad_norm": 2.2011806964874268, + "learning_rate": 5e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7266382575035095, + "num_tokens": 286004449.0, + "step": 11057 + }, + { + "epoch": 1.214364155501867, + "grad_norm": 1.9062671661376953, + "learning_rate": 5e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7462009787559509, + "num_tokens": 286029284.0, + "step": 11058 + }, + { + "epoch": 1.2144739732044805, + "grad_norm": 1.7664250135421753, + "learning_rate": 5e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7243155241012573, + "num_tokens": 286060154.0, + "step": 11059 + }, + { + "epoch": 1.2145837909070942, + "grad_norm": 1.9218482971191406, + "learning_rate": 5e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7491400241851807, + "num_tokens": 286084612.0, + "step": 11060 + }, + { + "epoch": 1.214693608609708, + "grad_norm": 1.7787662744522095, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7275545597076416, + "num_tokens": 286111038.0, + "step": 11061 + }, + { + "epoch": 1.2148034263123215, + "grad_norm": 2.167668581008911, + "learning_rate": 5e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7588571310043335, + "num_tokens": 286131519.0, + "step": 11062 + }, + { + "epoch": 1.2149132440149353, + "grad_norm": 1.7898755073547363, + "learning_rate": 5e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7165290713310242, + "num_tokens": 286162048.0, + "step": 11063 + }, + { + "epoch": 1.2150230617175488, + "grad_norm": 1.9163719415664673, + "learning_rate": 5e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7383259534835815, + "num_tokens": 286185498.0, + "step": 11064 + }, + { + "epoch": 1.2151328794201626, + "grad_norm": 1.7006877660751343, + "learning_rate": 5e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.715299129486084, + "num_tokens": 286217860.0, + "step": 11065 + }, + { + "epoch": 1.215242697122776, + "grad_norm": 1.8675495386123657, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7305728197097778, + "num_tokens": 286243106.0, + "step": 11066 + }, + { + "epoch": 1.2153525148253899, + "grad_norm": 1.7922488451004028, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7407345771789551, + "num_tokens": 286270368.0, + "step": 11067 + }, + { + "epoch": 1.2154623325280034, + "grad_norm": 1.9568419456481934, + "learning_rate": 5e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7312296628952026, + "num_tokens": 286294174.0, + "step": 11068 + }, + { + "epoch": 1.2155721502306172, + "grad_norm": 2.1208038330078125, + "learning_rate": 5e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7517390251159668, + "num_tokens": 286314233.0, + "step": 11069 + }, + { + "epoch": 1.215681967933231, + "grad_norm": 1.9203659296035767, + "learning_rate": 5e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.75577312707901, + "num_tokens": 286337187.0, + "step": 11070 + }, + { + "epoch": 1.2157917856358444, + "grad_norm": 1.7823891639709473, + "learning_rate": 5e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7478322982788086, + "num_tokens": 286365602.0, + "step": 11071 + }, + { + "epoch": 1.2159016033384582, + "grad_norm": 1.7491410970687866, + "learning_rate": 5e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.731250524520874, + "num_tokens": 286393828.0, + "step": 11072 + }, + { + "epoch": 1.2160114210410717, + "grad_norm": 1.6891796588897705, + "learning_rate": 5e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.7446391582489014, + "num_tokens": 286424497.0, + "step": 11073 + }, + { + "epoch": 1.2161212387436855, + "grad_norm": 1.7705445289611816, + "learning_rate": 5e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7359616756439209, + "num_tokens": 286452815.0, + "step": 11074 + }, + { + "epoch": 1.2162310564462993, + "grad_norm": 2.239435911178589, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7194446921348572, + "num_tokens": 286475039.0, + "step": 11075 + }, + { + "epoch": 1.2163408741489128, + "grad_norm": 1.709022045135498, + "learning_rate": 5e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7321755886077881, + "num_tokens": 286506146.0, + "step": 11076 + }, + { + "epoch": 1.2164506918515265, + "grad_norm": 1.872802734375, + "learning_rate": 5e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7405603528022766, + "num_tokens": 286532090.0, + "step": 11077 + }, + { + "epoch": 1.21656050955414, + "grad_norm": 1.9947108030319214, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.72957444190979, + "num_tokens": 286555910.0, + "step": 11078 + }, + { + "epoch": 1.2166703272567538, + "grad_norm": 1.8537737131118774, + "learning_rate": 5e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.725210428237915, + "num_tokens": 286581631.0, + "step": 11079 + }, + { + "epoch": 1.2167801449593674, + "grad_norm": 1.7590399980545044, + "learning_rate": 5e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7094115018844604, + "num_tokens": 286614257.0, + "step": 11080 + }, + { + "epoch": 1.2168899626619811, + "grad_norm": 2.054497241973877, + "learning_rate": 5e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.736632227897644, + "num_tokens": 286634837.0, + "step": 11081 + }, + { + "epoch": 1.2169997803645947, + "grad_norm": 1.8353346586227417, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7305856943130493, + "num_tokens": 286662830.0, + "step": 11082 + }, + { + "epoch": 1.2171095980672084, + "grad_norm": 2.14747953414917, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7498255968093872, + "num_tokens": 286685184.0, + "step": 11083 + }, + { + "epoch": 1.2172194157698222, + "grad_norm": 1.7273050546646118, + "learning_rate": 5e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7226704359054565, + "num_tokens": 286713016.0, + "step": 11084 + }, + { + "epoch": 1.2173292334724357, + "grad_norm": 1.7270419597625732, + "learning_rate": 5e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7587535977363586, + "num_tokens": 286741891.0, + "step": 11085 + }, + { + "epoch": 1.2174390511750495, + "grad_norm": 1.7843570709228516, + "learning_rate": 5e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7538557052612305, + "num_tokens": 286767155.0, + "step": 11086 + }, + { + "epoch": 1.217548868877663, + "grad_norm": 1.8358335494995117, + "learning_rate": 5e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7476348876953125, + "num_tokens": 286791588.0, + "step": 11087 + }, + { + "epoch": 1.2176586865802768, + "grad_norm": 1.7237169742584229, + "learning_rate": 5e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7105805277824402, + "num_tokens": 286821835.0, + "step": 11088 + }, + { + "epoch": 1.2177685042828905, + "grad_norm": 1.9494768381118774, + "learning_rate": 5e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7461972832679749, + "num_tokens": 286846210.0, + "step": 11089 + }, + { + "epoch": 1.217878321985504, + "grad_norm": 1.7709712982177734, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7280561923980713, + "num_tokens": 286875135.0, + "step": 11090 + }, + { + "epoch": 1.2179881396881178, + "grad_norm": 1.818764328956604, + "learning_rate": 5e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7430285215377808, + "num_tokens": 286901237.0, + "step": 11091 + }, + { + "epoch": 1.2180979573907313, + "grad_norm": 2.1456146240234375, + "learning_rate": 5e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.737979531288147, + "num_tokens": 286922556.0, + "step": 11092 + }, + { + "epoch": 1.218207775093345, + "grad_norm": 2.0243661403656006, + "learning_rate": 5e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.751759946346283, + "num_tokens": 286944655.0, + "step": 11093 + }, + { + "epoch": 1.2183175927959586, + "grad_norm": 2.146557569503784, + "learning_rate": 5e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.73415207862854, + "num_tokens": 286966731.0, + "step": 11094 + }, + { + "epoch": 1.2184274104985724, + "grad_norm": 1.961916208267212, + "learning_rate": 5e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.715020477771759, + "num_tokens": 286992179.0, + "step": 11095 + }, + { + "epoch": 1.218537228201186, + "grad_norm": 2.011563301086426, + "learning_rate": 5e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7304072380065918, + "num_tokens": 287016027.0, + "step": 11096 + }, + { + "epoch": 1.2186470459037997, + "grad_norm": 2.0331997871398926, + "learning_rate": 5e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7538820505142212, + "num_tokens": 287036397.0, + "step": 11097 + }, + { + "epoch": 1.2187568636064134, + "grad_norm": 1.7879263162612915, + "learning_rate": 5e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7289657592773438, + "num_tokens": 287063879.0, + "step": 11098 + }, + { + "epoch": 1.218866681309027, + "grad_norm": 1.71091890335083, + "learning_rate": 5e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7184479236602783, + "num_tokens": 287096579.0, + "step": 11099 + }, + { + "epoch": 1.2189764990116407, + "grad_norm": 1.8751667737960815, + "learning_rate": 5e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7418166399002075, + "num_tokens": 287122537.0, + "step": 11100 + }, + { + "epoch": 1.2190863167142543, + "grad_norm": 2.0442676544189453, + "learning_rate": 5e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7488956451416016, + "num_tokens": 287145168.0, + "step": 11101 + }, + { + "epoch": 1.219196134416868, + "grad_norm": 1.9813401699066162, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7414838075637817, + "num_tokens": 287166717.0, + "step": 11102 + }, + { + "epoch": 1.2193059521194818, + "grad_norm": 2.2542378902435303, + "learning_rate": 5e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7401272058486938, + "num_tokens": 287185405.0, + "step": 11103 + }, + { + "epoch": 1.2194157698220953, + "grad_norm": 1.8054640293121338, + "learning_rate": 5e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7495143413543701, + "num_tokens": 287210141.0, + "step": 11104 + }, + { + "epoch": 1.219525587524709, + "grad_norm": 2.026714563369751, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7281947135925293, + "num_tokens": 287233731.0, + "step": 11105 + }, + { + "epoch": 1.2196354052273226, + "grad_norm": 1.705739974975586, + "learning_rate": 5e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7368379831314087, + "num_tokens": 287268528.0, + "step": 11106 + }, + { + "epoch": 1.2197452229299364, + "grad_norm": 2.0613467693328857, + "learning_rate": 5e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7457688450813293, + "num_tokens": 287290888.0, + "step": 11107 + }, + { + "epoch": 1.21985504063255, + "grad_norm": 1.9659382104873657, + "learning_rate": 5e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7340923547744751, + "num_tokens": 287317575.0, + "step": 11108 + }, + { + "epoch": 1.2199648583351637, + "grad_norm": 1.8781349658966064, + "learning_rate": 5e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7510381937026978, + "num_tokens": 287342493.0, + "step": 11109 + }, + { + "epoch": 1.2200746760377772, + "grad_norm": 1.9288403987884521, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7382628917694092, + "num_tokens": 287368218.0, + "step": 11110 + }, + { + "epoch": 1.220184493740391, + "grad_norm": 2.1137919425964355, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.731525182723999, + "num_tokens": 287390803.0, + "step": 11111 + }, + { + "epoch": 1.2202943114430047, + "grad_norm": 2.2471439838409424, + "learning_rate": 5e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7295143604278564, + "num_tokens": 287411108.0, + "step": 11112 + }, + { + "epoch": 1.2204041291456182, + "grad_norm": 1.6818616390228271, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7335636019706726, + "num_tokens": 287444134.0, + "step": 11113 + }, + { + "epoch": 1.220513946848232, + "grad_norm": 1.9190661907196045, + "learning_rate": 5e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7378793358802795, + "num_tokens": 287469597.0, + "step": 11114 + }, + { + "epoch": 1.2206237645508455, + "grad_norm": 1.765381097793579, + "learning_rate": 5e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7228918075561523, + "num_tokens": 287502491.0, + "step": 11115 + }, + { + "epoch": 1.2207335822534593, + "grad_norm": 1.8279190063476562, + "learning_rate": 5e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7263761758804321, + "num_tokens": 287530521.0, + "step": 11116 + }, + { + "epoch": 1.2208433999560728, + "grad_norm": 1.5898405313491821, + "learning_rate": 5e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7402073740959167, + "num_tokens": 287561506.0, + "step": 11117 + }, + { + "epoch": 1.2209532176586866, + "grad_norm": 1.9190994501113892, + "learning_rate": 5e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7157002091407776, + "num_tokens": 287589333.0, + "step": 11118 + }, + { + "epoch": 1.2210630353613001, + "grad_norm": 1.865971565246582, + "learning_rate": 5e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7350719571113586, + "num_tokens": 287614647.0, + "step": 11119 + }, + { + "epoch": 1.2211728530639139, + "grad_norm": 1.9952178001403809, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7285563349723816, + "num_tokens": 287637710.0, + "step": 11120 + }, + { + "epoch": 1.2212826707665276, + "grad_norm": 1.7669399976730347, + "learning_rate": 5e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.725875735282898, + "num_tokens": 287666909.0, + "step": 11121 + }, + { + "epoch": 1.2213924884691412, + "grad_norm": 1.709719181060791, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.765007734298706, + "num_tokens": 287694932.0, + "step": 11122 + }, + { + "epoch": 1.221502306171755, + "grad_norm": 1.9569733142852783, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7310903072357178, + "num_tokens": 287719666.0, + "step": 11123 + }, + { + "epoch": 1.2216121238743685, + "grad_norm": 1.9023922681808472, + "learning_rate": 5e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7297277450561523, + "num_tokens": 287743590.0, + "step": 11124 + }, + { + "epoch": 1.2217219415769822, + "grad_norm": 1.9055222272872925, + "learning_rate": 5e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7117382884025574, + "num_tokens": 287770342.0, + "step": 11125 + }, + { + "epoch": 1.221831759279596, + "grad_norm": 1.9806137084960938, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7247458100318909, + "num_tokens": 287794973.0, + "step": 11126 + }, + { + "epoch": 1.2219415769822095, + "grad_norm": 2.0235366821289062, + "learning_rate": 5e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.733275294303894, + "num_tokens": 287817842.0, + "step": 11127 + }, + { + "epoch": 1.2220513946848233, + "grad_norm": 1.8467621803283691, + "learning_rate": 5e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7343997955322266, + "num_tokens": 287845180.0, + "step": 11128 + }, + { + "epoch": 1.2221612123874368, + "grad_norm": 1.9201072454452515, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7400891184806824, + "num_tokens": 287870163.0, + "step": 11129 + }, + { + "epoch": 1.2222710300900506, + "grad_norm": 2.1848433017730713, + "learning_rate": 5e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7519510984420776, + "num_tokens": 287888740.0, + "step": 11130 + }, + { + "epoch": 1.222380847792664, + "grad_norm": 1.749610185623169, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7339743375778198, + "num_tokens": 287919399.0, + "step": 11131 + }, + { + "epoch": 1.2224906654952779, + "grad_norm": 1.9020143747329712, + "learning_rate": 5e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7596287727355957, + "num_tokens": 287943919.0, + "step": 11132 + }, + { + "epoch": 1.2226004831978914, + "grad_norm": 1.8247517347335815, + "learning_rate": 5e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7055068016052246, + "num_tokens": 287970643.0, + "step": 11133 + }, + { + "epoch": 1.2227103009005051, + "grad_norm": 1.774603247642517, + "learning_rate": 5e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7127838134765625, + "num_tokens": 288000004.0, + "step": 11134 + }, + { + "epoch": 1.222820118603119, + "grad_norm": 1.76060950756073, + "learning_rate": 5e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7536219954490662, + "num_tokens": 288029145.0, + "step": 11135 + }, + { + "epoch": 1.2229299363057324, + "grad_norm": 2.066706418991089, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7376505136489868, + "num_tokens": 288049245.0, + "step": 11136 + }, + { + "epoch": 1.2230397540083462, + "grad_norm": 1.8344026803970337, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7372110486030579, + "num_tokens": 288077781.0, + "step": 11137 + }, + { + "epoch": 1.2231495717109597, + "grad_norm": 1.9861990213394165, + "learning_rate": 5e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7372144460678101, + "num_tokens": 288100679.0, + "step": 11138 + }, + { + "epoch": 1.2232593894135735, + "grad_norm": 1.7754685878753662, + "learning_rate": 5e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7536531686782837, + "num_tokens": 288126422.0, + "step": 11139 + }, + { + "epoch": 1.2233692071161872, + "grad_norm": 1.79952073097229, + "learning_rate": 5e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7419304847717285, + "num_tokens": 288152801.0, + "step": 11140 + }, + { + "epoch": 1.2234790248188008, + "grad_norm": 1.7367075681686401, + "learning_rate": 5e-06, + "loss": 0.962, + "mean_token_accuracy": 0.6953144669532776, + "num_tokens": 288184891.0, + "step": 11141 + }, + { + "epoch": 1.2235888425214145, + "grad_norm": 2.0178158283233643, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7291700839996338, + "num_tokens": 288209556.0, + "step": 11142 + }, + { + "epoch": 1.223698660224028, + "grad_norm": 1.7412660121917725, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.723110556602478, + "num_tokens": 288239684.0, + "step": 11143 + }, + { + "epoch": 1.2238084779266418, + "grad_norm": 1.847508430480957, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7539131045341492, + "num_tokens": 288261889.0, + "step": 11144 + }, + { + "epoch": 1.2239182956292554, + "grad_norm": 1.8135223388671875, + "learning_rate": 5e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7157188057899475, + "num_tokens": 288291753.0, + "step": 11145 + }, + { + "epoch": 1.2240281133318691, + "grad_norm": 1.7288506031036377, + "learning_rate": 5e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7148959636688232, + "num_tokens": 288320860.0, + "step": 11146 + }, + { + "epoch": 1.2241379310344827, + "grad_norm": 1.6572736501693726, + "learning_rate": 5e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7124642133712769, + "num_tokens": 288354697.0, + "step": 11147 + }, + { + "epoch": 1.2242477487370964, + "grad_norm": 1.9465994834899902, + "learning_rate": 5e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7315554022789001, + "num_tokens": 288378091.0, + "step": 11148 + }, + { + "epoch": 1.2243575664397102, + "grad_norm": 1.8376796245574951, + "learning_rate": 5e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7238495349884033, + "num_tokens": 288405573.0, + "step": 11149 + }, + { + "epoch": 1.2244673841423237, + "grad_norm": 1.8555328845977783, + "learning_rate": 5e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7093112468719482, + "num_tokens": 288435788.0, + "step": 11150 + }, + { + "epoch": 1.2245772018449375, + "grad_norm": 2.0611672401428223, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7422320246696472, + "num_tokens": 288456708.0, + "step": 11151 + }, + { + "epoch": 1.224687019547551, + "grad_norm": 1.8036985397338867, + "learning_rate": 5e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7380145192146301, + "num_tokens": 288484171.0, + "step": 11152 + }, + { + "epoch": 1.2247968372501647, + "grad_norm": 1.9769372940063477, + "learning_rate": 5e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7370721697807312, + "num_tokens": 288507465.0, + "step": 11153 + }, + { + "epoch": 1.2249066549527785, + "grad_norm": 1.6662791967391968, + "learning_rate": 5e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7015860080718994, + "num_tokens": 288542220.0, + "step": 11154 + }, + { + "epoch": 1.225016472655392, + "grad_norm": 1.933842658996582, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7464223504066467, + "num_tokens": 288565004.0, + "step": 11155 + }, + { + "epoch": 1.2251262903580058, + "grad_norm": 1.7882537841796875, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7240102291107178, + "num_tokens": 288595238.0, + "step": 11156 + }, + { + "epoch": 1.2252361080606193, + "grad_norm": 1.7822990417480469, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7500745058059692, + "num_tokens": 288623902.0, + "step": 11157 + }, + { + "epoch": 1.225345925763233, + "grad_norm": 1.9073030948638916, + "learning_rate": 5e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7207984328269958, + "num_tokens": 288648455.0, + "step": 11158 + }, + { + "epoch": 1.2254557434658466, + "grad_norm": 1.73780357837677, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.717851459980011, + "num_tokens": 288678192.0, + "step": 11159 + }, + { + "epoch": 1.2255655611684604, + "grad_norm": 1.8898941278457642, + "learning_rate": 5e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7354639172554016, + "num_tokens": 288705230.0, + "step": 11160 + }, + { + "epoch": 1.225675378871074, + "grad_norm": 1.7784672975540161, + "learning_rate": 5e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7271136045455933, + "num_tokens": 288732620.0, + "step": 11161 + }, + { + "epoch": 1.2257851965736877, + "grad_norm": 1.7519111633300781, + "learning_rate": 5e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7343885898590088, + "num_tokens": 288762523.0, + "step": 11162 + }, + { + "epoch": 1.2258950142763014, + "grad_norm": 1.946265459060669, + "learning_rate": 5e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7500734329223633, + "num_tokens": 288788468.0, + "step": 11163 + }, + { + "epoch": 1.226004831978915, + "grad_norm": 2.0793216228485107, + "learning_rate": 5e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7303166389465332, + "num_tokens": 288810616.0, + "step": 11164 + }, + { + "epoch": 1.2261146496815287, + "grad_norm": 1.8677407503128052, + "learning_rate": 5e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7413387894630432, + "num_tokens": 288838723.0, + "step": 11165 + }, + { + "epoch": 1.2262244673841423, + "grad_norm": 1.8403691053390503, + "learning_rate": 5e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7126338481903076, + "num_tokens": 288866227.0, + "step": 11166 + }, + { + "epoch": 1.226334285086756, + "grad_norm": 1.9548282623291016, + "learning_rate": 5e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7318038940429688, + "num_tokens": 288891223.0, + "step": 11167 + }, + { + "epoch": 1.2264441027893696, + "grad_norm": 1.5238515138626099, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7233781814575195, + "num_tokens": 288927141.0, + "step": 11168 + }, + { + "epoch": 1.2265539204919833, + "grad_norm": 1.8695040941238403, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7481895685195923, + "num_tokens": 288952587.0, + "step": 11169 + }, + { + "epoch": 1.226663738194597, + "grad_norm": 1.7528250217437744, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7581532597541809, + "num_tokens": 288980450.0, + "step": 11170 + }, + { + "epoch": 1.2267735558972106, + "grad_norm": 1.8315553665161133, + "learning_rate": 5e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7396702766418457, + "num_tokens": 289006897.0, + "step": 11171 + }, + { + "epoch": 1.2268833735998244, + "grad_norm": 2.0163519382476807, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7343024015426636, + "num_tokens": 289030957.0, + "step": 11172 + }, + { + "epoch": 1.226993191302438, + "grad_norm": 1.9481433629989624, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7406129240989685, + "num_tokens": 289055588.0, + "step": 11173 + }, + { + "epoch": 1.2271030090050516, + "grad_norm": 1.7606548070907593, + "learning_rate": 5e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7477391958236694, + "num_tokens": 289083294.0, + "step": 11174 + }, + { + "epoch": 1.2272128267076652, + "grad_norm": 2.1994941234588623, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7452260255813599, + "num_tokens": 289106597.0, + "step": 11175 + }, + { + "epoch": 1.227322644410279, + "grad_norm": 2.227142333984375, + "learning_rate": 5e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7385513186454773, + "num_tokens": 289128413.0, + "step": 11176 + }, + { + "epoch": 1.2274324621128927, + "grad_norm": 1.7823461294174194, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7301437854766846, + "num_tokens": 289157056.0, + "step": 11177 + }, + { + "epoch": 1.2275422798155062, + "grad_norm": 2.025390625, + "learning_rate": 5e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7509920597076416, + "num_tokens": 289180325.0, + "step": 11178 + }, + { + "epoch": 1.22765209751812, + "grad_norm": 1.8494791984558105, + "learning_rate": 5e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7436972856521606, + "num_tokens": 289205763.0, + "step": 11179 + }, + { + "epoch": 1.2277619152207335, + "grad_norm": 1.6928828954696655, + "learning_rate": 5e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7070499658584595, + "num_tokens": 289236906.0, + "step": 11180 + }, + { + "epoch": 1.2278717329233473, + "grad_norm": 1.9491817951202393, + "learning_rate": 5e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.7511441111564636, + "num_tokens": 289258905.0, + "step": 11181 + }, + { + "epoch": 1.2279815506259608, + "grad_norm": 2.095935106277466, + "learning_rate": 5e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7221797704696655, + "num_tokens": 289280801.0, + "step": 11182 + }, + { + "epoch": 1.2280913683285746, + "grad_norm": 1.8747305870056152, + "learning_rate": 5e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7324684858322144, + "num_tokens": 289306955.0, + "step": 11183 + }, + { + "epoch": 1.228201186031188, + "grad_norm": 1.6146330833435059, + "learning_rate": 5e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7436389327049255, + "num_tokens": 289336222.0, + "step": 11184 + }, + { + "epoch": 1.2283110037338019, + "grad_norm": 1.7516371011734009, + "learning_rate": 5e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7316642999649048, + "num_tokens": 289363430.0, + "step": 11185 + }, + { + "epoch": 1.2284208214364156, + "grad_norm": 1.5759942531585693, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7418012022972107, + "num_tokens": 289393049.0, + "step": 11186 + }, + { + "epoch": 1.2285306391390292, + "grad_norm": 2.13092303276062, + "learning_rate": 5e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7475773096084595, + "num_tokens": 289411872.0, + "step": 11187 + }, + { + "epoch": 1.228640456841643, + "grad_norm": 2.0379996299743652, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7396721839904785, + "num_tokens": 289432855.0, + "step": 11188 + }, + { + "epoch": 1.2287502745442564, + "grad_norm": 1.8468595743179321, + "learning_rate": 5e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7409534454345703, + "num_tokens": 289458028.0, + "step": 11189 + }, + { + "epoch": 1.2288600922468702, + "grad_norm": 2.002224922180176, + "learning_rate": 5e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7389662265777588, + "num_tokens": 289480934.0, + "step": 11190 + }, + { + "epoch": 1.228969909949484, + "grad_norm": 2.0754246711730957, + "learning_rate": 5e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7168615460395813, + "num_tokens": 289506252.0, + "step": 11191 + }, + { + "epoch": 1.2290797276520975, + "grad_norm": 1.8274049758911133, + "learning_rate": 5e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7401309013366699, + "num_tokens": 289533166.0, + "step": 11192 + }, + { + "epoch": 1.2291895453547113, + "grad_norm": 1.898296594619751, + "learning_rate": 5e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7378999590873718, + "num_tokens": 289557047.0, + "step": 11193 + }, + { + "epoch": 1.2292993630573248, + "grad_norm": 2.348982810974121, + "learning_rate": 5e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7249943614006042, + "num_tokens": 289575904.0, + "step": 11194 + }, + { + "epoch": 1.2294091807599385, + "grad_norm": 1.791455626487732, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7472834587097168, + "num_tokens": 289603003.0, + "step": 11195 + }, + { + "epoch": 1.229518998462552, + "grad_norm": 1.8237481117248535, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7648531198501587, + "num_tokens": 289628729.0, + "step": 11196 + }, + { + "epoch": 1.2296288161651658, + "grad_norm": 1.7154687643051147, + "learning_rate": 5e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7037919759750366, + "num_tokens": 289658536.0, + "step": 11197 + }, + { + "epoch": 1.2297386338677794, + "grad_norm": 1.9273184537887573, + "learning_rate": 5e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7258145213127136, + "num_tokens": 289683996.0, + "step": 11198 + }, + { + "epoch": 1.2298484515703931, + "grad_norm": 1.7589846849441528, + "learning_rate": 5e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7371731996536255, + "num_tokens": 289712843.0, + "step": 11199 + }, + { + "epoch": 1.2299582692730069, + "grad_norm": 1.9664784669876099, + "learning_rate": 5e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7392369508743286, + "num_tokens": 289737022.0, + "step": 11200 + }, + { + "epoch": 1.2300680869756204, + "grad_norm": 1.859950304031372, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7275488376617432, + "num_tokens": 289762235.0, + "step": 11201 + }, + { + "epoch": 1.2301779046782342, + "grad_norm": 1.7912851572036743, + "learning_rate": 5e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7447766065597534, + "num_tokens": 289789738.0, + "step": 11202 + }, + { + "epoch": 1.2302877223808477, + "grad_norm": 1.8804399967193604, + "learning_rate": 5e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7322602272033691, + "num_tokens": 289815163.0, + "step": 11203 + }, + { + "epoch": 1.2303975400834615, + "grad_norm": 1.9285075664520264, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7365221381187439, + "num_tokens": 289842196.0, + "step": 11204 + }, + { + "epoch": 1.2305073577860752, + "grad_norm": 2.0551960468292236, + "learning_rate": 5e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7274088859558105, + "num_tokens": 289864300.0, + "step": 11205 + }, + { + "epoch": 1.2306171754886888, + "grad_norm": 1.8441001176834106, + "learning_rate": 5e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7142510414123535, + "num_tokens": 289897134.0, + "step": 11206 + }, + { + "epoch": 1.2307269931913025, + "grad_norm": 2.0830626487731934, + "learning_rate": 5e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7438884973526001, + "num_tokens": 289921358.0, + "step": 11207 + }, + { + "epoch": 1.230836810893916, + "grad_norm": 1.9695587158203125, + "learning_rate": 5e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7175289988517761, + "num_tokens": 289943959.0, + "step": 11208 + }, + { + "epoch": 1.2309466285965298, + "grad_norm": 2.184053897857666, + "learning_rate": 5e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7313278913497925, + "num_tokens": 289963697.0, + "step": 11209 + }, + { + "epoch": 1.2310564462991433, + "grad_norm": 2.0582361221313477, + "learning_rate": 5e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7412817478179932, + "num_tokens": 289988196.0, + "step": 11210 + }, + { + "epoch": 1.231166264001757, + "grad_norm": 2.0860202312469482, + "learning_rate": 5e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7287898659706116, + "num_tokens": 290010791.0, + "step": 11211 + }, + { + "epoch": 1.2312760817043706, + "grad_norm": 1.8399176597595215, + "learning_rate": 5e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7180591821670532, + "num_tokens": 290038386.0, + "step": 11212 + }, + { + "epoch": 1.2313858994069844, + "grad_norm": 1.94405996799469, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7352319359779358, + "num_tokens": 290060772.0, + "step": 11213 + }, + { + "epoch": 1.2314957171095982, + "grad_norm": 1.8192009925842285, + "learning_rate": 5e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7236038446426392, + "num_tokens": 290088999.0, + "step": 11214 + }, + { + "epoch": 1.2316055348122117, + "grad_norm": 1.911973237991333, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7185933589935303, + "num_tokens": 290115190.0, + "step": 11215 + }, + { + "epoch": 1.2317153525148254, + "grad_norm": 1.9096261262893677, + "learning_rate": 5e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.7369738817214966, + "num_tokens": 290139496.0, + "step": 11216 + }, + { + "epoch": 1.231825170217439, + "grad_norm": 2.133136510848999, + "learning_rate": 5e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7355977296829224, + "num_tokens": 290161261.0, + "step": 11217 + }, + { + "epoch": 1.2319349879200527, + "grad_norm": 1.8864325284957886, + "learning_rate": 5e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7548260688781738, + "num_tokens": 290186204.0, + "step": 11218 + }, + { + "epoch": 1.2320448056226665, + "grad_norm": 2.1365952491760254, + "learning_rate": 5e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7448714971542358, + "num_tokens": 290208501.0, + "step": 11219 + }, + { + "epoch": 1.23215462332528, + "grad_norm": 1.8285093307495117, + "learning_rate": 5e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7252419590950012, + "num_tokens": 290236469.0, + "step": 11220 + }, + { + "epoch": 1.2322644410278938, + "grad_norm": 1.823881983757019, + "learning_rate": 5e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7230143547058105, + "num_tokens": 290266086.0, + "step": 11221 + }, + { + "epoch": 1.2323742587305073, + "grad_norm": 1.7671574354171753, + "learning_rate": 5e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7374061346054077, + "num_tokens": 290294814.0, + "step": 11222 + }, + { + "epoch": 1.232484076433121, + "grad_norm": 1.8342809677124023, + "learning_rate": 5e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7224375009536743, + "num_tokens": 290323189.0, + "step": 11223 + }, + { + "epoch": 1.2325938941357346, + "grad_norm": 1.699406385421753, + "learning_rate": 5e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7279857993125916, + "num_tokens": 290356770.0, + "step": 11224 + }, + { + "epoch": 1.2327037118383484, + "grad_norm": 1.797006607055664, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.717552661895752, + "num_tokens": 290384938.0, + "step": 11225 + }, + { + "epoch": 1.232813529540962, + "grad_norm": 1.598771572113037, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7315115332603455, + "num_tokens": 290418014.0, + "step": 11226 + }, + { + "epoch": 1.2329233472435757, + "grad_norm": 1.89174222946167, + "learning_rate": 5e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7193185091018677, + "num_tokens": 290443352.0, + "step": 11227 + }, + { + "epoch": 1.2330331649461894, + "grad_norm": 2.135378360748291, + "learning_rate": 5e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.7372761964797974, + "num_tokens": 290464306.0, + "step": 11228 + }, + { + "epoch": 1.233142982648803, + "grad_norm": 1.9731560945510864, + "learning_rate": 5e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7536294460296631, + "num_tokens": 290484290.0, + "step": 11229 + }, + { + "epoch": 1.2332528003514167, + "grad_norm": 1.710549235343933, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7286125421524048, + "num_tokens": 290515662.0, + "step": 11230 + }, + { + "epoch": 1.2333626180540302, + "grad_norm": 1.748887538909912, + "learning_rate": 5e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7287982702255249, + "num_tokens": 290544316.0, + "step": 11231 + }, + { + "epoch": 1.233472435756644, + "grad_norm": 1.8774449825286865, + "learning_rate": 5e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.72894287109375, + "num_tokens": 290570960.0, + "step": 11232 + }, + { + "epoch": 1.2335822534592575, + "grad_norm": 1.8984354734420776, + "learning_rate": 5e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.735098123550415, + "num_tokens": 290596249.0, + "step": 11233 + }, + { + "epoch": 1.2336920711618713, + "grad_norm": 1.9749996662139893, + "learning_rate": 5e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7315948605537415, + "num_tokens": 290621959.0, + "step": 11234 + }, + { + "epoch": 1.2338018888644848, + "grad_norm": 1.590170979499817, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7198029160499573, + "num_tokens": 290656351.0, + "step": 11235 + }, + { + "epoch": 1.2339117065670986, + "grad_norm": 1.722139835357666, + "learning_rate": 5e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.733371376991272, + "num_tokens": 290682411.0, + "step": 11236 + }, + { + "epoch": 1.2340215242697123, + "grad_norm": 1.8491570949554443, + "learning_rate": 5e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7291053533554077, + "num_tokens": 290708725.0, + "step": 11237 + }, + { + "epoch": 1.2341313419723259, + "grad_norm": 1.906962513923645, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7289829254150391, + "num_tokens": 290735338.0, + "step": 11238 + }, + { + "epoch": 1.2342411596749396, + "grad_norm": 1.9969360828399658, + "learning_rate": 5e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7430275082588196, + "num_tokens": 290758200.0, + "step": 11239 + }, + { + "epoch": 1.2343509773775532, + "grad_norm": 2.101421594619751, + "learning_rate": 5e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7487053871154785, + "num_tokens": 290777678.0, + "step": 11240 + }, + { + "epoch": 1.234460795080167, + "grad_norm": 1.7902354001998901, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7169976234436035, + "num_tokens": 290806719.0, + "step": 11241 + }, + { + "epoch": 1.2345706127827807, + "grad_norm": 1.7963136434555054, + "learning_rate": 5e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7335429787635803, + "num_tokens": 290832087.0, + "step": 11242 + }, + { + "epoch": 1.2346804304853942, + "grad_norm": 1.6815894842147827, + "learning_rate": 5e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7315738797187805, + "num_tokens": 290863636.0, + "step": 11243 + }, + { + "epoch": 1.234790248188008, + "grad_norm": 1.6375072002410889, + "learning_rate": 5e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7416954636573792, + "num_tokens": 290895691.0, + "step": 11244 + }, + { + "epoch": 1.2349000658906215, + "grad_norm": 1.893104076385498, + "learning_rate": 5e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.740685760974884, + "num_tokens": 290920033.0, + "step": 11245 + }, + { + "epoch": 1.2350098835932353, + "grad_norm": 1.903403401374817, + "learning_rate": 5e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7334012985229492, + "num_tokens": 290943647.0, + "step": 11246 + }, + { + "epoch": 1.2351197012958488, + "grad_norm": 1.9679127931594849, + "learning_rate": 5e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7403920888900757, + "num_tokens": 290965465.0, + "step": 11247 + }, + { + "epoch": 1.2352295189984626, + "grad_norm": 1.7379176616668701, + "learning_rate": 5e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7430877685546875, + "num_tokens": 290992341.0, + "step": 11248 + }, + { + "epoch": 1.235339336701076, + "grad_norm": 1.6561740636825562, + "learning_rate": 5e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7352916598320007, + "num_tokens": 291024943.0, + "step": 11249 + }, + { + "epoch": 1.2354491544036899, + "grad_norm": 2.0299766063690186, + "learning_rate": 5e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.737802267074585, + "num_tokens": 291048374.0, + "step": 11250 + }, + { + "epoch": 1.2355589721063036, + "grad_norm": 1.917083978652954, + "learning_rate": 5e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7314684391021729, + "num_tokens": 291074639.0, + "step": 11251 + }, + { + "epoch": 1.2356687898089171, + "grad_norm": 1.680214762687683, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.725223183631897, + "num_tokens": 291106468.0, + "step": 11252 + }, + { + "epoch": 1.235778607511531, + "grad_norm": 1.8915163278579712, + "learning_rate": 5e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7415647506713867, + "num_tokens": 291129313.0, + "step": 11253 + }, + { + "epoch": 1.2358884252141444, + "grad_norm": 1.9060935974121094, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7263103723526001, + "num_tokens": 291156495.0, + "step": 11254 + }, + { + "epoch": 1.2359982429167582, + "grad_norm": 1.8403329849243164, + "learning_rate": 5e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.746192216873169, + "num_tokens": 291180776.0, + "step": 11255 + }, + { + "epoch": 1.236108060619372, + "grad_norm": 1.638477087020874, + "learning_rate": 5e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7106539011001587, + "num_tokens": 291215177.0, + "step": 11256 + }, + { + "epoch": 1.2362178783219855, + "grad_norm": 1.8728736639022827, + "learning_rate": 5e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7340433597564697, + "num_tokens": 291239221.0, + "step": 11257 + }, + { + "epoch": 1.2363276960245992, + "grad_norm": 1.804550051689148, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7415527701377869, + "num_tokens": 291265553.0, + "step": 11258 + }, + { + "epoch": 1.2364375137272128, + "grad_norm": 2.000617742538452, + "learning_rate": 5e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7396472692489624, + "num_tokens": 291286226.0, + "step": 11259 + }, + { + "epoch": 1.2365473314298265, + "grad_norm": 1.8704713582992554, + "learning_rate": 5e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7493382692337036, + "num_tokens": 291309076.0, + "step": 11260 + }, + { + "epoch": 1.23665714913244, + "grad_norm": 1.8889005184173584, + "learning_rate": 5e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7527850866317749, + "num_tokens": 291332865.0, + "step": 11261 + }, + { + "epoch": 1.2367669668350538, + "grad_norm": 1.791535496711731, + "learning_rate": 5e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7419379949569702, + "num_tokens": 291358840.0, + "step": 11262 + }, + { + "epoch": 1.2368767845376674, + "grad_norm": 1.7647278308868408, + "learning_rate": 5e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7072404623031616, + "num_tokens": 291387408.0, + "step": 11263 + }, + { + "epoch": 1.2369866022402811, + "grad_norm": 1.5778987407684326, + "learning_rate": 5e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7183492183685303, + "num_tokens": 291422555.0, + "step": 11264 + }, + { + "epoch": 1.2370964199428949, + "grad_norm": 1.9846534729003906, + "learning_rate": 5e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7270109057426453, + "num_tokens": 291444621.0, + "step": 11265 + }, + { + "epoch": 1.2372062376455084, + "grad_norm": 1.5252467393875122, + "learning_rate": 5e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7321179509162903, + "num_tokens": 291480366.0, + "step": 11266 + }, + { + "epoch": 1.2373160553481222, + "grad_norm": 1.9344003200531006, + "learning_rate": 5e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7145161032676697, + "num_tokens": 291507888.0, + "step": 11267 + }, + { + "epoch": 1.2374258730507357, + "grad_norm": 1.8730137348175049, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7336763143539429, + "num_tokens": 291536419.0, + "step": 11268 + }, + { + "epoch": 1.2375356907533495, + "grad_norm": 1.726349949836731, + "learning_rate": 5e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7342535257339478, + "num_tokens": 291569479.0, + "step": 11269 + }, + { + "epoch": 1.2376455084559632, + "grad_norm": 1.9416266679763794, + "learning_rate": 5e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7436001300811768, + "num_tokens": 291591727.0, + "step": 11270 + }, + { + "epoch": 1.2377553261585768, + "grad_norm": 1.705126166343689, + "learning_rate": 5e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7105938792228699, + "num_tokens": 291624380.0, + "step": 11271 + }, + { + "epoch": 1.2378651438611905, + "grad_norm": 1.6503568887710571, + "learning_rate": 5e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.716755747795105, + "num_tokens": 291657677.0, + "step": 11272 + }, + { + "epoch": 1.237974961563804, + "grad_norm": 1.9820302724838257, + "learning_rate": 5e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7256150841712952, + "num_tokens": 291682084.0, + "step": 11273 + }, + { + "epoch": 1.2380847792664178, + "grad_norm": 1.7502636909484863, + "learning_rate": 5e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7204324007034302, + "num_tokens": 291713717.0, + "step": 11274 + }, + { + "epoch": 1.2381945969690313, + "grad_norm": 1.821304440498352, + "learning_rate": 5e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7316458225250244, + "num_tokens": 291741354.0, + "step": 11275 + }, + { + "epoch": 1.238304414671645, + "grad_norm": 1.820669174194336, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7118176221847534, + "num_tokens": 291769006.0, + "step": 11276 + }, + { + "epoch": 1.2384142323742586, + "grad_norm": 1.932636022567749, + "learning_rate": 5e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7356868982315063, + "num_tokens": 291792744.0, + "step": 11277 + }, + { + "epoch": 1.2385240500768724, + "grad_norm": 1.7694865465164185, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7330994009971619, + "num_tokens": 291820849.0, + "step": 11278 + }, + { + "epoch": 1.2386338677794861, + "grad_norm": 1.7294938564300537, + "learning_rate": 5e-06, + "loss": 0.897, + "mean_token_accuracy": 0.713337242603302, + "num_tokens": 291856753.0, + "step": 11279 + }, + { + "epoch": 1.2387436854820997, + "grad_norm": 1.7776222229003906, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7228859663009644, + "num_tokens": 291884542.0, + "step": 11280 + }, + { + "epoch": 1.2388535031847134, + "grad_norm": 1.8046633005142212, + "learning_rate": 5e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.735375165939331, + "num_tokens": 291910015.0, + "step": 11281 + }, + { + "epoch": 1.238963320887327, + "grad_norm": 1.9225828647613525, + "learning_rate": 5e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7447826862335205, + "num_tokens": 291933991.0, + "step": 11282 + }, + { + "epoch": 1.2390731385899407, + "grad_norm": 1.9985811710357666, + "learning_rate": 5e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7391647100448608, + "num_tokens": 291955923.0, + "step": 11283 + }, + { + "epoch": 1.2391829562925545, + "grad_norm": 1.9701299667358398, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7231519222259521, + "num_tokens": 291980898.0, + "step": 11284 + }, + { + "epoch": 1.239292773995168, + "grad_norm": 2.1172940731048584, + "learning_rate": 5e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7239522933959961, + "num_tokens": 292002442.0, + "step": 11285 + }, + { + "epoch": 1.2394025916977818, + "grad_norm": 1.4792544841766357, + "learning_rate": 5e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7474746704101562, + "num_tokens": 292037904.0, + "step": 11286 + }, + { + "epoch": 1.2395124094003953, + "grad_norm": 1.6804105043411255, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.719985842704773, + "num_tokens": 292070188.0, + "step": 11287 + }, + { + "epoch": 1.239622227103009, + "grad_norm": 1.9594290256500244, + "learning_rate": 5e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7395831346511841, + "num_tokens": 292093303.0, + "step": 11288 + }, + { + "epoch": 1.2397320448056226, + "grad_norm": 1.8381450176239014, + "learning_rate": 5e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7202451229095459, + "num_tokens": 292120327.0, + "step": 11289 + }, + { + "epoch": 1.2398418625082364, + "grad_norm": 1.801764965057373, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.735527515411377, + "num_tokens": 292148169.0, + "step": 11290 + }, + { + "epoch": 1.23995168021085, + "grad_norm": 1.8565255403518677, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7339562177658081, + "num_tokens": 292173593.0, + "step": 11291 + }, + { + "epoch": 1.2400614979134637, + "grad_norm": 1.9887486696243286, + "learning_rate": 5e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7124365568161011, + "num_tokens": 292197419.0, + "step": 11292 + }, + { + "epoch": 1.2401713156160774, + "grad_norm": 1.911880612373352, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7411046028137207, + "num_tokens": 292221499.0, + "step": 11293 + }, + { + "epoch": 1.240281133318691, + "grad_norm": 1.963654637336731, + "learning_rate": 5e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7182397842407227, + "num_tokens": 292246974.0, + "step": 11294 + }, + { + "epoch": 1.2403909510213047, + "grad_norm": 1.8879581689834595, + "learning_rate": 5e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7167810201644897, + "num_tokens": 292275982.0, + "step": 11295 + }, + { + "epoch": 1.2405007687239182, + "grad_norm": 1.7814619541168213, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7382247447967529, + "num_tokens": 292303936.0, + "step": 11296 + }, + { + "epoch": 1.240610586426532, + "grad_norm": 1.5874576568603516, + "learning_rate": 5e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7341711521148682, + "num_tokens": 292336751.0, + "step": 11297 + }, + { + "epoch": 1.2407204041291455, + "grad_norm": 1.7380123138427734, + "learning_rate": 5e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7266710996627808, + "num_tokens": 292367968.0, + "step": 11298 + }, + { + "epoch": 1.2408302218317593, + "grad_norm": 1.6259818077087402, + "learning_rate": 5e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7012912034988403, + "num_tokens": 292402562.0, + "step": 11299 + }, + { + "epoch": 1.2409400395343728, + "grad_norm": 1.772405743598938, + "learning_rate": 5e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.729060173034668, + "num_tokens": 292430718.0, + "step": 11300 + }, + { + "epoch": 1.2410498572369866, + "grad_norm": 1.8739712238311768, + "learning_rate": 5e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7199612259864807, + "num_tokens": 292456090.0, + "step": 11301 + }, + { + "epoch": 1.2411596749396003, + "grad_norm": 1.8674613237380981, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7494531273841858, + "num_tokens": 292482051.0, + "step": 11302 + }, + { + "epoch": 1.2412694926422139, + "grad_norm": 2.3043975830078125, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7309515476226807, + "num_tokens": 292502184.0, + "step": 11303 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 1.8808726072311401, + "learning_rate": 5e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7265444993972778, + "num_tokens": 292529035.0, + "step": 11304 + }, + { + "epoch": 1.2414891280474412, + "grad_norm": 1.868377923965454, + "learning_rate": 5e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7276977300643921, + "num_tokens": 292555711.0, + "step": 11305 + }, + { + "epoch": 1.241598945750055, + "grad_norm": 1.9088424444198608, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7357253432273865, + "num_tokens": 292583093.0, + "step": 11306 + }, + { + "epoch": 1.2417087634526687, + "grad_norm": 1.6538068056106567, + "learning_rate": 5e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7291169762611389, + "num_tokens": 292613581.0, + "step": 11307 + }, + { + "epoch": 1.2418185811552822, + "grad_norm": 1.6382231712341309, + "learning_rate": 5e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7377380132675171, + "num_tokens": 292644846.0, + "step": 11308 + }, + { + "epoch": 1.241928398857896, + "grad_norm": 2.0235934257507324, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7388375401496887, + "num_tokens": 292666496.0, + "step": 11309 + }, + { + "epoch": 1.2420382165605095, + "grad_norm": 2.0652453899383545, + "learning_rate": 5e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7495574951171875, + "num_tokens": 292687532.0, + "step": 11310 + }, + { + "epoch": 1.2421480342631233, + "grad_norm": 1.8405128717422485, + "learning_rate": 5e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7308932542800903, + "num_tokens": 292713630.0, + "step": 11311 + }, + { + "epoch": 1.2422578519657368, + "grad_norm": 1.9718587398529053, + "learning_rate": 5e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7248825430870056, + "num_tokens": 292736480.0, + "step": 11312 + }, + { + "epoch": 1.2423676696683505, + "grad_norm": 1.993895173072815, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7310529947280884, + "num_tokens": 292761687.0, + "step": 11313 + }, + { + "epoch": 1.242477487370964, + "grad_norm": 1.8781523704528809, + "learning_rate": 5e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7164860367774963, + "num_tokens": 292789197.0, + "step": 11314 + }, + { + "epoch": 1.2425873050735778, + "grad_norm": 1.9325921535491943, + "learning_rate": 5e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7246696949005127, + "num_tokens": 292815418.0, + "step": 11315 + }, + { + "epoch": 1.2426971227761916, + "grad_norm": 2.1280791759490967, + "learning_rate": 5e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7484094500541687, + "num_tokens": 292837531.0, + "step": 11316 + }, + { + "epoch": 1.2428069404788051, + "grad_norm": 1.822520136833191, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7172785401344299, + "num_tokens": 292862635.0, + "step": 11317 + }, + { + "epoch": 1.242916758181419, + "grad_norm": 1.7360289096832275, + "learning_rate": 5e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7371963262557983, + "num_tokens": 292890386.0, + "step": 11318 + }, + { + "epoch": 1.2430265758840324, + "grad_norm": 1.7944409847259521, + "learning_rate": 5e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7340048551559448, + "num_tokens": 292918353.0, + "step": 11319 + }, + { + "epoch": 1.2431363935866462, + "grad_norm": 1.8308039903640747, + "learning_rate": 5e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7318020462989807, + "num_tokens": 292943695.0, + "step": 11320 + }, + { + "epoch": 1.24324621128926, + "grad_norm": 1.6204681396484375, + "learning_rate": 5e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7261431813240051, + "num_tokens": 292973867.0, + "step": 11321 + }, + { + "epoch": 1.2433560289918735, + "grad_norm": 1.6464269161224365, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7328787446022034, + "num_tokens": 293004479.0, + "step": 11322 + }, + { + "epoch": 1.2434658466944872, + "grad_norm": 1.8391014337539673, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.731913685798645, + "num_tokens": 293032193.0, + "step": 11323 + }, + { + "epoch": 1.2435756643971008, + "grad_norm": 1.8833062648773193, + "learning_rate": 5e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7148474454879761, + "num_tokens": 293058090.0, + "step": 11324 + }, + { + "epoch": 1.2436854820997145, + "grad_norm": 1.805609941482544, + "learning_rate": 5e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7452687621116638, + "num_tokens": 293083550.0, + "step": 11325 + }, + { + "epoch": 1.243795299802328, + "grad_norm": 1.7828595638275146, + "learning_rate": 5e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7153003215789795, + "num_tokens": 293112362.0, + "step": 11326 + }, + { + "epoch": 1.2439051175049418, + "grad_norm": 2.285811185836792, + "learning_rate": 5e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7355501651763916, + "num_tokens": 293131946.0, + "step": 11327 + }, + { + "epoch": 1.2440149352075554, + "grad_norm": 2.2012274265289307, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.752123236656189, + "num_tokens": 293151765.0, + "step": 11328 + }, + { + "epoch": 1.244124752910169, + "grad_norm": 1.606853723526001, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7356735467910767, + "num_tokens": 293184298.0, + "step": 11329 + }, + { + "epoch": 1.2442345706127829, + "grad_norm": 1.8900123834609985, + "learning_rate": 5e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.720090389251709, + "num_tokens": 293209229.0, + "step": 11330 + }, + { + "epoch": 1.2443443883153964, + "grad_norm": 1.9488766193389893, + "learning_rate": 5e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7398127317428589, + "num_tokens": 293234331.0, + "step": 11331 + }, + { + "epoch": 1.2444542060180102, + "grad_norm": 1.787027359008789, + "learning_rate": 5e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7272834777832031, + "num_tokens": 293262421.0, + "step": 11332 + }, + { + "epoch": 1.2445640237206237, + "grad_norm": 1.8153634071350098, + "learning_rate": 5e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7457846403121948, + "num_tokens": 293288423.0, + "step": 11333 + }, + { + "epoch": 1.2446738414232374, + "grad_norm": 1.872006893157959, + "learning_rate": 5e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7419793605804443, + "num_tokens": 293311541.0, + "step": 11334 + }, + { + "epoch": 1.2447836591258512, + "grad_norm": 1.7565646171569824, + "learning_rate": 5e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7392160296440125, + "num_tokens": 293338746.0, + "step": 11335 + }, + { + "epoch": 1.2448934768284647, + "grad_norm": 1.8932044506072998, + "learning_rate": 5e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7437382936477661, + "num_tokens": 293363552.0, + "step": 11336 + }, + { + "epoch": 1.2450032945310785, + "grad_norm": 1.8487497568130493, + "learning_rate": 5e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7321101427078247, + "num_tokens": 293388276.0, + "step": 11337 + }, + { + "epoch": 1.245113112233692, + "grad_norm": 1.7891448736190796, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.739692747592926, + "num_tokens": 293416418.0, + "step": 11338 + }, + { + "epoch": 1.2452229299363058, + "grad_norm": 1.9046210050582886, + "learning_rate": 5e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7247086763381958, + "num_tokens": 293444359.0, + "step": 11339 + }, + { + "epoch": 1.2453327476389193, + "grad_norm": 1.8310269117355347, + "learning_rate": 5e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7056748867034912, + "num_tokens": 293474182.0, + "step": 11340 + }, + { + "epoch": 1.245442565341533, + "grad_norm": 2.1578288078308105, + "learning_rate": 5e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.755782425403595, + "num_tokens": 293493141.0, + "step": 11341 + }, + { + "epoch": 1.2455523830441466, + "grad_norm": 2.0691206455230713, + "learning_rate": 5e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7164491415023804, + "num_tokens": 293516430.0, + "step": 11342 + }, + { + "epoch": 1.2456622007467604, + "grad_norm": 1.8410024642944336, + "learning_rate": 5e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7307568788528442, + "num_tokens": 293542606.0, + "step": 11343 + }, + { + "epoch": 1.2457720184493741, + "grad_norm": 1.9432835578918457, + "learning_rate": 5e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7172987461090088, + "num_tokens": 293566592.0, + "step": 11344 + }, + { + "epoch": 1.2458818361519877, + "grad_norm": 2.2311899662017822, + "learning_rate": 5e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.753258466720581, + "num_tokens": 293585509.0, + "step": 11345 + }, + { + "epoch": 1.2459916538546014, + "grad_norm": 2.046477794647217, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7265833616256714, + "num_tokens": 293609480.0, + "step": 11346 + }, + { + "epoch": 1.246101471557215, + "grad_norm": 1.9149856567382812, + "learning_rate": 5e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7282860279083252, + "num_tokens": 293636624.0, + "step": 11347 + }, + { + "epoch": 1.2462112892598287, + "grad_norm": 1.9807554483413696, + "learning_rate": 5e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7271971702575684, + "num_tokens": 293660549.0, + "step": 11348 + }, + { + "epoch": 1.2463211069624422, + "grad_norm": 1.5786885023117065, + "learning_rate": 5e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7325191497802734, + "num_tokens": 293695619.0, + "step": 11349 + }, + { + "epoch": 1.246430924665056, + "grad_norm": 2.1072440147399902, + "learning_rate": 5e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7573079466819763, + "num_tokens": 293715494.0, + "step": 11350 + }, + { + "epoch": 1.2465407423676698, + "grad_norm": 1.8576998710632324, + "learning_rate": 5e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.729310154914856, + "num_tokens": 293740990.0, + "step": 11351 + }, + { + "epoch": 1.2466505600702833, + "grad_norm": 2.012075185775757, + "learning_rate": 5e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7281304597854614, + "num_tokens": 293764363.0, + "step": 11352 + }, + { + "epoch": 1.246760377772897, + "grad_norm": 1.9550057649612427, + "learning_rate": 5e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7232420444488525, + "num_tokens": 293790764.0, + "step": 11353 + }, + { + "epoch": 1.2468701954755106, + "grad_norm": 1.7807694673538208, + "learning_rate": 5e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7170884609222412, + "num_tokens": 293820028.0, + "step": 11354 + }, + { + "epoch": 1.2469800131781243, + "grad_norm": 1.9542659521102905, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.749244213104248, + "num_tokens": 293844515.0, + "step": 11355 + }, + { + "epoch": 1.2470898308807379, + "grad_norm": 1.7625532150268555, + "learning_rate": 5e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7548558115959167, + "num_tokens": 293869688.0, + "step": 11356 + }, + { + "epoch": 1.2471996485833516, + "grad_norm": 1.9597370624542236, + "learning_rate": 5e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7164989709854126, + "num_tokens": 293897254.0, + "step": 11357 + }, + { + "epoch": 1.2473094662859654, + "grad_norm": 1.9868180751800537, + "learning_rate": 5e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7286739945411682, + "num_tokens": 293920224.0, + "step": 11358 + }, + { + "epoch": 1.247419283988579, + "grad_norm": 1.7785770893096924, + "learning_rate": 5e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7336036562919617, + "num_tokens": 293947624.0, + "step": 11359 + }, + { + "epoch": 1.2475291016911927, + "grad_norm": 2.0652761459350586, + "learning_rate": 5e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7178352475166321, + "num_tokens": 293970712.0, + "step": 11360 + }, + { + "epoch": 1.2476389193938062, + "grad_norm": 2.3244152069091797, + "learning_rate": 5e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7365946769714355, + "num_tokens": 293990021.0, + "step": 11361 + }, + { + "epoch": 1.24774873709642, + "grad_norm": 1.9092179536819458, + "learning_rate": 5e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7486348152160645, + "num_tokens": 294014082.0, + "step": 11362 + }, + { + "epoch": 1.2478585547990335, + "grad_norm": 2.2027060985565186, + "learning_rate": 5e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.737673282623291, + "num_tokens": 294034314.0, + "step": 11363 + }, + { + "epoch": 1.2479683725016473, + "grad_norm": 1.7580550909042358, + "learning_rate": 5e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7318013906478882, + "num_tokens": 294063178.0, + "step": 11364 + }, + { + "epoch": 1.2480781902042608, + "grad_norm": 1.7835588455200195, + "learning_rate": 5e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7139309644699097, + "num_tokens": 294090696.0, + "step": 11365 + }, + { + "epoch": 1.2481880079068746, + "grad_norm": 1.9774022102355957, + "learning_rate": 5e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.746722936630249, + "num_tokens": 294113971.0, + "step": 11366 + }, + { + "epoch": 1.2482978256094883, + "grad_norm": 1.9655110836029053, + "learning_rate": 5e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7194398641586304, + "num_tokens": 294138846.0, + "step": 11367 + }, + { + "epoch": 1.2484076433121019, + "grad_norm": 2.1565234661102295, + "learning_rate": 5e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7331753969192505, + "num_tokens": 294160447.0, + "step": 11368 + }, + { + "epoch": 1.2485174610147156, + "grad_norm": 2.072134256362915, + "learning_rate": 5e-06, + "loss": 0.834, + "mean_token_accuracy": 0.735967755317688, + "num_tokens": 294182714.0, + "step": 11369 + }, + { + "epoch": 1.2486272787173291, + "grad_norm": 1.736238956451416, + "learning_rate": 5e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7193724513053894, + "num_tokens": 294211604.0, + "step": 11370 + }, + { + "epoch": 1.248737096419943, + "grad_norm": 1.7377333641052246, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7394348382949829, + "num_tokens": 294240237.0, + "step": 11371 + }, + { + "epoch": 1.2488469141225567, + "grad_norm": 2.1430721282958984, + "learning_rate": 5e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7306616306304932, + "num_tokens": 294266305.0, + "step": 11372 + }, + { + "epoch": 1.2489567318251702, + "grad_norm": 1.7793453931808472, + "learning_rate": 5e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7237160205841064, + "num_tokens": 294297708.0, + "step": 11373 + }, + { + "epoch": 1.249066549527784, + "grad_norm": 1.7300060987472534, + "learning_rate": 5e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7199971675872803, + "num_tokens": 294328600.0, + "step": 11374 + }, + { + "epoch": 1.2491763672303975, + "grad_norm": 2.0516202449798584, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7671114206314087, + "num_tokens": 294348526.0, + "step": 11375 + }, + { + "epoch": 1.2492861849330112, + "grad_norm": 1.8236713409423828, + "learning_rate": 5e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7207039594650269, + "num_tokens": 294376964.0, + "step": 11376 + }, + { + "epoch": 1.2493960026356248, + "grad_norm": 2.1178393363952637, + "learning_rate": 5e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7500778436660767, + "num_tokens": 294397993.0, + "step": 11377 + }, + { + "epoch": 1.2495058203382385, + "grad_norm": 2.1718294620513916, + "learning_rate": 5e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7492327690124512, + "num_tokens": 294419836.0, + "step": 11378 + }, + { + "epoch": 1.249615638040852, + "grad_norm": 1.9261271953582764, + "learning_rate": 5e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.736492395401001, + "num_tokens": 294443024.0, + "step": 11379 + }, + { + "epoch": 1.2497254557434658, + "grad_norm": 1.9595935344696045, + "learning_rate": 5e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.737058162689209, + "num_tokens": 294466474.0, + "step": 11380 + }, + { + "epoch": 1.2498352734460796, + "grad_norm": 1.6946465969085693, + "learning_rate": 5e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7589312195777893, + "num_tokens": 294494401.0, + "step": 11381 + }, + { + "epoch": 1.2499450911486931, + "grad_norm": 1.7703778743743896, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7239673137664795, + "num_tokens": 294522937.0, + "step": 11382 + }, + { + "epoch": 1.2500549088513069, + "grad_norm": 1.739977240562439, + "learning_rate": 5e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7342725992202759, + "num_tokens": 294553088.0, + "step": 11383 + }, + { + "epoch": 1.2501647265539204, + "grad_norm": 1.7038686275482178, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7164332270622253, + "num_tokens": 294584993.0, + "step": 11384 + }, + { + "epoch": 1.2502745442565342, + "grad_norm": 1.9678927659988403, + "learning_rate": 5e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7266502380371094, + "num_tokens": 294609494.0, + "step": 11385 + }, + { + "epoch": 1.250384361959148, + "grad_norm": 1.828668236732483, + "learning_rate": 5e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.7449101805686951, + "num_tokens": 294635042.0, + "step": 11386 + }, + { + "epoch": 1.2504941796617615, + "grad_norm": 1.712170958518982, + "learning_rate": 5e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7531972527503967, + "num_tokens": 294663078.0, + "step": 11387 + }, + { + "epoch": 1.2506039973643752, + "grad_norm": 1.7987052202224731, + "learning_rate": 5e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7591288089752197, + "num_tokens": 294688913.0, + "step": 11388 + }, + { + "epoch": 1.2507138150669888, + "grad_norm": 1.9413738250732422, + "learning_rate": 5e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.744385838508606, + "num_tokens": 294713655.0, + "step": 11389 + }, + { + "epoch": 1.2508236327696025, + "grad_norm": 1.929835557937622, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7379508018493652, + "num_tokens": 294738753.0, + "step": 11390 + }, + { + "epoch": 1.250933450472216, + "grad_norm": 2.1801393032073975, + "learning_rate": 5e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7424716353416443, + "num_tokens": 294759631.0, + "step": 11391 + }, + { + "epoch": 1.2510432681748298, + "grad_norm": 2.00647234916687, + "learning_rate": 5e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7143222689628601, + "num_tokens": 294783153.0, + "step": 11392 + }, + { + "epoch": 1.2511530858774433, + "grad_norm": 1.9940382242202759, + "learning_rate": 5e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7350971698760986, + "num_tokens": 294805319.0, + "step": 11393 + }, + { + "epoch": 1.251262903580057, + "grad_norm": 1.5655821561813354, + "learning_rate": 5e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7200507521629333, + "num_tokens": 294838043.0, + "step": 11394 + }, + { + "epoch": 1.2513727212826709, + "grad_norm": 1.6965701580047607, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.732090950012207, + "num_tokens": 294868707.0, + "step": 11395 + }, + { + "epoch": 1.2514825389852844, + "grad_norm": 1.8733357191085815, + "learning_rate": 5e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7259325981140137, + "num_tokens": 294893835.0, + "step": 11396 + }, + { + "epoch": 1.2515923566878981, + "grad_norm": 1.9055688381195068, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7408201694488525, + "num_tokens": 294918289.0, + "step": 11397 + }, + { + "epoch": 1.2517021743905117, + "grad_norm": 1.9464969635009766, + "learning_rate": 5e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7193411588668823, + "num_tokens": 294943441.0, + "step": 11398 + }, + { + "epoch": 1.2518119920931254, + "grad_norm": 1.938286304473877, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7410149574279785, + "num_tokens": 294967594.0, + "step": 11399 + }, + { + "epoch": 1.2519218097957392, + "grad_norm": 1.8660004138946533, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7345266938209534, + "num_tokens": 294992078.0, + "step": 11400 + }, + { + "epoch": 1.2520316274983527, + "grad_norm": 1.7422215938568115, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7198095321655273, + "num_tokens": 295022479.0, + "step": 11401 + }, + { + "epoch": 1.2521414452009663, + "grad_norm": 1.8711947202682495, + "learning_rate": 5e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7327554821968079, + "num_tokens": 295048259.0, + "step": 11402 + }, + { + "epoch": 1.25225126290358, + "grad_norm": 2.316728115081787, + "learning_rate": 5e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7503129243850708, + "num_tokens": 295065517.0, + "step": 11403 + }, + { + "epoch": 1.2523610806061938, + "grad_norm": 2.0118391513824463, + "learning_rate": 5e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7533429861068726, + "num_tokens": 295086831.0, + "step": 11404 + }, + { + "epoch": 1.2524708983088073, + "grad_norm": 1.7472227811813354, + "learning_rate": 5e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.713998556137085, + "num_tokens": 295118229.0, + "step": 11405 + }, + { + "epoch": 1.252580716011421, + "grad_norm": 1.8369137048721313, + "learning_rate": 5e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7474887371063232, + "num_tokens": 295143711.0, + "step": 11406 + }, + { + "epoch": 1.2526905337140346, + "grad_norm": 1.9084609746932983, + "learning_rate": 5e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7389780282974243, + "num_tokens": 295167128.0, + "step": 11407 + }, + { + "epoch": 1.2528003514166484, + "grad_norm": 1.8490537405014038, + "learning_rate": 5e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7125535607337952, + "num_tokens": 295198270.0, + "step": 11408 + }, + { + "epoch": 1.2529101691192621, + "grad_norm": 1.8561540842056274, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7384231686592102, + "num_tokens": 295224632.0, + "step": 11409 + }, + { + "epoch": 1.2530199868218757, + "grad_norm": 2.0842998027801514, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7250035405158997, + "num_tokens": 295247686.0, + "step": 11410 + }, + { + "epoch": 1.2531298045244894, + "grad_norm": 1.8761723041534424, + "learning_rate": 5e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7433971166610718, + "num_tokens": 295275116.0, + "step": 11411 + }, + { + "epoch": 1.253239622227103, + "grad_norm": 2.1686084270477295, + "learning_rate": 5e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7304097414016724, + "num_tokens": 295297642.0, + "step": 11412 + }, + { + "epoch": 1.2533494399297167, + "grad_norm": 1.9485141038894653, + "learning_rate": 5e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7368385195732117, + "num_tokens": 295323585.0, + "step": 11413 + }, + { + "epoch": 1.2534592576323305, + "grad_norm": 2.053154945373535, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7354544401168823, + "num_tokens": 295347443.0, + "step": 11414 + }, + { + "epoch": 1.253569075334944, + "grad_norm": 1.8723963499069214, + "learning_rate": 5e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7311473488807678, + "num_tokens": 295372949.0, + "step": 11415 + }, + { + "epoch": 1.2536788930375575, + "grad_norm": 1.9726558923721313, + "learning_rate": 5e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7330197095870972, + "num_tokens": 295397678.0, + "step": 11416 + }, + { + "epoch": 1.2537887107401713, + "grad_norm": 2.0283467769622803, + "learning_rate": 5e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7357975244522095, + "num_tokens": 295419449.0, + "step": 11417 + }, + { + "epoch": 1.253898528442785, + "grad_norm": 1.976281762123108, + "learning_rate": 5e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7394838333129883, + "num_tokens": 295444250.0, + "step": 11418 + }, + { + "epoch": 1.2540083461453986, + "grad_norm": 1.8475992679595947, + "learning_rate": 5e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7370035648345947, + "num_tokens": 295471612.0, + "step": 11419 + }, + { + "epoch": 1.2541181638480123, + "grad_norm": 1.9173146486282349, + "learning_rate": 5e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7418663501739502, + "num_tokens": 295494889.0, + "step": 11420 + }, + { + "epoch": 1.2542279815506259, + "grad_norm": 1.8816227912902832, + "learning_rate": 5e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.7506434917449951, + "num_tokens": 295518722.0, + "step": 11421 + }, + { + "epoch": 1.2543377992532396, + "grad_norm": 1.863610029220581, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7161316871643066, + "num_tokens": 295545270.0, + "step": 11422 + }, + { + "epoch": 1.2544476169558534, + "grad_norm": 1.9738273620605469, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7421191334724426, + "num_tokens": 295568766.0, + "step": 11423 + }, + { + "epoch": 1.254557434658467, + "grad_norm": 1.8240782022476196, + "learning_rate": 5e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7512848973274231, + "num_tokens": 295595478.0, + "step": 11424 + }, + { + "epoch": 1.2546672523610807, + "grad_norm": 2.127981662750244, + "learning_rate": 5e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7591286897659302, + "num_tokens": 295613728.0, + "step": 11425 + }, + { + "epoch": 1.2547770700636942, + "grad_norm": 1.987011432647705, + "learning_rate": 5e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7270609736442566, + "num_tokens": 295637585.0, + "step": 11426 + }, + { + "epoch": 1.254886887766308, + "grad_norm": 1.7874702215194702, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7397208213806152, + "num_tokens": 295665761.0, + "step": 11427 + }, + { + "epoch": 1.2549967054689217, + "grad_norm": 1.945824146270752, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7336003184318542, + "num_tokens": 295691411.0, + "step": 11428 + }, + { + "epoch": 1.2551065231715353, + "grad_norm": 2.1413092613220215, + "learning_rate": 5e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.743461549282074, + "num_tokens": 295712441.0, + "step": 11429 + }, + { + "epoch": 1.2552163408741488, + "grad_norm": 1.6706870794296265, + "learning_rate": 5e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7103118300437927, + "num_tokens": 295743975.0, + "step": 11430 + }, + { + "epoch": 1.2553261585767626, + "grad_norm": 2.041116952896118, + "learning_rate": 5e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7274038791656494, + "num_tokens": 295766923.0, + "step": 11431 + }, + { + "epoch": 1.2554359762793763, + "grad_norm": 1.8045240640640259, + "learning_rate": 5e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7313579320907593, + "num_tokens": 295793376.0, + "step": 11432 + }, + { + "epoch": 1.2555457939819898, + "grad_norm": 1.885776162147522, + "learning_rate": 5e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7325866222381592, + "num_tokens": 295821029.0, + "step": 11433 + }, + { + "epoch": 1.2556556116846036, + "grad_norm": 2.1109743118286133, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.722602367401123, + "num_tokens": 295844617.0, + "step": 11434 + }, + { + "epoch": 1.2557654293872171, + "grad_norm": 1.6856547594070435, + "learning_rate": 5e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7273743152618408, + "num_tokens": 295876947.0, + "step": 11435 + }, + { + "epoch": 1.255875247089831, + "grad_norm": 1.784867525100708, + "learning_rate": 5e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7253617644309998, + "num_tokens": 295903708.0, + "step": 11436 + }, + { + "epoch": 1.2559850647924446, + "grad_norm": 1.7157959938049316, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7341357469558716, + "num_tokens": 295932427.0, + "step": 11437 + }, + { + "epoch": 1.2560948824950582, + "grad_norm": 1.6809865236282349, + "learning_rate": 5e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7264645099639893, + "num_tokens": 295962209.0, + "step": 11438 + }, + { + "epoch": 1.256204700197672, + "grad_norm": 1.923883318901062, + "learning_rate": 5e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7303045988082886, + "num_tokens": 295986998.0, + "step": 11439 + }, + { + "epoch": 1.2563145179002855, + "grad_norm": 1.8720506429672241, + "learning_rate": 5e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7285702228546143, + "num_tokens": 296011819.0, + "step": 11440 + }, + { + "epoch": 1.2564243356028992, + "grad_norm": 1.6578271389007568, + "learning_rate": 5e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7473180890083313, + "num_tokens": 296040845.0, + "step": 11441 + }, + { + "epoch": 1.2565341533055128, + "grad_norm": 2.024089813232422, + "learning_rate": 5e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.750780463218689, + "num_tokens": 296061832.0, + "step": 11442 + }, + { + "epoch": 1.2566439710081265, + "grad_norm": 1.8183999061584473, + "learning_rate": 5e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7743926644325256, + "num_tokens": 296086343.0, + "step": 11443 + }, + { + "epoch": 1.25675378871074, + "grad_norm": 1.6910511255264282, + "learning_rate": 5e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.714043140411377, + "num_tokens": 296119112.0, + "step": 11444 + }, + { + "epoch": 1.2568636064133538, + "grad_norm": 2.1050057411193848, + "learning_rate": 5e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7399508953094482, + "num_tokens": 296141596.0, + "step": 11445 + }, + { + "epoch": 1.2569734241159676, + "grad_norm": 1.7578375339508057, + "learning_rate": 5e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7209979295730591, + "num_tokens": 296169993.0, + "step": 11446 + }, + { + "epoch": 1.257083241818581, + "grad_norm": 1.9259793758392334, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7329733967781067, + "num_tokens": 296194289.0, + "step": 11447 + }, + { + "epoch": 1.2571930595211949, + "grad_norm": 1.8019723892211914, + "learning_rate": 5e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7453391551971436, + "num_tokens": 296219598.0, + "step": 11448 + }, + { + "epoch": 1.2573028772238084, + "grad_norm": 2.039433002471924, + "learning_rate": 5e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7405188083648682, + "num_tokens": 296241877.0, + "step": 11449 + }, + { + "epoch": 1.2574126949264222, + "grad_norm": 2.074445962905884, + "learning_rate": 5e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7378097176551819, + "num_tokens": 296261822.0, + "step": 11450 + }, + { + "epoch": 1.257522512629036, + "grad_norm": 1.821467638015747, + "learning_rate": 5e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7199579477310181, + "num_tokens": 296288813.0, + "step": 11451 + }, + { + "epoch": 1.2576323303316495, + "grad_norm": 1.7353242635726929, + "learning_rate": 5e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.746397852897644, + "num_tokens": 296319245.0, + "step": 11452 + }, + { + "epoch": 1.257742148034263, + "grad_norm": 2.0190978050231934, + "learning_rate": 5e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7311126589775085, + "num_tokens": 296343721.0, + "step": 11453 + }, + { + "epoch": 1.2578519657368767, + "grad_norm": 2.121225595474243, + "learning_rate": 5e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7400909662246704, + "num_tokens": 296363882.0, + "step": 11454 + }, + { + "epoch": 1.2579617834394905, + "grad_norm": 1.771242618560791, + "learning_rate": 5e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7539640069007874, + "num_tokens": 296389651.0, + "step": 11455 + }, + { + "epoch": 1.258071601142104, + "grad_norm": 1.9127793312072754, + "learning_rate": 5e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7276666164398193, + "num_tokens": 296414571.0, + "step": 11456 + }, + { + "epoch": 1.2581814188447178, + "grad_norm": 1.8523788452148438, + "learning_rate": 5e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7283017635345459, + "num_tokens": 296442481.0, + "step": 11457 + }, + { + "epoch": 1.2582912365473313, + "grad_norm": 1.9534590244293213, + "learning_rate": 5e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7411164045333862, + "num_tokens": 296467705.0, + "step": 11458 + }, + { + "epoch": 1.258401054249945, + "grad_norm": 1.8201065063476562, + "learning_rate": 5e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7628772258758545, + "num_tokens": 296494704.0, + "step": 11459 + }, + { + "epoch": 1.2585108719525588, + "grad_norm": 1.784770131111145, + "learning_rate": 5e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.733535885810852, + "num_tokens": 296525379.0, + "step": 11460 + }, + { + "epoch": 1.2586206896551724, + "grad_norm": 1.8183577060699463, + "learning_rate": 5e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7342917919158936, + "num_tokens": 296551513.0, + "step": 11461 + }, + { + "epoch": 1.2587305073577861, + "grad_norm": 1.694360375404358, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.734009861946106, + "num_tokens": 296580084.0, + "step": 11462 + }, + { + "epoch": 1.2588403250603997, + "grad_norm": 1.9210388660430908, + "learning_rate": 5e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7265619039535522, + "num_tokens": 296604676.0, + "step": 11463 + }, + { + "epoch": 1.2589501427630134, + "grad_norm": 1.941733479499817, + "learning_rate": 5e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7147326469421387, + "num_tokens": 296631783.0, + "step": 11464 + }, + { + "epoch": 1.2590599604656272, + "grad_norm": 1.9552786350250244, + "learning_rate": 5e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7309445738792419, + "num_tokens": 296658719.0, + "step": 11465 + }, + { + "epoch": 1.2591697781682407, + "grad_norm": 2.0712478160858154, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7460567355155945, + "num_tokens": 296680068.0, + "step": 11466 + }, + { + "epoch": 1.2592795958708543, + "grad_norm": 1.8908346891403198, + "learning_rate": 5e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7436789274215698, + "num_tokens": 296703155.0, + "step": 11467 + }, + { + "epoch": 1.259389413573468, + "grad_norm": 1.7267141342163086, + "learning_rate": 5e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7666903734207153, + "num_tokens": 296729366.0, + "step": 11468 + }, + { + "epoch": 1.2594992312760818, + "grad_norm": 2.0800223350524902, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7423516511917114, + "num_tokens": 296751415.0, + "step": 11469 + }, + { + "epoch": 1.2596090489786953, + "grad_norm": 1.956244707107544, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7238216400146484, + "num_tokens": 296776289.0, + "step": 11470 + }, + { + "epoch": 1.259718866681309, + "grad_norm": 1.8344831466674805, + "learning_rate": 5e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7288161516189575, + "num_tokens": 296803114.0, + "step": 11471 + }, + { + "epoch": 1.2598286843839226, + "grad_norm": 1.6487964391708374, + "learning_rate": 5e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7477781772613525, + "num_tokens": 296830842.0, + "step": 11472 + }, + { + "epoch": 1.2599385020865363, + "grad_norm": 1.6603070497512817, + "learning_rate": 5e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7170782089233398, + "num_tokens": 296864729.0, + "step": 11473 + }, + { + "epoch": 1.26004831978915, + "grad_norm": 2.2878894805908203, + "learning_rate": 5e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7426964044570923, + "num_tokens": 296883905.0, + "step": 11474 + }, + { + "epoch": 1.2601581374917636, + "grad_norm": 2.180818796157837, + "learning_rate": 5e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7361791133880615, + "num_tokens": 296904907.0, + "step": 11475 + }, + { + "epoch": 1.2602679551943774, + "grad_norm": 2.018181562423706, + "learning_rate": 5e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7544054985046387, + "num_tokens": 296928253.0, + "step": 11476 + }, + { + "epoch": 1.260377772896991, + "grad_norm": 2.067312240600586, + "learning_rate": 5e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7112778425216675, + "num_tokens": 296953004.0, + "step": 11477 + }, + { + "epoch": 1.2604875905996047, + "grad_norm": 1.9531337022781372, + "learning_rate": 5e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.729846715927124, + "num_tokens": 296979565.0, + "step": 11478 + }, + { + "epoch": 1.2605974083022184, + "grad_norm": 1.7098826169967651, + "learning_rate": 5e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7562441825866699, + "num_tokens": 297008776.0, + "step": 11479 + }, + { + "epoch": 1.260707226004832, + "grad_norm": 1.9219948053359985, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7406289577484131, + "num_tokens": 297031812.0, + "step": 11480 + }, + { + "epoch": 1.2608170437074455, + "grad_norm": 1.958034873008728, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7285133004188538, + "num_tokens": 297055113.0, + "step": 11481 + }, + { + "epoch": 1.2609268614100593, + "grad_norm": 1.8269771337509155, + "learning_rate": 5e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7377442121505737, + "num_tokens": 297081590.0, + "step": 11482 + }, + { + "epoch": 1.261036679112673, + "grad_norm": 1.8662339448928833, + "learning_rate": 5e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7342544794082642, + "num_tokens": 297106809.0, + "step": 11483 + }, + { + "epoch": 1.2611464968152866, + "grad_norm": 1.9605692625045776, + "learning_rate": 5e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7312438488006592, + "num_tokens": 297130942.0, + "step": 11484 + }, + { + "epoch": 1.2612563145179003, + "grad_norm": 2.0998647212982178, + "learning_rate": 5e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.738818883895874, + "num_tokens": 297152522.0, + "step": 11485 + }, + { + "epoch": 1.2613661322205139, + "grad_norm": 1.7442904710769653, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7314722537994385, + "num_tokens": 297184250.0, + "step": 11486 + }, + { + "epoch": 1.2614759499231276, + "grad_norm": 1.7299340963363647, + "learning_rate": 5e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7034542560577393, + "num_tokens": 297216945.0, + "step": 11487 + }, + { + "epoch": 1.2615857676257414, + "grad_norm": 1.8040794134140015, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7271692752838135, + "num_tokens": 297243422.0, + "step": 11488 + }, + { + "epoch": 1.261695585328355, + "grad_norm": 1.965427279472351, + "learning_rate": 5e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7251040935516357, + "num_tokens": 297266230.0, + "step": 11489 + }, + { + "epoch": 1.2618054030309687, + "grad_norm": 2.061357021331787, + "learning_rate": 5e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7241566181182861, + "num_tokens": 297289462.0, + "step": 11490 + }, + { + "epoch": 1.2619152207335822, + "grad_norm": 1.8412624597549438, + "learning_rate": 5e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7034621238708496, + "num_tokens": 297317713.0, + "step": 11491 + }, + { + "epoch": 1.262025038436196, + "grad_norm": 1.9494503736495972, + "learning_rate": 5e-06, + "loss": 0.868, + "mean_token_accuracy": 0.728438138961792, + "num_tokens": 297342392.0, + "step": 11492 + }, + { + "epoch": 1.2621348561388097, + "grad_norm": 1.7040283679962158, + "learning_rate": 5e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7145617604255676, + "num_tokens": 297374691.0, + "step": 11493 + }, + { + "epoch": 1.2622446738414232, + "grad_norm": 2.1134090423583984, + "learning_rate": 5e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.7496448159217834, + "num_tokens": 297395616.0, + "step": 11494 + }, + { + "epoch": 1.2623544915440368, + "grad_norm": 1.6835057735443115, + "learning_rate": 5e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.706575870513916, + "num_tokens": 297428541.0, + "step": 11495 + }, + { + "epoch": 1.2624643092466505, + "grad_norm": 2.19916033744812, + "learning_rate": 5e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7309226989746094, + "num_tokens": 297449384.0, + "step": 11496 + }, + { + "epoch": 1.2625741269492643, + "grad_norm": 1.6502960920333862, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7376514673233032, + "num_tokens": 297480060.0, + "step": 11497 + }, + { + "epoch": 1.2626839446518778, + "grad_norm": 1.7037534713745117, + "learning_rate": 5e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7328097820281982, + "num_tokens": 297508478.0, + "step": 11498 + }, + { + "epoch": 1.2627937623544916, + "grad_norm": 2.1739583015441895, + "learning_rate": 5e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7397253513336182, + "num_tokens": 297528183.0, + "step": 11499 + }, + { + "epoch": 1.2629035800571051, + "grad_norm": 1.7974047660827637, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7246944904327393, + "num_tokens": 297555881.0, + "step": 11500 + }, + { + "epoch": 1.2630133977597189, + "grad_norm": 1.8256064653396606, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7387993335723877, + "num_tokens": 297583075.0, + "step": 11501 + }, + { + "epoch": 1.2631232154623326, + "grad_norm": 1.7921855449676514, + "learning_rate": 5e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7217284440994263, + "num_tokens": 297611704.0, + "step": 11502 + }, + { + "epoch": 1.2632330331649462, + "grad_norm": 2.0356340408325195, + "learning_rate": 5e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7155002951622009, + "num_tokens": 297637030.0, + "step": 11503 + }, + { + "epoch": 1.26334285086756, + "grad_norm": 1.8274779319763184, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.746802568435669, + "num_tokens": 297660768.0, + "step": 11504 + }, + { + "epoch": 1.2634526685701735, + "grad_norm": 1.9177123308181763, + "learning_rate": 5e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7249128222465515, + "num_tokens": 297685143.0, + "step": 11505 + }, + { + "epoch": 1.2635624862727872, + "grad_norm": 1.717199683189392, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7188887000083923, + "num_tokens": 297717262.0, + "step": 11506 + }, + { + "epoch": 1.2636723039754008, + "grad_norm": 1.8132754564285278, + "learning_rate": 5e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7313589453697205, + "num_tokens": 297743862.0, + "step": 11507 + }, + { + "epoch": 1.2637821216780145, + "grad_norm": 2.2530415058135986, + "learning_rate": 5e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7452839612960815, + "num_tokens": 297762882.0, + "step": 11508 + }, + { + "epoch": 1.263891939380628, + "grad_norm": 2.0405900478363037, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7281796932220459, + "num_tokens": 297786733.0, + "step": 11509 + }, + { + "epoch": 1.2640017570832418, + "grad_norm": 1.8796324729919434, + "learning_rate": 5e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7221630811691284, + "num_tokens": 297812395.0, + "step": 11510 + }, + { + "epoch": 1.2641115747858556, + "grad_norm": 1.8784288167953491, + "learning_rate": 5e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.702222466468811, + "num_tokens": 297843342.0, + "step": 11511 + }, + { + "epoch": 1.264221392488469, + "grad_norm": 2.0754051208496094, + "learning_rate": 5e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7528865933418274, + "num_tokens": 297864393.0, + "step": 11512 + }, + { + "epoch": 1.2643312101910829, + "grad_norm": 2.0970301628112793, + "learning_rate": 5e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7430403828620911, + "num_tokens": 297885236.0, + "step": 11513 + }, + { + "epoch": 1.2644410278936964, + "grad_norm": 1.936659574508667, + "learning_rate": 5e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7229673862457275, + "num_tokens": 297909338.0, + "step": 11514 + }, + { + "epoch": 1.2645508455963101, + "grad_norm": 1.7248014211654663, + "learning_rate": 5e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7454664707183838, + "num_tokens": 297941173.0, + "step": 11515 + }, + { + "epoch": 1.264660663298924, + "grad_norm": 1.8614693880081177, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.76665198802948, + "num_tokens": 297965591.0, + "step": 11516 + }, + { + "epoch": 1.2647704810015374, + "grad_norm": 1.9308586120605469, + "learning_rate": 5e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7176119685173035, + "num_tokens": 297991209.0, + "step": 11517 + }, + { + "epoch": 1.264880298704151, + "grad_norm": 1.7389365434646606, + "learning_rate": 5e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7279701232910156, + "num_tokens": 298020571.0, + "step": 11518 + }, + { + "epoch": 1.2649901164067647, + "grad_norm": 1.9242770671844482, + "learning_rate": 5e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7311331033706665, + "num_tokens": 298043999.0, + "step": 11519 + }, + { + "epoch": 1.2650999341093785, + "grad_norm": 2.0810718536376953, + "learning_rate": 5e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7506322264671326, + "num_tokens": 298065120.0, + "step": 11520 + }, + { + "epoch": 1.265209751811992, + "grad_norm": 1.826432466506958, + "learning_rate": 5e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.727554202079773, + "num_tokens": 298092203.0, + "step": 11521 + }, + { + "epoch": 1.2653195695146058, + "grad_norm": 2.1004278659820557, + "learning_rate": 5e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7228403091430664, + "num_tokens": 298114770.0, + "step": 11522 + }, + { + "epoch": 1.2654293872172193, + "grad_norm": 2.022343158721924, + "learning_rate": 5e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7233550548553467, + "num_tokens": 298138132.0, + "step": 11523 + }, + { + "epoch": 1.265539204919833, + "grad_norm": 1.9203262329101562, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.766999363899231, + "num_tokens": 298160037.0, + "step": 11524 + }, + { + "epoch": 1.2656490226224468, + "grad_norm": 2.1902217864990234, + "learning_rate": 5e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7622376680374146, + "num_tokens": 298179664.0, + "step": 11525 + }, + { + "epoch": 1.2657588403250604, + "grad_norm": 1.7906484603881836, + "learning_rate": 5e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7392210960388184, + "num_tokens": 298207126.0, + "step": 11526 + }, + { + "epoch": 1.2658686580276741, + "grad_norm": 2.1070451736450195, + "learning_rate": 5e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7476407289505005, + "num_tokens": 298227359.0, + "step": 11527 + }, + { + "epoch": 1.2659784757302877, + "grad_norm": 1.8125252723693848, + "learning_rate": 5e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7297965884208679, + "num_tokens": 298254356.0, + "step": 11528 + }, + { + "epoch": 1.2660882934329014, + "grad_norm": 1.9214009046554565, + "learning_rate": 5e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7422195672988892, + "num_tokens": 298278729.0, + "step": 11529 + }, + { + "epoch": 1.2661981111355152, + "grad_norm": 1.821699619293213, + "learning_rate": 5e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7535338401794434, + "num_tokens": 298303147.0, + "step": 11530 + }, + { + "epoch": 1.2663079288381287, + "grad_norm": 1.743648886680603, + "learning_rate": 5e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7310559749603271, + "num_tokens": 298332783.0, + "step": 11531 + }, + { + "epoch": 1.2664177465407422, + "grad_norm": 1.6645249128341675, + "learning_rate": 5e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7186571955680847, + "num_tokens": 298364703.0, + "step": 11532 + }, + { + "epoch": 1.266527564243356, + "grad_norm": 1.9317371845245361, + "learning_rate": 5e-06, + "loss": 0.868, + "mean_token_accuracy": 0.739186704158783, + "num_tokens": 298389987.0, + "step": 11533 + }, + { + "epoch": 1.2666373819459698, + "grad_norm": 1.8278566598892212, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7348291873931885, + "num_tokens": 298419115.0, + "step": 11534 + }, + { + "epoch": 1.2667471996485833, + "grad_norm": 2.0352394580841064, + "learning_rate": 5e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7342281341552734, + "num_tokens": 298441815.0, + "step": 11535 + }, + { + "epoch": 1.266857017351197, + "grad_norm": 1.5752551555633545, + "learning_rate": 5e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7354559898376465, + "num_tokens": 298476899.0, + "step": 11536 + }, + { + "epoch": 1.2669668350538106, + "grad_norm": 1.9509694576263428, + "learning_rate": 5e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7274893522262573, + "num_tokens": 298500779.0, + "step": 11537 + }, + { + "epoch": 1.2670766527564243, + "grad_norm": 1.8120932579040527, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7428907155990601, + "num_tokens": 298527277.0, + "step": 11538 + }, + { + "epoch": 1.267186470459038, + "grad_norm": 2.037910223007202, + "learning_rate": 5e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7378593683242798, + "num_tokens": 298549261.0, + "step": 11539 + }, + { + "epoch": 1.2672962881616516, + "grad_norm": 1.9765105247497559, + "learning_rate": 5e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7595586776733398, + "num_tokens": 298570384.0, + "step": 11540 + }, + { + "epoch": 1.2674061058642654, + "grad_norm": 2.033879041671753, + "learning_rate": 5e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7221858501434326, + "num_tokens": 298593431.0, + "step": 11541 + }, + { + "epoch": 1.267515923566879, + "grad_norm": 1.6867629289627075, + "learning_rate": 5e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.718069314956665, + "num_tokens": 298625914.0, + "step": 11542 + }, + { + "epoch": 1.2676257412694927, + "grad_norm": 1.8017194271087646, + "learning_rate": 5e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7417014837265015, + "num_tokens": 298651021.0, + "step": 11543 + }, + { + "epoch": 1.2677355589721064, + "grad_norm": 1.903465986251831, + "learning_rate": 5e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7516392469406128, + "num_tokens": 298674081.0, + "step": 11544 + }, + { + "epoch": 1.26784537667472, + "grad_norm": 1.6921969652175903, + "learning_rate": 5e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7181119918823242, + "num_tokens": 298704952.0, + "step": 11545 + }, + { + "epoch": 1.2679551943773335, + "grad_norm": 1.890571117401123, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7196617126464844, + "num_tokens": 298729944.0, + "step": 11546 + }, + { + "epoch": 1.2680650120799473, + "grad_norm": 1.7963683605194092, + "learning_rate": 5e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.743266224861145, + "num_tokens": 298758960.0, + "step": 11547 + }, + { + "epoch": 1.268174829782561, + "grad_norm": 1.8133624792099, + "learning_rate": 5e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.700251579284668, + "num_tokens": 298786779.0, + "step": 11548 + }, + { + "epoch": 1.2682846474851746, + "grad_norm": 2.2602012157440186, + "learning_rate": 5e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.7434799671173096, + "num_tokens": 298806971.0, + "step": 11549 + }, + { + "epoch": 1.2683944651877883, + "grad_norm": 1.79770028591156, + "learning_rate": 5e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7199438810348511, + "num_tokens": 298836668.0, + "step": 11550 + }, + { + "epoch": 1.2685042828904018, + "grad_norm": 1.8147294521331787, + "learning_rate": 5e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7514970302581787, + "num_tokens": 298862817.0, + "step": 11551 + }, + { + "epoch": 1.2686141005930156, + "grad_norm": 1.865250587463379, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7182283401489258, + "num_tokens": 298890607.0, + "step": 11552 + }, + { + "epoch": 1.2687239182956294, + "grad_norm": 1.8759679794311523, + "learning_rate": 5e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7356466054916382, + "num_tokens": 298915495.0, + "step": 11553 + }, + { + "epoch": 1.268833735998243, + "grad_norm": 1.9411207437515259, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7393941879272461, + "num_tokens": 298938448.0, + "step": 11554 + }, + { + "epoch": 1.2689435537008567, + "grad_norm": 1.8408979177474976, + "learning_rate": 5e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7137994766235352, + "num_tokens": 298965428.0, + "step": 11555 + }, + { + "epoch": 1.2690533714034702, + "grad_norm": 1.9505506753921509, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7173146605491638, + "num_tokens": 298990999.0, + "step": 11556 + }, + { + "epoch": 1.269163189106084, + "grad_norm": 1.9238195419311523, + "learning_rate": 5e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7589654922485352, + "num_tokens": 299011808.0, + "step": 11557 + }, + { + "epoch": 1.2692730068086975, + "grad_norm": 2.0180201530456543, + "learning_rate": 5e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.752895176410675, + "num_tokens": 299031849.0, + "step": 11558 + }, + { + "epoch": 1.2693828245113112, + "grad_norm": 2.1159462928771973, + "learning_rate": 5e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.7437777519226074, + "num_tokens": 299053241.0, + "step": 11559 + }, + { + "epoch": 1.2694926422139248, + "grad_norm": 2.2055554389953613, + "learning_rate": 5e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7405167818069458, + "num_tokens": 299073294.0, + "step": 11560 + }, + { + "epoch": 1.2696024599165385, + "grad_norm": 1.6750258207321167, + "learning_rate": 5e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7390929460525513, + "num_tokens": 299102288.0, + "step": 11561 + }, + { + "epoch": 1.2697122776191523, + "grad_norm": 1.7949899435043335, + "learning_rate": 5e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7236194610595703, + "num_tokens": 299132034.0, + "step": 11562 + }, + { + "epoch": 1.2698220953217658, + "grad_norm": 2.065361261367798, + "learning_rate": 5e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7237249612808228, + "num_tokens": 299155087.0, + "step": 11563 + }, + { + "epoch": 1.2699319130243796, + "grad_norm": 1.9701510667800903, + "learning_rate": 5e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7492220401763916, + "num_tokens": 299177184.0, + "step": 11564 + }, + { + "epoch": 1.2700417307269931, + "grad_norm": 2.037297487258911, + "learning_rate": 5e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7492835521697998, + "num_tokens": 299199611.0, + "step": 11565 + }, + { + "epoch": 1.2701515484296069, + "grad_norm": 1.7770694494247437, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7298197746276855, + "num_tokens": 299226569.0, + "step": 11566 + }, + { + "epoch": 1.2702613661322206, + "grad_norm": 1.609763264656067, + "learning_rate": 5e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7186589241027832, + "num_tokens": 299260321.0, + "step": 11567 + }, + { + "epoch": 1.2703711838348342, + "grad_norm": 2.001422882080078, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7489129304885864, + "num_tokens": 299283232.0, + "step": 11568 + }, + { + "epoch": 1.270481001537448, + "grad_norm": 2.0365240573883057, + "learning_rate": 5e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7403726577758789, + "num_tokens": 299303889.0, + "step": 11569 + }, + { + "epoch": 1.2705908192400615, + "grad_norm": 1.8210653066635132, + "learning_rate": 5e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7292562127113342, + "num_tokens": 299330616.0, + "step": 11570 + }, + { + "epoch": 1.2707006369426752, + "grad_norm": 1.851955533027649, + "learning_rate": 5e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.7445637583732605, + "num_tokens": 299356651.0, + "step": 11571 + }, + { + "epoch": 1.2708104546452887, + "grad_norm": 2.0514121055603027, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7285686135292053, + "num_tokens": 299379653.0, + "step": 11572 + }, + { + "epoch": 1.2709202723479025, + "grad_norm": 2.0688962936401367, + "learning_rate": 5e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7386009693145752, + "num_tokens": 299402866.0, + "step": 11573 + }, + { + "epoch": 1.271030090050516, + "grad_norm": 1.6642446517944336, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7598193883895874, + "num_tokens": 299432204.0, + "step": 11574 + }, + { + "epoch": 1.2711399077531298, + "grad_norm": 1.8161227703094482, + "learning_rate": 5e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7246975898742676, + "num_tokens": 299460123.0, + "step": 11575 + }, + { + "epoch": 1.2712497254557436, + "grad_norm": 1.9085263013839722, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7382652759552002, + "num_tokens": 299483285.0, + "step": 11576 + }, + { + "epoch": 1.271359543158357, + "grad_norm": 1.8969449996948242, + "learning_rate": 5e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7315230965614319, + "num_tokens": 299507386.0, + "step": 11577 + }, + { + "epoch": 1.2714693608609708, + "grad_norm": 1.90019690990448, + "learning_rate": 5e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7270114421844482, + "num_tokens": 299532921.0, + "step": 11578 + }, + { + "epoch": 1.2715791785635844, + "grad_norm": 1.9132161140441895, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7159100770950317, + "num_tokens": 299559353.0, + "step": 11579 + }, + { + "epoch": 1.2716889962661981, + "grad_norm": 1.8949449062347412, + "learning_rate": 5e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7484344840049744, + "num_tokens": 299584599.0, + "step": 11580 + }, + { + "epoch": 1.271798813968812, + "grad_norm": 1.7269606590270996, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7291693687438965, + "num_tokens": 299611674.0, + "step": 11581 + }, + { + "epoch": 1.2719086316714254, + "grad_norm": 2.112027406692505, + "learning_rate": 5e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7292978763580322, + "num_tokens": 299632737.0, + "step": 11582 + }, + { + "epoch": 1.272018449374039, + "grad_norm": 1.6376135349273682, + "learning_rate": 5e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7051246166229248, + "num_tokens": 299664256.0, + "step": 11583 + }, + { + "epoch": 1.2721282670766527, + "grad_norm": 1.9087376594543457, + "learning_rate": 5e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7458786964416504, + "num_tokens": 299688334.0, + "step": 11584 + }, + { + "epoch": 1.2722380847792665, + "grad_norm": 1.953609824180603, + "learning_rate": 5e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7239777445793152, + "num_tokens": 299713311.0, + "step": 11585 + }, + { + "epoch": 1.27234790248188, + "grad_norm": 1.8408318758010864, + "learning_rate": 5e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7528632879257202, + "num_tokens": 299738654.0, + "step": 11586 + }, + { + "epoch": 1.2724577201844938, + "grad_norm": 1.7992322444915771, + "learning_rate": 5e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7247004508972168, + "num_tokens": 299768134.0, + "step": 11587 + }, + { + "epoch": 1.2725675378871073, + "grad_norm": 1.7439812421798706, + "learning_rate": 5e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7288815975189209, + "num_tokens": 299797158.0, + "step": 11588 + }, + { + "epoch": 1.272677355589721, + "grad_norm": 1.7696975469589233, + "learning_rate": 5e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7189480066299438, + "num_tokens": 299825900.0, + "step": 11589 + }, + { + "epoch": 1.2727871732923348, + "grad_norm": 1.9097638130187988, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7347981929779053, + "num_tokens": 299851126.0, + "step": 11590 + }, + { + "epoch": 1.2728969909949484, + "grad_norm": 1.8001803159713745, + "learning_rate": 5e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7263308763504028, + "num_tokens": 299878253.0, + "step": 11591 + }, + { + "epoch": 1.273006808697562, + "grad_norm": 1.854294776916504, + "learning_rate": 5e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7129388451576233, + "num_tokens": 299905686.0, + "step": 11592 + }, + { + "epoch": 1.2731166264001756, + "grad_norm": 2.0040366649627686, + "learning_rate": 5e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7288227677345276, + "num_tokens": 299930000.0, + "step": 11593 + }, + { + "epoch": 1.2732264441027894, + "grad_norm": 2.0705435276031494, + "learning_rate": 5e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7543699741363525, + "num_tokens": 299951147.0, + "step": 11594 + }, + { + "epoch": 1.2733362618054032, + "grad_norm": 1.8263530731201172, + "learning_rate": 5e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.740666389465332, + "num_tokens": 299979236.0, + "step": 11595 + }, + { + "epoch": 1.2734460795080167, + "grad_norm": 2.0247955322265625, + "learning_rate": 5e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7467485666275024, + "num_tokens": 300001425.0, + "step": 11596 + }, + { + "epoch": 1.2735558972106302, + "grad_norm": 2.016322135925293, + "learning_rate": 5e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7537892460823059, + "num_tokens": 300022269.0, + "step": 11597 + }, + { + "epoch": 1.273665714913244, + "grad_norm": 1.79448664188385, + "learning_rate": 5e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7283464074134827, + "num_tokens": 300052466.0, + "step": 11598 + }, + { + "epoch": 1.2737755326158577, + "grad_norm": 1.8151522874832153, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.73333340883255, + "num_tokens": 300079641.0, + "step": 11599 + }, + { + "epoch": 1.2738853503184713, + "grad_norm": 1.750286340713501, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7478140592575073, + "num_tokens": 300107946.0, + "step": 11600 + }, + { + "epoch": 1.273995168021085, + "grad_norm": 1.920641303062439, + "learning_rate": 5e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.732973575592041, + "num_tokens": 300135927.0, + "step": 11601 + }, + { + "epoch": 1.2741049857236986, + "grad_norm": 1.8604458570480347, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7233306765556335, + "num_tokens": 300159625.0, + "step": 11602 + }, + { + "epoch": 1.2742148034263123, + "grad_norm": 1.6689046621322632, + "learning_rate": 5e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7350079417228699, + "num_tokens": 300189871.0, + "step": 11603 + }, + { + "epoch": 1.274324621128926, + "grad_norm": 2.011685371398926, + "learning_rate": 5e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.732720136642456, + "num_tokens": 300213187.0, + "step": 11604 + }, + { + "epoch": 1.2744344388315396, + "grad_norm": 1.7238038778305054, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7467097043991089, + "num_tokens": 300243202.0, + "step": 11605 + }, + { + "epoch": 1.2745442565341534, + "grad_norm": 1.85142183303833, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7384345531463623, + "num_tokens": 300269438.0, + "step": 11606 + }, + { + "epoch": 1.274654074236767, + "grad_norm": 2.0162291526794434, + "learning_rate": 5e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7318029403686523, + "num_tokens": 300291399.0, + "step": 11607 + }, + { + "epoch": 1.2747638919393807, + "grad_norm": 1.6758226156234741, + "learning_rate": 5e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7440434694290161, + "num_tokens": 300321623.0, + "step": 11608 + }, + { + "epoch": 1.2748737096419944, + "grad_norm": 2.0519089698791504, + "learning_rate": 5e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7376352548599243, + "num_tokens": 300343257.0, + "step": 11609 + }, + { + "epoch": 1.274983527344608, + "grad_norm": 1.8270279169082642, + "learning_rate": 5e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7308430671691895, + "num_tokens": 300371368.0, + "step": 11610 + }, + { + "epoch": 1.2750933450472215, + "grad_norm": 2.042344570159912, + "learning_rate": 5e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7540367841720581, + "num_tokens": 300393245.0, + "step": 11611 + }, + { + "epoch": 1.2752031627498353, + "grad_norm": 1.5795230865478516, + "learning_rate": 5e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7162039279937744, + "num_tokens": 300428873.0, + "step": 11612 + }, + { + "epoch": 1.275312980452449, + "grad_norm": 2.016655445098877, + "learning_rate": 5e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7557675242424011, + "num_tokens": 300449512.0, + "step": 11613 + }, + { + "epoch": 1.2754227981550625, + "grad_norm": 1.8289494514465332, + "learning_rate": 5e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7275006771087646, + "num_tokens": 300478643.0, + "step": 11614 + }, + { + "epoch": 1.2755326158576763, + "grad_norm": 1.8018872737884521, + "learning_rate": 5e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.729379415512085, + "num_tokens": 300506163.0, + "step": 11615 + }, + { + "epoch": 1.2756424335602898, + "grad_norm": 1.9672937393188477, + "learning_rate": 5e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7309848070144653, + "num_tokens": 300529045.0, + "step": 11616 + }, + { + "epoch": 1.2757522512629036, + "grad_norm": 2.048292875289917, + "learning_rate": 5e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7266480922698975, + "num_tokens": 300550613.0, + "step": 11617 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 1.7366646528244019, + "learning_rate": 5e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7155581712722778, + "num_tokens": 300581236.0, + "step": 11618 + }, + { + "epoch": 1.2759718866681309, + "grad_norm": 2.0403356552124023, + "learning_rate": 5e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7507592439651489, + "num_tokens": 300602967.0, + "step": 11619 + }, + { + "epoch": 1.2760817043707446, + "grad_norm": 1.6433732509613037, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7175272703170776, + "num_tokens": 300638159.0, + "step": 11620 + }, + { + "epoch": 1.2761915220733582, + "grad_norm": 1.9690626859664917, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7288913726806641, + "num_tokens": 300662376.0, + "step": 11621 + }, + { + "epoch": 1.276301339775972, + "grad_norm": 2.001539468765259, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7148363590240479, + "num_tokens": 300688596.0, + "step": 11622 + }, + { + "epoch": 1.2764111574785855, + "grad_norm": 1.8154453039169312, + "learning_rate": 5e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7429781556129456, + "num_tokens": 300714105.0, + "step": 11623 + }, + { + "epoch": 1.2765209751811992, + "grad_norm": 1.9909777641296387, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7412928342819214, + "num_tokens": 300737115.0, + "step": 11624 + }, + { + "epoch": 1.2766307928838128, + "grad_norm": 1.6462035179138184, + "learning_rate": 5e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7185978889465332, + "num_tokens": 300770980.0, + "step": 11625 + }, + { + "epoch": 1.2767406105864265, + "grad_norm": 1.6334960460662842, + "learning_rate": 5e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7465214133262634, + "num_tokens": 300800794.0, + "step": 11626 + }, + { + "epoch": 1.2768504282890403, + "grad_norm": 1.7247676849365234, + "learning_rate": 5e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7367873787879944, + "num_tokens": 300830118.0, + "step": 11627 + }, + { + "epoch": 1.2769602459916538, + "grad_norm": 1.8090019226074219, + "learning_rate": 5e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7158433198928833, + "num_tokens": 300861523.0, + "step": 11628 + }, + { + "epoch": 1.2770700636942676, + "grad_norm": 1.9207442998886108, + "learning_rate": 5e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7224100828170776, + "num_tokens": 300887966.0, + "step": 11629 + }, + { + "epoch": 1.277179881396881, + "grad_norm": 1.996537685394287, + "learning_rate": 5e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7513158321380615, + "num_tokens": 300909248.0, + "step": 11630 + }, + { + "epoch": 1.2772896990994949, + "grad_norm": 1.9438862800598145, + "learning_rate": 5e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7571362257003784, + "num_tokens": 300932677.0, + "step": 11631 + }, + { + "epoch": 1.2773995168021086, + "grad_norm": 1.92417573928833, + "learning_rate": 5e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.740524411201477, + "num_tokens": 300956637.0, + "step": 11632 + }, + { + "epoch": 1.2775093345047221, + "grad_norm": 1.8457443714141846, + "learning_rate": 5e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7529563903808594, + "num_tokens": 300981214.0, + "step": 11633 + }, + { + "epoch": 1.2776191522073357, + "grad_norm": 2.0379586219787598, + "learning_rate": 5e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7592790126800537, + "num_tokens": 301003345.0, + "step": 11634 + }, + { + "epoch": 1.2777289699099494, + "grad_norm": 1.817792296409607, + "learning_rate": 5e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7252066135406494, + "num_tokens": 301031594.0, + "step": 11635 + }, + { + "epoch": 1.2778387876125632, + "grad_norm": 1.9733924865722656, + "learning_rate": 5e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7335046529769897, + "num_tokens": 301058782.0, + "step": 11636 + }, + { + "epoch": 1.2779486053151767, + "grad_norm": 1.8915839195251465, + "learning_rate": 5e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7206807136535645, + "num_tokens": 301086810.0, + "step": 11637 + }, + { + "epoch": 1.2780584230177905, + "grad_norm": 2.0837340354919434, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.724825918674469, + "num_tokens": 301109490.0, + "step": 11638 + }, + { + "epoch": 1.278168240720404, + "grad_norm": 1.768203616142273, + "learning_rate": 5e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7416406869888306, + "num_tokens": 301136555.0, + "step": 11639 + }, + { + "epoch": 1.2782780584230178, + "grad_norm": 2.0614001750946045, + "learning_rate": 5e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7201095819473267, + "num_tokens": 301161140.0, + "step": 11640 + }, + { + "epoch": 1.2783878761256315, + "grad_norm": 1.824080228805542, + "learning_rate": 5e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7309785485267639, + "num_tokens": 301186820.0, + "step": 11641 + }, + { + "epoch": 1.278497693828245, + "grad_norm": 1.756795883178711, + "learning_rate": 5e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7345468997955322, + "num_tokens": 301214858.0, + "step": 11642 + }, + { + "epoch": 1.2786075115308588, + "grad_norm": 1.816027283668518, + "learning_rate": 5e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7434930205345154, + "num_tokens": 301242512.0, + "step": 11643 + }, + { + "epoch": 1.2787173292334724, + "grad_norm": 1.914013147354126, + "learning_rate": 5e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7315998077392578, + "num_tokens": 301266592.0, + "step": 11644 + }, + { + "epoch": 1.2788271469360861, + "grad_norm": 1.8924839496612549, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7359145283699036, + "num_tokens": 301291014.0, + "step": 11645 + }, + { + "epoch": 1.2789369646386999, + "grad_norm": 1.9453638792037964, + "learning_rate": 5e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.723160982131958, + "num_tokens": 301318900.0, + "step": 11646 + }, + { + "epoch": 1.2790467823413134, + "grad_norm": 1.8651283979415894, + "learning_rate": 5e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7109922170639038, + "num_tokens": 301346424.0, + "step": 11647 + }, + { + "epoch": 1.279156600043927, + "grad_norm": 1.7721587419509888, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7341902852058411, + "num_tokens": 301375347.0, + "step": 11648 + }, + { + "epoch": 1.2792664177465407, + "grad_norm": 2.0252106189727783, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7340567111968994, + "num_tokens": 301397983.0, + "step": 11649 + }, + { + "epoch": 1.2793762354491545, + "grad_norm": 1.5827715396881104, + "learning_rate": 5e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7223901152610779, + "num_tokens": 301432307.0, + "step": 11650 + }, + { + "epoch": 1.279486053151768, + "grad_norm": 1.808444857597351, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7376480102539062, + "num_tokens": 301462628.0, + "step": 11651 + }, + { + "epoch": 1.2795958708543818, + "grad_norm": 1.7845308780670166, + "learning_rate": 5e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7495417594909668, + "num_tokens": 301490144.0, + "step": 11652 + }, + { + "epoch": 1.2797056885569953, + "grad_norm": 1.6915069818496704, + "learning_rate": 5e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.72797691822052, + "num_tokens": 301520662.0, + "step": 11653 + }, + { + "epoch": 1.279815506259609, + "grad_norm": 1.9034450054168701, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7403501868247986, + "num_tokens": 301543719.0, + "step": 11654 + }, + { + "epoch": 1.2799253239622228, + "grad_norm": 1.8251097202301025, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7257257699966431, + "num_tokens": 301572190.0, + "step": 11655 + }, + { + "epoch": 1.2800351416648363, + "grad_norm": 1.729942798614502, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7340620756149292, + "num_tokens": 301602398.0, + "step": 11656 + }, + { + "epoch": 1.28014495936745, + "grad_norm": 1.7481034994125366, + "learning_rate": 5e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.729667067527771, + "num_tokens": 301631849.0, + "step": 11657 + }, + { + "epoch": 1.2802547770700636, + "grad_norm": 2.0521814823150635, + "learning_rate": 5e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7173444032669067, + "num_tokens": 301656134.0, + "step": 11658 + }, + { + "epoch": 1.2803645947726774, + "grad_norm": 1.8748135566711426, + "learning_rate": 5e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7678061723709106, + "num_tokens": 301681237.0, + "step": 11659 + }, + { + "epoch": 1.2804744124752911, + "grad_norm": 1.7655421495437622, + "learning_rate": 5e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7193233966827393, + "num_tokens": 301711571.0, + "step": 11660 + }, + { + "epoch": 1.2805842301779047, + "grad_norm": 1.9378231763839722, + "learning_rate": 5e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7158571481704712, + "num_tokens": 301737552.0, + "step": 11661 + }, + { + "epoch": 1.2806940478805182, + "grad_norm": 2.1370224952697754, + "learning_rate": 5e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7263663411140442, + "num_tokens": 301759303.0, + "step": 11662 + }, + { + "epoch": 1.280803865583132, + "grad_norm": 1.9075977802276611, + "learning_rate": 5e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7291162014007568, + "num_tokens": 301783037.0, + "step": 11663 + }, + { + "epoch": 1.2809136832857457, + "grad_norm": 1.8424675464630127, + "learning_rate": 5e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7355202436447144, + "num_tokens": 301810986.0, + "step": 11664 + }, + { + "epoch": 1.2810235009883593, + "grad_norm": 1.925951361656189, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7278842926025391, + "num_tokens": 301838498.0, + "step": 11665 + }, + { + "epoch": 1.281133318690973, + "grad_norm": 2.0159225463867188, + "learning_rate": 5e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.749055027961731, + "num_tokens": 301861895.0, + "step": 11666 + }, + { + "epoch": 1.2812431363935866, + "grad_norm": 1.7869794368743896, + "learning_rate": 5e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7326787114143372, + "num_tokens": 301888590.0, + "step": 11667 + }, + { + "epoch": 1.2813529540962003, + "grad_norm": 1.8446542024612427, + "learning_rate": 5e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7242236733436584, + "num_tokens": 301914235.0, + "step": 11668 + }, + { + "epoch": 1.281462771798814, + "grad_norm": 2.1107397079467773, + "learning_rate": 5e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7400438189506531, + "num_tokens": 301935723.0, + "step": 11669 + }, + { + "epoch": 1.2815725895014276, + "grad_norm": 1.865417718887329, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7465875148773193, + "num_tokens": 301961901.0, + "step": 11670 + }, + { + "epoch": 1.2816824072040414, + "grad_norm": 1.818724513053894, + "learning_rate": 5e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7329053282737732, + "num_tokens": 301989217.0, + "step": 11671 + }, + { + "epoch": 1.281792224906655, + "grad_norm": 1.974636435508728, + "learning_rate": 5e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7390037775039673, + "num_tokens": 302013602.0, + "step": 11672 + }, + { + "epoch": 1.2819020426092687, + "grad_norm": 1.8519961833953857, + "learning_rate": 5e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7234597206115723, + "num_tokens": 302040441.0, + "step": 11673 + }, + { + "epoch": 1.2820118603118824, + "grad_norm": 1.6022289991378784, + "learning_rate": 5e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7066200971603394, + "num_tokens": 302074622.0, + "step": 11674 + }, + { + "epoch": 1.282121678014496, + "grad_norm": 1.697695255279541, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7185777425765991, + "num_tokens": 302105839.0, + "step": 11675 + }, + { + "epoch": 1.2822314957171095, + "grad_norm": 2.177708148956299, + "learning_rate": 5e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7618947625160217, + "num_tokens": 302124258.0, + "step": 11676 + }, + { + "epoch": 1.2823413134197232, + "grad_norm": 1.9648048877716064, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7199963331222534, + "num_tokens": 302150584.0, + "step": 11677 + }, + { + "epoch": 1.282451131122337, + "grad_norm": 2.127553939819336, + "learning_rate": 5e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7343661189079285, + "num_tokens": 302171942.0, + "step": 11678 + }, + { + "epoch": 1.2825609488249505, + "grad_norm": 1.8141701221466064, + "learning_rate": 5e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7209342122077942, + "num_tokens": 302200122.0, + "step": 11679 + }, + { + "epoch": 1.2826707665275643, + "grad_norm": 1.6461514234542847, + "learning_rate": 5e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7357770204544067, + "num_tokens": 302230358.0, + "step": 11680 + }, + { + "epoch": 1.2827805842301778, + "grad_norm": 1.9811983108520508, + "learning_rate": 5e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7542221546173096, + "num_tokens": 302254422.0, + "step": 11681 + }, + { + "epoch": 1.2828904019327916, + "grad_norm": 1.8033801317214966, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7358578443527222, + "num_tokens": 302282401.0, + "step": 11682 + }, + { + "epoch": 1.2830002196354053, + "grad_norm": 1.9288499355316162, + "learning_rate": 5e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7539311051368713, + "num_tokens": 302304756.0, + "step": 11683 + }, + { + "epoch": 1.2831100373380189, + "grad_norm": 1.8161003589630127, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7328174114227295, + "num_tokens": 302333642.0, + "step": 11684 + }, + { + "epoch": 1.2832198550406326, + "grad_norm": 1.9096349477767944, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7229586839675903, + "num_tokens": 302359344.0, + "step": 11685 + }, + { + "epoch": 1.2833296727432462, + "grad_norm": 1.9043411016464233, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7441717386245728, + "num_tokens": 302383680.0, + "step": 11686 + }, + { + "epoch": 1.28343949044586, + "grad_norm": 1.9572674036026, + "learning_rate": 5e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7346701622009277, + "num_tokens": 302405671.0, + "step": 11687 + }, + { + "epoch": 1.2835493081484735, + "grad_norm": 1.9762928485870361, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7355177998542786, + "num_tokens": 302427859.0, + "step": 11688 + }, + { + "epoch": 1.2836591258510872, + "grad_norm": 1.7838568687438965, + "learning_rate": 5e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.7440975308418274, + "num_tokens": 302454018.0, + "step": 11689 + }, + { + "epoch": 1.2837689435537007, + "grad_norm": 1.7581145763397217, + "learning_rate": 5e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.730687141418457, + "num_tokens": 302482772.0, + "step": 11690 + }, + { + "epoch": 1.2838787612563145, + "grad_norm": 1.825102686882019, + "learning_rate": 5e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7389450073242188, + "num_tokens": 302507876.0, + "step": 11691 + }, + { + "epoch": 1.2839885789589283, + "grad_norm": 1.6720362901687622, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7333936095237732, + "num_tokens": 302538763.0, + "step": 11692 + }, + { + "epoch": 1.2840983966615418, + "grad_norm": 1.7586238384246826, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7336450815200806, + "num_tokens": 302566482.0, + "step": 11693 + }, + { + "epoch": 1.2842082143641556, + "grad_norm": 1.8166509866714478, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7159419655799866, + "num_tokens": 302593751.0, + "step": 11694 + }, + { + "epoch": 1.284318032066769, + "grad_norm": 1.8352093696594238, + "learning_rate": 5e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7266805171966553, + "num_tokens": 302622376.0, + "step": 11695 + }, + { + "epoch": 1.2844278497693828, + "grad_norm": 1.9384273290634155, + "learning_rate": 5e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7425664663314819, + "num_tokens": 302644332.0, + "step": 11696 + }, + { + "epoch": 1.2845376674719966, + "grad_norm": 2.284358024597168, + "learning_rate": 5e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7434542179107666, + "num_tokens": 302662521.0, + "step": 11697 + }, + { + "epoch": 1.2846474851746101, + "grad_norm": 1.8770192861557007, + "learning_rate": 5e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7144286036491394, + "num_tokens": 302689487.0, + "step": 11698 + }, + { + "epoch": 1.2847573028772237, + "grad_norm": 1.8061705827713013, + "learning_rate": 5e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.759225070476532, + "num_tokens": 302713236.0, + "step": 11699 + }, + { + "epoch": 1.2848671205798374, + "grad_norm": 1.7477266788482666, + "learning_rate": 5e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7407299280166626, + "num_tokens": 302741027.0, + "step": 11700 + }, + { + "epoch": 1.2849769382824512, + "grad_norm": 1.9226528406143188, + "learning_rate": 5e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7341421842575073, + "num_tokens": 302764768.0, + "step": 11701 + }, + { + "epoch": 1.2850867559850647, + "grad_norm": 2.0557520389556885, + "learning_rate": 5e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7317643761634827, + "num_tokens": 302785307.0, + "step": 11702 + }, + { + "epoch": 1.2851965736876785, + "grad_norm": 1.708924412727356, + "learning_rate": 5e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7283714413642883, + "num_tokens": 302815737.0, + "step": 11703 + }, + { + "epoch": 1.285306391390292, + "grad_norm": 1.7045058012008667, + "learning_rate": 5e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7097009420394897, + "num_tokens": 302847260.0, + "step": 11704 + }, + { + "epoch": 1.2854162090929058, + "grad_norm": 1.7888466119766235, + "learning_rate": 5e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7467494010925293, + "num_tokens": 302873735.0, + "step": 11705 + }, + { + "epoch": 1.2855260267955195, + "grad_norm": 1.9291034936904907, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7312419414520264, + "num_tokens": 302897209.0, + "step": 11706 + }, + { + "epoch": 1.285635844498133, + "grad_norm": 1.7005114555358887, + "learning_rate": 5e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7236586809158325, + "num_tokens": 302926793.0, + "step": 11707 + }, + { + "epoch": 1.2857456622007468, + "grad_norm": 1.8066145181655884, + "learning_rate": 5e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7115054130554199, + "num_tokens": 302956535.0, + "step": 11708 + }, + { + "epoch": 1.2858554799033604, + "grad_norm": 1.7937275171279907, + "learning_rate": 5e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7186112403869629, + "num_tokens": 302983695.0, + "step": 11709 + }, + { + "epoch": 1.2859652976059741, + "grad_norm": 1.9235478639602661, + "learning_rate": 5e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7512363195419312, + "num_tokens": 303007663.0, + "step": 11710 + }, + { + "epoch": 1.2860751153085879, + "grad_norm": 1.735063076019287, + "learning_rate": 5e-06, + "loss": 0.7895, + "mean_token_accuracy": 0.7455734014511108, + "num_tokens": 303034640.0, + "step": 11711 + }, + { + "epoch": 1.2861849330112014, + "grad_norm": 1.7909321784973145, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7242637872695923, + "num_tokens": 303062391.0, + "step": 11712 + }, + { + "epoch": 1.286294750713815, + "grad_norm": 2.0173110961914062, + "learning_rate": 5e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7225400805473328, + "num_tokens": 303085658.0, + "step": 11713 + }, + { + "epoch": 1.2864045684164287, + "grad_norm": 1.7051613330841064, + "learning_rate": 5e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7041831016540527, + "num_tokens": 303115821.0, + "step": 11714 + }, + { + "epoch": 1.2865143861190425, + "grad_norm": 1.960971474647522, + "learning_rate": 5e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7219218611717224, + "num_tokens": 303141212.0, + "step": 11715 + }, + { + "epoch": 1.286624203821656, + "grad_norm": 2.042252779006958, + "learning_rate": 5e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.734227180480957, + "num_tokens": 303162640.0, + "step": 11716 + }, + { + "epoch": 1.2867340215242697, + "grad_norm": 1.715440273284912, + "learning_rate": 5e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7189079523086548, + "num_tokens": 303193651.0, + "step": 11717 + }, + { + "epoch": 1.2868438392268833, + "grad_norm": 2.1820597648620605, + "learning_rate": 5e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7352027893066406, + "num_tokens": 303213814.0, + "step": 11718 + }, + { + "epoch": 1.286953656929497, + "grad_norm": 1.8308881521224976, + "learning_rate": 5e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7359861731529236, + "num_tokens": 303239292.0, + "step": 11719 + }, + { + "epoch": 1.2870634746321108, + "grad_norm": 2.0432066917419434, + "learning_rate": 5e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7417100071907043, + "num_tokens": 303264054.0, + "step": 11720 + }, + { + "epoch": 1.2871732923347243, + "grad_norm": 1.93565034866333, + "learning_rate": 5e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7092692852020264, + "num_tokens": 303290419.0, + "step": 11721 + }, + { + "epoch": 1.287283110037338, + "grad_norm": 1.7971817255020142, + "learning_rate": 5e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7344374656677246, + "num_tokens": 303317757.0, + "step": 11722 + }, + { + "epoch": 1.2873929277399516, + "grad_norm": 2.1556382179260254, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7267705202102661, + "num_tokens": 303340645.0, + "step": 11723 + }, + { + "epoch": 1.2875027454425654, + "grad_norm": 1.8557820320129395, + "learning_rate": 5e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7351627945899963, + "num_tokens": 303366844.0, + "step": 11724 + }, + { + "epoch": 1.2876125631451791, + "grad_norm": 1.889068841934204, + "learning_rate": 5e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.7508531212806702, + "num_tokens": 303392644.0, + "step": 11725 + }, + { + "epoch": 1.2877223808477927, + "grad_norm": 1.8102531433105469, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7272564172744751, + "num_tokens": 303418499.0, + "step": 11726 + }, + { + "epoch": 1.2878321985504062, + "grad_norm": 1.9254752397537231, + "learning_rate": 5e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7284353971481323, + "num_tokens": 303443073.0, + "step": 11727 + }, + { + "epoch": 1.28794201625302, + "grad_norm": 2.061514139175415, + "learning_rate": 5e-06, + "loss": 0.6591, + "mean_token_accuracy": 0.7851660251617432, + "num_tokens": 303462027.0, + "step": 11728 + }, + { + "epoch": 1.2880518339556337, + "grad_norm": 1.8294997215270996, + "learning_rate": 5e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7099126577377319, + "num_tokens": 303490611.0, + "step": 11729 + }, + { + "epoch": 1.2881616516582473, + "grad_norm": 1.7928366661071777, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7366114854812622, + "num_tokens": 303519557.0, + "step": 11730 + }, + { + "epoch": 1.288271469360861, + "grad_norm": 1.739645004272461, + "learning_rate": 5e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.753502368927002, + "num_tokens": 303547255.0, + "step": 11731 + }, + { + "epoch": 1.2883812870634745, + "grad_norm": 1.794293999671936, + "learning_rate": 5e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.6988291144371033, + "num_tokens": 303577459.0, + "step": 11732 + }, + { + "epoch": 1.2884911047660883, + "grad_norm": 1.9137301445007324, + "learning_rate": 5e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7228668332099915, + "num_tokens": 303603177.0, + "step": 11733 + }, + { + "epoch": 1.288600922468702, + "grad_norm": 1.6785110235214233, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7407858967781067, + "num_tokens": 303632730.0, + "step": 11734 + }, + { + "epoch": 1.2887107401713156, + "grad_norm": 2.0026895999908447, + "learning_rate": 5e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7276557087898254, + "num_tokens": 303657525.0, + "step": 11735 + }, + { + "epoch": 1.2888205578739294, + "grad_norm": 2.1078615188598633, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7583893537521362, + "num_tokens": 303677308.0, + "step": 11736 + }, + { + "epoch": 1.2889303755765429, + "grad_norm": 1.8507931232452393, + "learning_rate": 5e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7123156785964966, + "num_tokens": 303704387.0, + "step": 11737 + }, + { + "epoch": 1.2890401932791566, + "grad_norm": 1.5159934759140015, + "learning_rate": 5e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7113709449768066, + "num_tokens": 303739580.0, + "step": 11738 + }, + { + "epoch": 1.2891500109817704, + "grad_norm": 1.9286454916000366, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7365862131118774, + "num_tokens": 303762328.0, + "step": 11739 + }, + { + "epoch": 1.289259828684384, + "grad_norm": 1.6686294078826904, + "learning_rate": 5e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.713793933391571, + "num_tokens": 303793665.0, + "step": 11740 + }, + { + "epoch": 1.2893696463869975, + "grad_norm": 2.161064624786377, + "learning_rate": 5e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7409408688545227, + "num_tokens": 303814425.0, + "step": 11741 + }, + { + "epoch": 1.2894794640896112, + "grad_norm": 2.009100914001465, + "learning_rate": 5e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7111321687698364, + "num_tokens": 303837574.0, + "step": 11742 + }, + { + "epoch": 1.289589281792225, + "grad_norm": 1.8524807691574097, + "learning_rate": 5e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7376255393028259, + "num_tokens": 303861309.0, + "step": 11743 + }, + { + "epoch": 1.2896990994948385, + "grad_norm": 2.010906219482422, + "learning_rate": 5e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7454876899719238, + "num_tokens": 303883366.0, + "step": 11744 + }, + { + "epoch": 1.2898089171974523, + "grad_norm": 1.8387157917022705, + "learning_rate": 5e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7076929807662964, + "num_tokens": 303913408.0, + "step": 11745 + }, + { + "epoch": 1.2899187349000658, + "grad_norm": 1.8191478252410889, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7425011396408081, + "num_tokens": 303940009.0, + "step": 11746 + }, + { + "epoch": 1.2900285526026796, + "grad_norm": 1.888149380683899, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7302796840667725, + "num_tokens": 303965962.0, + "step": 11747 + }, + { + "epoch": 1.2901383703052933, + "grad_norm": 1.7760183811187744, + "learning_rate": 5e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7319433689117432, + "num_tokens": 303993338.0, + "step": 11748 + }, + { + "epoch": 1.2902481880079069, + "grad_norm": 2.020390510559082, + "learning_rate": 5e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7580536603927612, + "num_tokens": 304014033.0, + "step": 11749 + }, + { + "epoch": 1.2903580057105206, + "grad_norm": 2.0372636318206787, + "learning_rate": 5e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7512192130088806, + "num_tokens": 304034903.0, + "step": 11750 + }, + { + "epoch": 1.2904678234131342, + "grad_norm": 1.9769294261932373, + "learning_rate": 5e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7495853304862976, + "num_tokens": 304056309.0, + "step": 11751 + }, + { + "epoch": 1.290577641115748, + "grad_norm": 1.9193462133407593, + "learning_rate": 5e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7572555541992188, + "num_tokens": 304079340.0, + "step": 11752 + }, + { + "epoch": 1.2906874588183614, + "grad_norm": 2.0938496589660645, + "learning_rate": 5e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7586340308189392, + "num_tokens": 304098945.0, + "step": 11753 + }, + { + "epoch": 1.2907972765209752, + "grad_norm": 1.744234561920166, + "learning_rate": 5e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7521814703941345, + "num_tokens": 304128194.0, + "step": 11754 + }, + { + "epoch": 1.2909070942235887, + "grad_norm": 1.9088706970214844, + "learning_rate": 5e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7292093634605408, + "num_tokens": 304158488.0, + "step": 11755 + }, + { + "epoch": 1.2910169119262025, + "grad_norm": 1.6471836566925049, + "learning_rate": 5e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6949319839477539, + "num_tokens": 304195539.0, + "step": 11756 + }, + { + "epoch": 1.2911267296288162, + "grad_norm": 1.880530834197998, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7413463592529297, + "num_tokens": 304220521.0, + "step": 11757 + }, + { + "epoch": 1.2912365473314298, + "grad_norm": 1.761056661605835, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7146251201629639, + "num_tokens": 304249151.0, + "step": 11758 + }, + { + "epoch": 1.2913463650340435, + "grad_norm": 1.9663575887680054, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7399464845657349, + "num_tokens": 304270733.0, + "step": 11759 + }, + { + "epoch": 1.291456182736657, + "grad_norm": 1.631891131401062, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7316197752952576, + "num_tokens": 304301613.0, + "step": 11760 + }, + { + "epoch": 1.2915660004392708, + "grad_norm": 1.897350788116455, + "learning_rate": 5e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7193639278411865, + "num_tokens": 304326398.0, + "step": 11761 + }, + { + "epoch": 1.2916758181418846, + "grad_norm": 1.5900379419326782, + "learning_rate": 5e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7325714230537415, + "num_tokens": 304358607.0, + "step": 11762 + }, + { + "epoch": 1.2917856358444981, + "grad_norm": 2.030165910720825, + "learning_rate": 5e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.728419840335846, + "num_tokens": 304380224.0, + "step": 11763 + }, + { + "epoch": 1.2918954535471117, + "grad_norm": 2.0886993408203125, + "learning_rate": 5e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7509397268295288, + "num_tokens": 304400589.0, + "step": 11764 + }, + { + "epoch": 1.2920052712497254, + "grad_norm": 2.024501085281372, + "learning_rate": 5e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7492949962615967, + "num_tokens": 304422999.0, + "step": 11765 + }, + { + "epoch": 1.2921150889523392, + "grad_norm": 1.9304484128952026, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.72639000415802, + "num_tokens": 304450164.0, + "step": 11766 + }, + { + "epoch": 1.2922249066549527, + "grad_norm": 1.7573238611221313, + "learning_rate": 5e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.740638256072998, + "num_tokens": 304478412.0, + "step": 11767 + }, + { + "epoch": 1.2923347243575665, + "grad_norm": 1.9167516231536865, + "learning_rate": 5e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7428745031356812, + "num_tokens": 304504933.0, + "step": 11768 + }, + { + "epoch": 1.29244454206018, + "grad_norm": 1.8672223091125488, + "learning_rate": 5e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7338628172874451, + "num_tokens": 304530229.0, + "step": 11769 + }, + { + "epoch": 1.2925543597627938, + "grad_norm": 1.6720707416534424, + "learning_rate": 5e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7397611141204834, + "num_tokens": 304561812.0, + "step": 11770 + }, + { + "epoch": 1.2926641774654075, + "grad_norm": 2.008927822113037, + "learning_rate": 5e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7142711877822876, + "num_tokens": 304587249.0, + "step": 11771 + }, + { + "epoch": 1.292773995168021, + "grad_norm": 2.0009407997131348, + "learning_rate": 5e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7323256134986877, + "num_tokens": 304609829.0, + "step": 11772 + }, + { + "epoch": 1.2928838128706348, + "grad_norm": 1.8332105875015259, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7216255068778992, + "num_tokens": 304636158.0, + "step": 11773 + }, + { + "epoch": 1.2929936305732483, + "grad_norm": 1.7847493886947632, + "learning_rate": 5e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.723590612411499, + "num_tokens": 304667731.0, + "step": 11774 + }, + { + "epoch": 1.293103448275862, + "grad_norm": 1.7798120975494385, + "learning_rate": 5e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7165931463241577, + "num_tokens": 304698005.0, + "step": 11775 + }, + { + "epoch": 1.2932132659784759, + "grad_norm": 1.9571549892425537, + "learning_rate": 5e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7287015318870544, + "num_tokens": 304722193.0, + "step": 11776 + }, + { + "epoch": 1.2933230836810894, + "grad_norm": 1.9811716079711914, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.725356936454773, + "num_tokens": 304746713.0, + "step": 11777 + }, + { + "epoch": 1.293432901383703, + "grad_norm": 1.9314053058624268, + "learning_rate": 5e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7414424419403076, + "num_tokens": 304770142.0, + "step": 11778 + }, + { + "epoch": 1.2935427190863167, + "grad_norm": 1.8300983905792236, + "learning_rate": 5e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.732918918132782, + "num_tokens": 304796805.0, + "step": 11779 + }, + { + "epoch": 1.2936525367889304, + "grad_norm": 2.078080892562866, + "learning_rate": 5e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7469680309295654, + "num_tokens": 304818027.0, + "step": 11780 + }, + { + "epoch": 1.293762354491544, + "grad_norm": 1.9265178442001343, + "learning_rate": 5e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.6992114782333374, + "num_tokens": 304845396.0, + "step": 11781 + }, + { + "epoch": 1.2938721721941577, + "grad_norm": 1.8361481428146362, + "learning_rate": 5e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7297929525375366, + "num_tokens": 304874354.0, + "step": 11782 + }, + { + "epoch": 1.2939819898967713, + "grad_norm": 1.914230227470398, + "learning_rate": 5e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7271744012832642, + "num_tokens": 304902157.0, + "step": 11783 + }, + { + "epoch": 1.294091807599385, + "grad_norm": 1.8363319635391235, + "learning_rate": 5e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7292804718017578, + "num_tokens": 304929662.0, + "step": 11784 + }, + { + "epoch": 1.2942016253019988, + "grad_norm": 1.8417868614196777, + "learning_rate": 5e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7354134321212769, + "num_tokens": 304955305.0, + "step": 11785 + }, + { + "epoch": 1.2943114430046123, + "grad_norm": 1.6379075050354004, + "learning_rate": 5e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7322659492492676, + "num_tokens": 304985143.0, + "step": 11786 + }, + { + "epoch": 1.294421260707226, + "grad_norm": 2.414431095123291, + "learning_rate": 5e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.7548832893371582, + "num_tokens": 305000904.0, + "step": 11787 + }, + { + "epoch": 1.2945310784098396, + "grad_norm": 1.9355071783065796, + "learning_rate": 5e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7400789260864258, + "num_tokens": 305026153.0, + "step": 11788 + }, + { + "epoch": 1.2946408961124534, + "grad_norm": 1.806825041770935, + "learning_rate": 5e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7320582866668701, + "num_tokens": 305055313.0, + "step": 11789 + }, + { + "epoch": 1.2947507138150671, + "grad_norm": 2.172295093536377, + "learning_rate": 5e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7362480759620667, + "num_tokens": 305075413.0, + "step": 11790 + }, + { + "epoch": 1.2948605315176807, + "grad_norm": 2.2557737827301025, + "learning_rate": 5e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7357622385025024, + "num_tokens": 305094500.0, + "step": 11791 + }, + { + "epoch": 1.2949703492202942, + "grad_norm": 2.0591607093811035, + "learning_rate": 5e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7449849843978882, + "num_tokens": 305119088.0, + "step": 11792 + }, + { + "epoch": 1.295080166922908, + "grad_norm": 1.5227128267288208, + "learning_rate": 5e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7220648527145386, + "num_tokens": 305156876.0, + "step": 11793 + }, + { + "epoch": 1.2951899846255217, + "grad_norm": 1.6736992597579956, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7278702855110168, + "num_tokens": 305186056.0, + "step": 11794 + }, + { + "epoch": 1.2952998023281352, + "grad_norm": 1.794203519821167, + "learning_rate": 5e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7212195992469788, + "num_tokens": 305212799.0, + "step": 11795 + }, + { + "epoch": 1.295409620030749, + "grad_norm": 1.763418436050415, + "learning_rate": 5e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7119301557540894, + "num_tokens": 305242107.0, + "step": 11796 + }, + { + "epoch": 1.2955194377333625, + "grad_norm": 1.9129656553268433, + "learning_rate": 5e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7252765893936157, + "num_tokens": 305265188.0, + "step": 11797 + }, + { + "epoch": 1.2956292554359763, + "grad_norm": 2.0304930210113525, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7434381246566772, + "num_tokens": 305286165.0, + "step": 11798 + }, + { + "epoch": 1.29573907313859, + "grad_norm": 2.1997742652893066, + "learning_rate": 5e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7399989366531372, + "num_tokens": 305307613.0, + "step": 11799 + }, + { + "epoch": 1.2958488908412036, + "grad_norm": 1.9993901252746582, + "learning_rate": 5e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7338061332702637, + "num_tokens": 305330081.0, + "step": 11800 + }, + { + "epoch": 1.2959587085438173, + "grad_norm": 1.8611418008804321, + "learning_rate": 5e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7328163385391235, + "num_tokens": 305357775.0, + "step": 11801 + }, + { + "epoch": 1.2960685262464309, + "grad_norm": 2.103083610534668, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7228721976280212, + "num_tokens": 305382069.0, + "step": 11802 + }, + { + "epoch": 1.2961783439490446, + "grad_norm": 1.846694827079773, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.734126091003418, + "num_tokens": 305409535.0, + "step": 11803 + }, + { + "epoch": 1.2962881616516582, + "grad_norm": 2.031073808670044, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7560387253761292, + "num_tokens": 305432308.0, + "step": 11804 + }, + { + "epoch": 1.296397979354272, + "grad_norm": 1.6649479866027832, + "learning_rate": 5e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7186514735221863, + "num_tokens": 305464217.0, + "step": 11805 + }, + { + "epoch": 1.2965077970568855, + "grad_norm": 1.8266104459762573, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7328280210494995, + "num_tokens": 305492161.0, + "step": 11806 + }, + { + "epoch": 1.2966176147594992, + "grad_norm": 1.896226167678833, + "learning_rate": 5e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7276809811592102, + "num_tokens": 305519119.0, + "step": 11807 + }, + { + "epoch": 1.296727432462113, + "grad_norm": 1.8659852743148804, + "learning_rate": 5e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7234357595443726, + "num_tokens": 305547731.0, + "step": 11808 + }, + { + "epoch": 1.2968372501647265, + "grad_norm": 1.7700954675674438, + "learning_rate": 5e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.739758312702179, + "num_tokens": 305577978.0, + "step": 11809 + }, + { + "epoch": 1.2969470678673403, + "grad_norm": 1.6802793741226196, + "learning_rate": 5e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7279040813446045, + "num_tokens": 305608761.0, + "step": 11810 + }, + { + "epoch": 1.2970568855699538, + "grad_norm": 1.8385666608810425, + "learning_rate": 5e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7795664072036743, + "num_tokens": 305632096.0, + "step": 11811 + }, + { + "epoch": 1.2971667032725676, + "grad_norm": 2.017324209213257, + "learning_rate": 5e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7366063594818115, + "num_tokens": 305655834.0, + "step": 11812 + }, + { + "epoch": 1.2972765209751813, + "grad_norm": 1.761689305305481, + "learning_rate": 5e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7524998188018799, + "num_tokens": 305682312.0, + "step": 11813 + }, + { + "epoch": 1.2973863386777948, + "grad_norm": 1.762754201889038, + "learning_rate": 5e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7459704875946045, + "num_tokens": 305709972.0, + "step": 11814 + }, + { + "epoch": 1.2974961563804086, + "grad_norm": 1.8134464025497437, + "learning_rate": 5e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.741669237613678, + "num_tokens": 305737879.0, + "step": 11815 + }, + { + "epoch": 1.2976059740830221, + "grad_norm": 1.7389485836029053, + "learning_rate": 5e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.710959255695343, + "num_tokens": 305766805.0, + "step": 11816 + }, + { + "epoch": 1.297715791785636, + "grad_norm": 1.837503433227539, + "learning_rate": 5e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7283381819725037, + "num_tokens": 305795096.0, + "step": 11817 + }, + { + "epoch": 1.2978256094882494, + "grad_norm": 1.7413750886917114, + "learning_rate": 5e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7335551381111145, + "num_tokens": 305822089.0, + "step": 11818 + }, + { + "epoch": 1.2979354271908632, + "grad_norm": 1.787343144416809, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7326786518096924, + "num_tokens": 305851549.0, + "step": 11819 + }, + { + "epoch": 1.2980452448934767, + "grad_norm": 1.8671621084213257, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7357933521270752, + "num_tokens": 305877458.0, + "step": 11820 + }, + { + "epoch": 1.2981550625960905, + "grad_norm": 1.8073084354400635, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7437576055526733, + "num_tokens": 305905752.0, + "step": 11821 + }, + { + "epoch": 1.2982648802987042, + "grad_norm": 1.843900203704834, + "learning_rate": 5e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7231172919273376, + "num_tokens": 305930527.0, + "step": 11822 + }, + { + "epoch": 1.2983746980013178, + "grad_norm": 1.8408818244934082, + "learning_rate": 5e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7086412906646729, + "num_tokens": 305958842.0, + "step": 11823 + }, + { + "epoch": 1.2984845157039315, + "grad_norm": 1.9595273733139038, + "learning_rate": 5e-06, + "loss": 0.936, + "mean_token_accuracy": 0.705895721912384, + "num_tokens": 305983377.0, + "step": 11824 + }, + { + "epoch": 1.298594333406545, + "grad_norm": 1.971308708190918, + "learning_rate": 5e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7396039962768555, + "num_tokens": 306005387.0, + "step": 11825 + }, + { + "epoch": 1.2987041511091588, + "grad_norm": 2.0622310638427734, + "learning_rate": 5e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7451146841049194, + "num_tokens": 306027957.0, + "step": 11826 + }, + { + "epoch": 1.2988139688117726, + "grad_norm": 1.8963534832000732, + "learning_rate": 5e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7442115545272827, + "num_tokens": 306051909.0, + "step": 11827 + }, + { + "epoch": 1.2989237865143861, + "grad_norm": 1.804295301437378, + "learning_rate": 5e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7177731394767761, + "num_tokens": 306080017.0, + "step": 11828 + }, + { + "epoch": 1.2990336042169996, + "grad_norm": 1.8944976329803467, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7278172969818115, + "num_tokens": 306104333.0, + "step": 11829 + }, + { + "epoch": 1.2991434219196134, + "grad_norm": 1.8080652952194214, + "learning_rate": 5e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7324246764183044, + "num_tokens": 306131598.0, + "step": 11830 + }, + { + "epoch": 1.2992532396222272, + "grad_norm": 2.0550878047943115, + "learning_rate": 5e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7412379384040833, + "num_tokens": 306154472.0, + "step": 11831 + }, + { + "epoch": 1.2993630573248407, + "grad_norm": 1.677868127822876, + "learning_rate": 5e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7529421448707581, + "num_tokens": 306180536.0, + "step": 11832 + }, + { + "epoch": 1.2994728750274545, + "grad_norm": 2.0280323028564453, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7282114028930664, + "num_tokens": 306204131.0, + "step": 11833 + }, + { + "epoch": 1.299582692730068, + "grad_norm": 1.9786702394485474, + "learning_rate": 5e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7366019487380981, + "num_tokens": 306227839.0, + "step": 11834 + }, + { + "epoch": 1.2996925104326817, + "grad_norm": 2.0938210487365723, + "learning_rate": 5e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7424058318138123, + "num_tokens": 306249458.0, + "step": 11835 + }, + { + "epoch": 1.2998023281352955, + "grad_norm": 1.6728839874267578, + "learning_rate": 5e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7183485627174377, + "num_tokens": 306281209.0, + "step": 11836 + }, + { + "epoch": 1.299912145837909, + "grad_norm": 1.8822122812271118, + "learning_rate": 5e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7304404973983765, + "num_tokens": 306307453.0, + "step": 11837 + }, + { + "epoch": 1.3000219635405228, + "grad_norm": 2.118298292160034, + "learning_rate": 5e-06, + "loss": 0.774, + "mean_token_accuracy": 0.748643159866333, + "num_tokens": 306326184.0, + "step": 11838 + }, + { + "epoch": 1.3001317812431363, + "grad_norm": 2.3158624172210693, + "learning_rate": 5e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.74506014585495, + "num_tokens": 306343218.0, + "step": 11839 + }, + { + "epoch": 1.30024159894575, + "grad_norm": 1.6434919834136963, + "learning_rate": 5e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7274149656295776, + "num_tokens": 306378140.0, + "step": 11840 + }, + { + "epoch": 1.3003514166483638, + "grad_norm": 1.9381077289581299, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7679997682571411, + "num_tokens": 306400982.0, + "step": 11841 + }, + { + "epoch": 1.3004612343509774, + "grad_norm": 1.8339558839797974, + "learning_rate": 5e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7246675491333008, + "num_tokens": 306427209.0, + "step": 11842 + }, + { + "epoch": 1.300571052053591, + "grad_norm": 2.0771915912628174, + "learning_rate": 5e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7359644770622253, + "num_tokens": 306447546.0, + "step": 11843 + }, + { + "epoch": 1.3006808697562047, + "grad_norm": 1.9148973226547241, + "learning_rate": 5e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7505518198013306, + "num_tokens": 306470753.0, + "step": 11844 + }, + { + "epoch": 1.3007906874588184, + "grad_norm": 1.6522539854049683, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7313709259033203, + "num_tokens": 306502927.0, + "step": 11845 + }, + { + "epoch": 1.300900505161432, + "grad_norm": 1.6556897163391113, + "learning_rate": 5e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7075357437133789, + "num_tokens": 306534123.0, + "step": 11846 + }, + { + "epoch": 1.3010103228640457, + "grad_norm": 1.6514075994491577, + "learning_rate": 5e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.723293662071228, + "num_tokens": 306565298.0, + "step": 11847 + }, + { + "epoch": 1.3011201405666593, + "grad_norm": 2.0386879444122314, + "learning_rate": 5e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7404986619949341, + "num_tokens": 306586960.0, + "step": 11848 + }, + { + "epoch": 1.301229958269273, + "grad_norm": 2.058155059814453, + "learning_rate": 5e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.731735348701477, + "num_tokens": 306608523.0, + "step": 11849 + }, + { + "epoch": 1.3013397759718868, + "grad_norm": 1.727220892906189, + "learning_rate": 5e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7141035795211792, + "num_tokens": 306640032.0, + "step": 11850 + }, + { + "epoch": 1.3014495936745003, + "grad_norm": 1.9000474214553833, + "learning_rate": 5e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7490930557250977, + "num_tokens": 306663874.0, + "step": 11851 + }, + { + "epoch": 1.301559411377114, + "grad_norm": 2.181238889694214, + "learning_rate": 5e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7417696714401245, + "num_tokens": 306686224.0, + "step": 11852 + }, + { + "epoch": 1.3016692290797276, + "grad_norm": 1.9643155336380005, + "learning_rate": 5e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7441729307174683, + "num_tokens": 306711667.0, + "step": 11853 + }, + { + "epoch": 1.3017790467823414, + "grad_norm": 1.8439815044403076, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7390345335006714, + "num_tokens": 306736394.0, + "step": 11854 + }, + { + "epoch": 1.301888864484955, + "grad_norm": 1.7786126136779785, + "learning_rate": 5e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7592049241065979, + "num_tokens": 306763775.0, + "step": 11855 + }, + { + "epoch": 1.3019986821875686, + "grad_norm": 2.20721435546875, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7315061092376709, + "num_tokens": 306783344.0, + "step": 11856 + }, + { + "epoch": 1.3021084998901822, + "grad_norm": 1.8703560829162598, + "learning_rate": 5e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7320369482040405, + "num_tokens": 306807702.0, + "step": 11857 + }, + { + "epoch": 1.302218317592796, + "grad_norm": 1.790876030921936, + "learning_rate": 5e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.723965048789978, + "num_tokens": 306836498.0, + "step": 11858 + }, + { + "epoch": 1.3023281352954097, + "grad_norm": 1.943070650100708, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7334338426589966, + "num_tokens": 306860823.0, + "step": 11859 + }, + { + "epoch": 1.3024379529980232, + "grad_norm": 1.7620967626571655, + "learning_rate": 5e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7205358743667603, + "num_tokens": 306891521.0, + "step": 11860 + }, + { + "epoch": 1.302547770700637, + "grad_norm": 1.9097800254821777, + "learning_rate": 5e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7485082149505615, + "num_tokens": 306915395.0, + "step": 11861 + }, + { + "epoch": 1.3026575884032505, + "grad_norm": 1.9927167892456055, + "learning_rate": 5e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7446999549865723, + "num_tokens": 306938110.0, + "step": 11862 + }, + { + "epoch": 1.3027674061058643, + "grad_norm": 2.22312331199646, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7364637851715088, + "num_tokens": 306959213.0, + "step": 11863 + }, + { + "epoch": 1.302877223808478, + "grad_norm": 1.8999500274658203, + "learning_rate": 5e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7118976712226868, + "num_tokens": 306989123.0, + "step": 11864 + }, + { + "epoch": 1.3029870415110916, + "grad_norm": 1.9049657583236694, + "learning_rate": 5e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7228116989135742, + "num_tokens": 307014830.0, + "step": 11865 + }, + { + "epoch": 1.3030968592137053, + "grad_norm": 1.7706016302108765, + "learning_rate": 5e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7383776903152466, + "num_tokens": 307044347.0, + "step": 11866 + }, + { + "epoch": 1.3032066769163189, + "grad_norm": 2.021538257598877, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7249912619590759, + "num_tokens": 307068705.0, + "step": 11867 + }, + { + "epoch": 1.3033164946189326, + "grad_norm": 1.9618003368377686, + "learning_rate": 5e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7217274904251099, + "num_tokens": 307092691.0, + "step": 11868 + }, + { + "epoch": 1.3034263123215462, + "grad_norm": 2.1232399940490723, + "learning_rate": 5e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7260259985923767, + "num_tokens": 307116230.0, + "step": 11869 + }, + { + "epoch": 1.30353613002416, + "grad_norm": 1.8944331407546997, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.73816978931427, + "num_tokens": 307140236.0, + "step": 11870 + }, + { + "epoch": 1.3036459477267734, + "grad_norm": 1.780537486076355, + "learning_rate": 5e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7567128539085388, + "num_tokens": 307166586.0, + "step": 11871 + }, + { + "epoch": 1.3037557654293872, + "grad_norm": 1.6310977935791016, + "learning_rate": 5e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7355915307998657, + "num_tokens": 307198672.0, + "step": 11872 + }, + { + "epoch": 1.303865583132001, + "grad_norm": 2.0242395401000977, + "learning_rate": 5e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7460193634033203, + "num_tokens": 307220794.0, + "step": 11873 + }, + { + "epoch": 1.3039754008346145, + "grad_norm": 2.0411570072174072, + "learning_rate": 5e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7193129658699036, + "num_tokens": 307243580.0, + "step": 11874 + }, + { + "epoch": 1.3040852185372283, + "grad_norm": 1.6667169332504272, + "learning_rate": 5e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7011449337005615, + "num_tokens": 307276643.0, + "step": 11875 + }, + { + "epoch": 1.3041950362398418, + "grad_norm": 1.7811559438705444, + "learning_rate": 5e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7198227643966675, + "num_tokens": 307305833.0, + "step": 11876 + }, + { + "epoch": 1.3043048539424555, + "grad_norm": 2.0617423057556152, + "learning_rate": 5e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7607349157333374, + "num_tokens": 307327280.0, + "step": 11877 + }, + { + "epoch": 1.3044146716450693, + "grad_norm": 1.8469325304031372, + "learning_rate": 5e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7029030919075012, + "num_tokens": 307353012.0, + "step": 11878 + }, + { + "epoch": 1.3045244893476828, + "grad_norm": 2.1005613803863525, + "learning_rate": 5e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7329665422439575, + "num_tokens": 307372909.0, + "step": 11879 + }, + { + "epoch": 1.3046343070502964, + "grad_norm": 1.9590872526168823, + "learning_rate": 5e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7558572292327881, + "num_tokens": 307395338.0, + "step": 11880 + }, + { + "epoch": 1.3047441247529101, + "grad_norm": 1.9225256443023682, + "learning_rate": 5e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7418062686920166, + "num_tokens": 307419405.0, + "step": 11881 + }, + { + "epoch": 1.3048539424555239, + "grad_norm": 1.826781988143921, + "learning_rate": 5e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7530216574668884, + "num_tokens": 307445059.0, + "step": 11882 + }, + { + "epoch": 1.3049637601581374, + "grad_norm": 2.141908645629883, + "learning_rate": 5e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7553316950798035, + "num_tokens": 307466400.0, + "step": 11883 + }, + { + "epoch": 1.3050735778607512, + "grad_norm": 1.931262493133545, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7303982973098755, + "num_tokens": 307490211.0, + "step": 11884 + }, + { + "epoch": 1.3051833955633647, + "grad_norm": 2.16943097114563, + "learning_rate": 5e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7248823046684265, + "num_tokens": 307511398.0, + "step": 11885 + }, + { + "epoch": 1.3052932132659785, + "grad_norm": 2.0398006439208984, + "learning_rate": 5e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.742951512336731, + "num_tokens": 307534675.0, + "step": 11886 + }, + { + "epoch": 1.3054030309685922, + "grad_norm": 1.9271985292434692, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7296005487442017, + "num_tokens": 307559705.0, + "step": 11887 + }, + { + "epoch": 1.3055128486712058, + "grad_norm": 1.9641928672790527, + "learning_rate": 5e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7425824403762817, + "num_tokens": 307583771.0, + "step": 11888 + }, + { + "epoch": 1.3056226663738195, + "grad_norm": 1.8567039966583252, + "learning_rate": 5e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7467153668403625, + "num_tokens": 307607620.0, + "step": 11889 + }, + { + "epoch": 1.305732484076433, + "grad_norm": 1.726571798324585, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7255431413650513, + "num_tokens": 307640925.0, + "step": 11890 + }, + { + "epoch": 1.3058423017790468, + "grad_norm": 2.1261191368103027, + "learning_rate": 5e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7347694039344788, + "num_tokens": 307661000.0, + "step": 11891 + }, + { + "epoch": 1.3059521194816606, + "grad_norm": 1.817651391029358, + "learning_rate": 5e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7362043857574463, + "num_tokens": 307688508.0, + "step": 11892 + }, + { + "epoch": 1.306061937184274, + "grad_norm": 1.9322874546051025, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7278776168823242, + "num_tokens": 307716763.0, + "step": 11893 + }, + { + "epoch": 1.3061717548868876, + "grad_norm": 1.8518611192703247, + "learning_rate": 5e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7444823980331421, + "num_tokens": 307743168.0, + "step": 11894 + }, + { + "epoch": 1.3062815725895014, + "grad_norm": 1.8953709602355957, + "learning_rate": 5e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7486624717712402, + "num_tokens": 307768587.0, + "step": 11895 + }, + { + "epoch": 1.3063913902921152, + "grad_norm": 1.8456213474273682, + "learning_rate": 5e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7425028085708618, + "num_tokens": 307793380.0, + "step": 11896 + }, + { + "epoch": 1.3065012079947287, + "grad_norm": 1.9214568138122559, + "learning_rate": 5e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7247101664543152, + "num_tokens": 307820158.0, + "step": 11897 + }, + { + "epoch": 1.3066110256973424, + "grad_norm": 1.7103408575057983, + "learning_rate": 5e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7308142185211182, + "num_tokens": 307854088.0, + "step": 11898 + }, + { + "epoch": 1.306720843399956, + "grad_norm": 1.8086192607879639, + "learning_rate": 5e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7026331424713135, + "num_tokens": 307882398.0, + "step": 11899 + }, + { + "epoch": 1.3068306611025697, + "grad_norm": 1.9357004165649414, + "learning_rate": 5e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7512966394424438, + "num_tokens": 307905371.0, + "step": 11900 + }, + { + "epoch": 1.3069404788051835, + "grad_norm": 1.7842397689819336, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7163656949996948, + "num_tokens": 307933342.0, + "step": 11901 + }, + { + "epoch": 1.307050296507797, + "grad_norm": 1.9831725358963013, + "learning_rate": 5e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7578788995742798, + "num_tokens": 307955523.0, + "step": 11902 + }, + { + "epoch": 1.3071601142104108, + "grad_norm": 1.686946153640747, + "learning_rate": 5e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7339390516281128, + "num_tokens": 307984268.0, + "step": 11903 + }, + { + "epoch": 1.3072699319130243, + "grad_norm": 1.8454577922821045, + "learning_rate": 5e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7199212312698364, + "num_tokens": 308009720.0, + "step": 11904 + }, + { + "epoch": 1.307379749615638, + "grad_norm": 1.8689696788787842, + "learning_rate": 5e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7228168249130249, + "num_tokens": 308038174.0, + "step": 11905 + }, + { + "epoch": 1.3074895673182518, + "grad_norm": 1.7725883722305298, + "learning_rate": 5e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7488903999328613, + "num_tokens": 308064963.0, + "step": 11906 + }, + { + "epoch": 1.3075993850208654, + "grad_norm": 1.8187962770462036, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7512446641921997, + "num_tokens": 308091816.0, + "step": 11907 + }, + { + "epoch": 1.307709202723479, + "grad_norm": 1.9101232290267944, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7281270027160645, + "num_tokens": 308117229.0, + "step": 11908 + }, + { + "epoch": 1.3078190204260927, + "grad_norm": 1.9544366598129272, + "learning_rate": 5e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7210813760757446, + "num_tokens": 308141915.0, + "step": 11909 + }, + { + "epoch": 1.3079288381287064, + "grad_norm": 1.9352331161499023, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7377767562866211, + "num_tokens": 308163938.0, + "step": 11910 + }, + { + "epoch": 1.30803865583132, + "grad_norm": 1.6434087753295898, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7315430045127869, + "num_tokens": 308195696.0, + "step": 11911 + }, + { + "epoch": 1.3081484735339337, + "grad_norm": 1.8341693878173828, + "learning_rate": 5e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7504374384880066, + "num_tokens": 308220902.0, + "step": 11912 + }, + { + "epoch": 1.3082582912365472, + "grad_norm": 1.9490545988082886, + "learning_rate": 5e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7380075454711914, + "num_tokens": 308243502.0, + "step": 11913 + }, + { + "epoch": 1.308368108939161, + "grad_norm": 1.7290891408920288, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7323204278945923, + "num_tokens": 308271912.0, + "step": 11914 + }, + { + "epoch": 1.3084779266417748, + "grad_norm": 1.9802459478378296, + "learning_rate": 5e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.7518419027328491, + "num_tokens": 308291388.0, + "step": 11915 + }, + { + "epoch": 1.3085877443443883, + "grad_norm": 1.8437548875808716, + "learning_rate": 5e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7436371445655823, + "num_tokens": 308315815.0, + "step": 11916 + }, + { + "epoch": 1.308697562047002, + "grad_norm": 1.657004952430725, + "learning_rate": 5e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7430210113525391, + "num_tokens": 308347272.0, + "step": 11917 + }, + { + "epoch": 1.3088073797496156, + "grad_norm": 1.6834253072738647, + "learning_rate": 5e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7320770621299744, + "num_tokens": 308377968.0, + "step": 11918 + }, + { + "epoch": 1.3089171974522293, + "grad_norm": 1.8493512868881226, + "learning_rate": 5e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7293300628662109, + "num_tokens": 308405858.0, + "step": 11919 + }, + { + "epoch": 1.309027015154843, + "grad_norm": 2.0810298919677734, + "learning_rate": 5e-06, + "loss": 0.7898, + "mean_token_accuracy": 0.7421119809150696, + "num_tokens": 308428157.0, + "step": 11920 + }, + { + "epoch": 1.3091368328574566, + "grad_norm": 1.8321092128753662, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7361420392990112, + "num_tokens": 308453734.0, + "step": 11921 + }, + { + "epoch": 1.3092466505600702, + "grad_norm": 2.0356342792510986, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7424307465553284, + "num_tokens": 308476997.0, + "step": 11922 + }, + { + "epoch": 1.309356468262684, + "grad_norm": 1.8190395832061768, + "learning_rate": 5e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7458257675170898, + "num_tokens": 308501199.0, + "step": 11923 + }, + { + "epoch": 1.3094662859652977, + "grad_norm": 2.05592679977417, + "learning_rate": 5e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7285932302474976, + "num_tokens": 308526130.0, + "step": 11924 + }, + { + "epoch": 1.3095761036679112, + "grad_norm": 2.150024175643921, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7380648851394653, + "num_tokens": 308546932.0, + "step": 11925 + }, + { + "epoch": 1.309685921370525, + "grad_norm": 1.9795777797698975, + "learning_rate": 5e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7512423396110535, + "num_tokens": 308572591.0, + "step": 11926 + }, + { + "epoch": 1.3097957390731385, + "grad_norm": 1.9452580213546753, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7230806350708008, + "num_tokens": 308599270.0, + "step": 11927 + }, + { + "epoch": 1.3099055567757523, + "grad_norm": 2.1402392387390137, + "learning_rate": 5e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7350150942802429, + "num_tokens": 308621749.0, + "step": 11928 + }, + { + "epoch": 1.310015374478366, + "grad_norm": 2.1110706329345703, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7514229416847229, + "num_tokens": 308641761.0, + "step": 11929 + }, + { + "epoch": 1.3101251921809796, + "grad_norm": 1.6756694316864014, + "learning_rate": 5e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7067244648933411, + "num_tokens": 308671855.0, + "step": 11930 + }, + { + "epoch": 1.3102350098835933, + "grad_norm": 1.9560819864273071, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7199802398681641, + "num_tokens": 308699079.0, + "step": 11931 + }, + { + "epoch": 1.3103448275862069, + "grad_norm": 2.242915391921997, + "learning_rate": 5e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7574080228805542, + "num_tokens": 308716277.0, + "step": 11932 + }, + { + "epoch": 1.3104546452888206, + "grad_norm": 1.9275215864181519, + "learning_rate": 5e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7233189344406128, + "num_tokens": 308740809.0, + "step": 11933 + }, + { + "epoch": 1.3105644629914341, + "grad_norm": 1.8416593074798584, + "learning_rate": 5e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7380611896514893, + "num_tokens": 308767452.0, + "step": 11934 + }, + { + "epoch": 1.310674280694048, + "grad_norm": 1.8072357177734375, + "learning_rate": 5e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7232425212860107, + "num_tokens": 308793537.0, + "step": 11935 + }, + { + "epoch": 1.3107840983966614, + "grad_norm": 1.9135069847106934, + "learning_rate": 5e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7281574010848999, + "num_tokens": 308819537.0, + "step": 11936 + }, + { + "epoch": 1.3108939160992752, + "grad_norm": 1.7316293716430664, + "learning_rate": 5e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.740667462348938, + "num_tokens": 308846503.0, + "step": 11937 + }, + { + "epoch": 1.311003733801889, + "grad_norm": 2.0412063598632812, + "learning_rate": 5e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.732208251953125, + "num_tokens": 308869284.0, + "step": 11938 + }, + { + "epoch": 1.3111135515045025, + "grad_norm": 2.067420721054077, + "learning_rate": 5e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.7461537718772888, + "num_tokens": 308891908.0, + "step": 11939 + }, + { + "epoch": 1.3112233692071162, + "grad_norm": 1.711336374282837, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7271630167961121, + "num_tokens": 308923449.0, + "step": 11940 + }, + { + "epoch": 1.3113331869097298, + "grad_norm": 2.085088014602661, + "learning_rate": 5e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7508463859558105, + "num_tokens": 308943585.0, + "step": 11941 + }, + { + "epoch": 1.3114430046123435, + "grad_norm": 1.952559232711792, + "learning_rate": 5e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.7472249865531921, + "num_tokens": 308965888.0, + "step": 11942 + }, + { + "epoch": 1.3115528223149573, + "grad_norm": 1.960726261138916, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7295595407485962, + "num_tokens": 308991297.0, + "step": 11943 + }, + { + "epoch": 1.3116626400175708, + "grad_norm": 1.8654978275299072, + "learning_rate": 5e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7423679828643799, + "num_tokens": 309018418.0, + "step": 11944 + }, + { + "epoch": 1.3117724577201844, + "grad_norm": 1.7068495750427246, + "learning_rate": 5e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7395067811012268, + "num_tokens": 309045053.0, + "step": 11945 + }, + { + "epoch": 1.3118822754227981, + "grad_norm": 1.953640103340149, + "learning_rate": 5e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7190015316009521, + "num_tokens": 309070844.0, + "step": 11946 + }, + { + "epoch": 1.3119920931254119, + "grad_norm": 1.809044361114502, + "learning_rate": 5e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7491941452026367, + "num_tokens": 309099440.0, + "step": 11947 + }, + { + "epoch": 1.3121019108280254, + "grad_norm": 2.096959352493286, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7256478071212769, + "num_tokens": 309121544.0, + "step": 11948 + }, + { + "epoch": 1.3122117285306392, + "grad_norm": 1.9703550338745117, + "learning_rate": 5e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.740880012512207, + "num_tokens": 309145709.0, + "step": 11949 + }, + { + "epoch": 1.3123215462332527, + "grad_norm": 1.801046371459961, + "learning_rate": 5e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7514635920524597, + "num_tokens": 309173357.0, + "step": 11950 + }, + { + "epoch": 1.3124313639358665, + "grad_norm": 1.7584460973739624, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7134721875190735, + "num_tokens": 309203089.0, + "step": 11951 + }, + { + "epoch": 1.3125411816384802, + "grad_norm": 2.2313530445098877, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7348580360412598, + "num_tokens": 309223381.0, + "step": 11952 + }, + { + "epoch": 1.3126509993410937, + "grad_norm": 2.06307315826416, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7392059564590454, + "num_tokens": 309245155.0, + "step": 11953 + }, + { + "epoch": 1.3127608170437075, + "grad_norm": 1.792134165763855, + "learning_rate": 5e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7375032901763916, + "num_tokens": 309271583.0, + "step": 11954 + }, + { + "epoch": 1.312870634746321, + "grad_norm": 1.6553879976272583, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7023518681526184, + "num_tokens": 309302689.0, + "step": 11955 + }, + { + "epoch": 1.3129804524489348, + "grad_norm": 2.2415595054626465, + "learning_rate": 5e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7413960099220276, + "num_tokens": 309321689.0, + "step": 11956 + }, + { + "epoch": 1.3130902701515486, + "grad_norm": 1.7526620626449585, + "learning_rate": 5e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7142046689987183, + "num_tokens": 309354417.0, + "step": 11957 + }, + { + "epoch": 1.313200087854162, + "grad_norm": 1.9910675287246704, + "learning_rate": 5e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7438333630561829, + "num_tokens": 309376340.0, + "step": 11958 + }, + { + "epoch": 1.3133099055567756, + "grad_norm": 1.9732829332351685, + "learning_rate": 5e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.737389087677002, + "num_tokens": 309399206.0, + "step": 11959 + }, + { + "epoch": 1.3134197232593894, + "grad_norm": 2.0016777515411377, + "learning_rate": 5e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7433158755302429, + "num_tokens": 309422372.0, + "step": 11960 + }, + { + "epoch": 1.3135295409620031, + "grad_norm": 1.9207059144973755, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7523725032806396, + "num_tokens": 309447128.0, + "step": 11961 + }, + { + "epoch": 1.3136393586646167, + "grad_norm": 1.9158744812011719, + "learning_rate": 5e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7117093801498413, + "num_tokens": 309473026.0, + "step": 11962 + }, + { + "epoch": 1.3137491763672304, + "grad_norm": 1.7654553651809692, + "learning_rate": 5e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7427952289581299, + "num_tokens": 309501107.0, + "step": 11963 + }, + { + "epoch": 1.313858994069844, + "grad_norm": 1.6820398569107056, + "learning_rate": 5e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.7493577003479004, + "num_tokens": 309531587.0, + "step": 11964 + }, + { + "epoch": 1.3139688117724577, + "grad_norm": 2.0224382877349854, + "learning_rate": 5e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7289032340049744, + "num_tokens": 309554811.0, + "step": 11965 + }, + { + "epoch": 1.3140786294750715, + "grad_norm": 1.9386178255081177, + "learning_rate": 5e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7239757776260376, + "num_tokens": 309580005.0, + "step": 11966 + }, + { + "epoch": 1.314188447177685, + "grad_norm": 1.9377614259719849, + "learning_rate": 5e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7531899809837341, + "num_tokens": 309602333.0, + "step": 11967 + }, + { + "epoch": 1.3142982648802988, + "grad_norm": 2.2254674434661865, + "learning_rate": 5e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7660328149795532, + "num_tokens": 309621329.0, + "step": 11968 + }, + { + "epoch": 1.3144080825829123, + "grad_norm": 1.9029028415679932, + "learning_rate": 5e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.75294029712677, + "num_tokens": 309645871.0, + "step": 11969 + }, + { + "epoch": 1.314517900285526, + "grad_norm": 2.0353844165802, + "learning_rate": 5e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7257647514343262, + "num_tokens": 309668903.0, + "step": 11970 + }, + { + "epoch": 1.3146277179881398, + "grad_norm": 1.7906748056411743, + "learning_rate": 5e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7114377021789551, + "num_tokens": 309699415.0, + "step": 11971 + }, + { + "epoch": 1.3147375356907534, + "grad_norm": 1.7397418022155762, + "learning_rate": 5e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7405484914779663, + "num_tokens": 309730496.0, + "step": 11972 + }, + { + "epoch": 1.314847353393367, + "grad_norm": 1.5830453634262085, + "learning_rate": 5e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.733564019203186, + "num_tokens": 309764678.0, + "step": 11973 + }, + { + "epoch": 1.3149571710959806, + "grad_norm": 2.0802183151245117, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7388635873794556, + "num_tokens": 309784060.0, + "step": 11974 + }, + { + "epoch": 1.3150669887985944, + "grad_norm": 1.97611665725708, + "learning_rate": 5e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.7495614290237427, + "num_tokens": 309806181.0, + "step": 11975 + }, + { + "epoch": 1.315176806501208, + "grad_norm": 1.7968823909759521, + "learning_rate": 5e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7360383868217468, + "num_tokens": 309834883.0, + "step": 11976 + }, + { + "epoch": 1.3152866242038217, + "grad_norm": 2.164890766143799, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.748544454574585, + "num_tokens": 309854565.0, + "step": 11977 + }, + { + "epoch": 1.3153964419064352, + "grad_norm": 1.9556174278259277, + "learning_rate": 5e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7384469509124756, + "num_tokens": 309875559.0, + "step": 11978 + }, + { + "epoch": 1.315506259609049, + "grad_norm": 1.7662776708602905, + "learning_rate": 5e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7322256565093994, + "num_tokens": 309902436.0, + "step": 11979 + }, + { + "epoch": 1.3156160773116627, + "grad_norm": 1.7713572978973389, + "learning_rate": 5e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7182571887969971, + "num_tokens": 309930688.0, + "step": 11980 + }, + { + "epoch": 1.3157258950142763, + "grad_norm": 1.7327213287353516, + "learning_rate": 5e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7384098768234253, + "num_tokens": 309960167.0, + "step": 11981 + }, + { + "epoch": 1.31583571271689, + "grad_norm": 1.9903925657272339, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7312887907028198, + "num_tokens": 309983719.0, + "step": 11982 + }, + { + "epoch": 1.3159455304195036, + "grad_norm": 1.6334359645843506, + "learning_rate": 5e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7335476875305176, + "num_tokens": 310016042.0, + "step": 11983 + }, + { + "epoch": 1.3160553481221173, + "grad_norm": 1.9730494022369385, + "learning_rate": 5e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7457639575004578, + "num_tokens": 310039818.0, + "step": 11984 + }, + { + "epoch": 1.3161651658247309, + "grad_norm": 1.7358726263046265, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7247741222381592, + "num_tokens": 310067352.0, + "step": 11985 + }, + { + "epoch": 1.3162749835273446, + "grad_norm": 1.9149919748306274, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7333052754402161, + "num_tokens": 310090280.0, + "step": 11986 + }, + { + "epoch": 1.3163848012299582, + "grad_norm": 1.6853958368301392, + "learning_rate": 5e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7234315872192383, + "num_tokens": 310119222.0, + "step": 11987 + }, + { + "epoch": 1.316494618932572, + "grad_norm": 1.690364956855774, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.714897632598877, + "num_tokens": 310149774.0, + "step": 11988 + }, + { + "epoch": 1.3166044366351857, + "grad_norm": 1.8943465948104858, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7243632674217224, + "num_tokens": 310176222.0, + "step": 11989 + }, + { + "epoch": 1.3167142543377992, + "grad_norm": 2.091798782348633, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7286309003829956, + "num_tokens": 310198088.0, + "step": 11990 + }, + { + "epoch": 1.316824072040413, + "grad_norm": 1.6964576244354248, + "learning_rate": 5e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7069878578186035, + "num_tokens": 310230619.0, + "step": 11991 + }, + { + "epoch": 1.3169338897430265, + "grad_norm": 1.7111049890518188, + "learning_rate": 5e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.750847578048706, + "num_tokens": 310258886.0, + "step": 11992 + }, + { + "epoch": 1.3170437074456403, + "grad_norm": 2.173152208328247, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7350023984909058, + "num_tokens": 310279415.0, + "step": 11993 + }, + { + "epoch": 1.317153525148254, + "grad_norm": 1.8144201040267944, + "learning_rate": 5e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.760248601436615, + "num_tokens": 310303916.0, + "step": 11994 + }, + { + "epoch": 1.3172633428508675, + "grad_norm": 1.8926140069961548, + "learning_rate": 5e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7278830409049988, + "num_tokens": 310329456.0, + "step": 11995 + }, + { + "epoch": 1.3173731605534813, + "grad_norm": 1.6666547060012817, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7550656199455261, + "num_tokens": 310357480.0, + "step": 11996 + }, + { + "epoch": 1.3174829782560948, + "grad_norm": 1.7659715414047241, + "learning_rate": 5e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7485970854759216, + "num_tokens": 310383603.0, + "step": 11997 + }, + { + "epoch": 1.3175927959587086, + "grad_norm": 1.7894973754882812, + "learning_rate": 5e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7412524819374084, + "num_tokens": 310408419.0, + "step": 11998 + }, + { + "epoch": 1.3177026136613221, + "grad_norm": 1.7062430381774902, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7146249413490295, + "num_tokens": 310437282.0, + "step": 11999 + }, + { + "epoch": 1.3178124313639359, + "grad_norm": 1.951481819152832, + "learning_rate": 5e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7353812456130981, + "num_tokens": 310460110.0, + "step": 12000 + }, + { + "epoch": 1.3179222490665494, + "grad_norm": 1.7883254289627075, + "learning_rate": 5e-06, + "loss": 0.781, + "mean_token_accuracy": 0.7489550709724426, + "num_tokens": 310487473.0, + "step": 12001 + }, + { + "epoch": 1.3180320667691632, + "grad_norm": 1.7087591886520386, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7297515869140625, + "num_tokens": 310519632.0, + "step": 12002 + }, + { + "epoch": 1.318141884471777, + "grad_norm": 1.8669753074645996, + "learning_rate": 5e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7448216676712036, + "num_tokens": 310547564.0, + "step": 12003 + }, + { + "epoch": 1.3182517021743905, + "grad_norm": 1.7540013790130615, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7355014085769653, + "num_tokens": 310573787.0, + "step": 12004 + }, + { + "epoch": 1.3183615198770042, + "grad_norm": 1.7590413093566895, + "learning_rate": 5e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7383711338043213, + "num_tokens": 310602614.0, + "step": 12005 + }, + { + "epoch": 1.3184713375796178, + "grad_norm": 1.9605739116668701, + "learning_rate": 5e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7058873176574707, + "num_tokens": 310629718.0, + "step": 12006 + }, + { + "epoch": 1.3185811552822315, + "grad_norm": 2.0005650520324707, + "learning_rate": 5e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7372812032699585, + "num_tokens": 310651654.0, + "step": 12007 + }, + { + "epoch": 1.3186909729848453, + "grad_norm": 1.92086660861969, + "learning_rate": 5e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.723690927028656, + "num_tokens": 310676883.0, + "step": 12008 + }, + { + "epoch": 1.3188007906874588, + "grad_norm": 1.866644263267517, + "learning_rate": 5e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7430890202522278, + "num_tokens": 310703291.0, + "step": 12009 + }, + { + "epoch": 1.3189106083900723, + "grad_norm": 1.8647783994674683, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7290271520614624, + "num_tokens": 310729889.0, + "step": 12010 + }, + { + "epoch": 1.319020426092686, + "grad_norm": 1.924565076828003, + "learning_rate": 5e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.739825963973999, + "num_tokens": 310755618.0, + "step": 12011 + }, + { + "epoch": 1.3191302437952999, + "grad_norm": 2.130740165710449, + "learning_rate": 5e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7358051538467407, + "num_tokens": 310776081.0, + "step": 12012 + }, + { + "epoch": 1.3192400614979134, + "grad_norm": 2.1973254680633545, + "learning_rate": 5e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7412925362586975, + "num_tokens": 310796783.0, + "step": 12013 + }, + { + "epoch": 1.3193498792005272, + "grad_norm": 2.0227344036102295, + "learning_rate": 5e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7472811937332153, + "num_tokens": 310817651.0, + "step": 12014 + }, + { + "epoch": 1.3194596969031407, + "grad_norm": 1.827463984489441, + "learning_rate": 5e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7384103536605835, + "num_tokens": 310845554.0, + "step": 12015 + }, + { + "epoch": 1.3195695146057544, + "grad_norm": 1.7676397562026978, + "learning_rate": 5e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.728747546672821, + "num_tokens": 310871157.0, + "step": 12016 + }, + { + "epoch": 1.3196793323083682, + "grad_norm": 2.201566696166992, + "learning_rate": 5e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.7409411072731018, + "num_tokens": 310890925.0, + "step": 12017 + }, + { + "epoch": 1.3197891500109817, + "grad_norm": 1.9134483337402344, + "learning_rate": 5e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.721229076385498, + "num_tokens": 310917760.0, + "step": 12018 + }, + { + "epoch": 1.3198989677135955, + "grad_norm": 2.196169137954712, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7539503574371338, + "num_tokens": 310940744.0, + "step": 12019 + }, + { + "epoch": 1.320008785416209, + "grad_norm": 2.1463797092437744, + "learning_rate": 5e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7341839075088501, + "num_tokens": 310961191.0, + "step": 12020 + }, + { + "epoch": 1.3201186031188228, + "grad_norm": 2.15413498878479, + "learning_rate": 5e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7499101161956787, + "num_tokens": 310983563.0, + "step": 12021 + }, + { + "epoch": 1.3202284208214365, + "grad_norm": 1.7864060401916504, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7265774011611938, + "num_tokens": 311012314.0, + "step": 12022 + }, + { + "epoch": 1.32033823852405, + "grad_norm": 1.8422656059265137, + "learning_rate": 5e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7277145385742188, + "num_tokens": 311040137.0, + "step": 12023 + }, + { + "epoch": 1.3204480562266636, + "grad_norm": 1.7637635469436646, + "learning_rate": 5e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7314862608909607, + "num_tokens": 311070590.0, + "step": 12024 + }, + { + "epoch": 1.3205578739292774, + "grad_norm": 2.0006933212280273, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7527403831481934, + "num_tokens": 311093175.0, + "step": 12025 + }, + { + "epoch": 1.3206676916318911, + "grad_norm": 1.8287067413330078, + "learning_rate": 5e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7280625104904175, + "num_tokens": 311122958.0, + "step": 12026 + }, + { + "epoch": 1.3207775093345047, + "grad_norm": 2.0386962890625, + "learning_rate": 5e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7457889914512634, + "num_tokens": 311143875.0, + "step": 12027 + }, + { + "epoch": 1.3208873270371184, + "grad_norm": 1.9482147693634033, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7576968669891357, + "num_tokens": 311165834.0, + "step": 12028 + }, + { + "epoch": 1.320997144739732, + "grad_norm": 1.6553313732147217, + "learning_rate": 5e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7233673334121704, + "num_tokens": 311196073.0, + "step": 12029 + }, + { + "epoch": 1.3211069624423457, + "grad_norm": 1.9311952590942383, + "learning_rate": 5e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7292699217796326, + "num_tokens": 311219838.0, + "step": 12030 + }, + { + "epoch": 1.3212167801449595, + "grad_norm": 2.1353232860565186, + "learning_rate": 5e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7221730947494507, + "num_tokens": 311242028.0, + "step": 12031 + }, + { + "epoch": 1.321326597847573, + "grad_norm": 1.7934553623199463, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7270584106445312, + "num_tokens": 311268843.0, + "step": 12032 + }, + { + "epoch": 1.3214364155501868, + "grad_norm": 2.0474677085876465, + "learning_rate": 5e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7428716421127319, + "num_tokens": 311290721.0, + "step": 12033 + }, + { + "epoch": 1.3215462332528003, + "grad_norm": 1.7646629810333252, + "learning_rate": 5e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7294939756393433, + "num_tokens": 311320669.0, + "step": 12034 + }, + { + "epoch": 1.321656050955414, + "grad_norm": 1.7389463186264038, + "learning_rate": 5e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7098568677902222, + "num_tokens": 311351613.0, + "step": 12035 + }, + { + "epoch": 1.3217658686580278, + "grad_norm": 1.7704358100891113, + "learning_rate": 5e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7217914462089539, + "num_tokens": 311380073.0, + "step": 12036 + }, + { + "epoch": 1.3218756863606413, + "grad_norm": 2.1106743812561035, + "learning_rate": 5e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7350232601165771, + "num_tokens": 311403019.0, + "step": 12037 + }, + { + "epoch": 1.3219855040632549, + "grad_norm": 1.8103358745574951, + "learning_rate": 5e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7363340854644775, + "num_tokens": 311429449.0, + "step": 12038 + }, + { + "epoch": 1.3220953217658686, + "grad_norm": 1.7359848022460938, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7247910499572754, + "num_tokens": 311457950.0, + "step": 12039 + }, + { + "epoch": 1.3222051394684824, + "grad_norm": 1.984508991241455, + "learning_rate": 5e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7163442373275757, + "num_tokens": 311483216.0, + "step": 12040 + }, + { + "epoch": 1.322314957171096, + "grad_norm": 1.8547086715698242, + "learning_rate": 5e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7493709325790405, + "num_tokens": 311506559.0, + "step": 12041 + }, + { + "epoch": 1.3224247748737097, + "grad_norm": 1.800008773803711, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7531609535217285, + "num_tokens": 311532338.0, + "step": 12042 + }, + { + "epoch": 1.3225345925763232, + "grad_norm": 1.7284009456634521, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7098275423049927, + "num_tokens": 311563457.0, + "step": 12043 + }, + { + "epoch": 1.322644410278937, + "grad_norm": 2.0330076217651367, + "learning_rate": 5e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7312865257263184, + "num_tokens": 311586442.0, + "step": 12044 + }, + { + "epoch": 1.3227542279815507, + "grad_norm": 1.7962369918823242, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7386326789855957, + "num_tokens": 311614934.0, + "step": 12045 + }, + { + "epoch": 1.3228640456841643, + "grad_norm": 1.7179986238479614, + "learning_rate": 5e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7453664541244507, + "num_tokens": 311643771.0, + "step": 12046 + }, + { + "epoch": 1.322973863386778, + "grad_norm": 2.1549229621887207, + "learning_rate": 5e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7399183511734009, + "num_tokens": 311663814.0, + "step": 12047 + }, + { + "epoch": 1.3230836810893916, + "grad_norm": 1.6460175514221191, + "learning_rate": 5e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7310550212860107, + "num_tokens": 311694227.0, + "step": 12048 + }, + { + "epoch": 1.3231934987920053, + "grad_norm": 2.000042200088501, + "learning_rate": 5e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7498282194137573, + "num_tokens": 311717915.0, + "step": 12049 + }, + { + "epoch": 1.3233033164946189, + "grad_norm": 1.700744867324829, + "learning_rate": 5e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7446222305297852, + "num_tokens": 311748994.0, + "step": 12050 + }, + { + "epoch": 1.3234131341972326, + "grad_norm": 1.8855836391448975, + "learning_rate": 5e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7272890210151672, + "num_tokens": 311774465.0, + "step": 12051 + }, + { + "epoch": 1.3235229518998461, + "grad_norm": 1.8874191045761108, + "learning_rate": 5e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7383473515510559, + "num_tokens": 311800536.0, + "step": 12052 + }, + { + "epoch": 1.32363276960246, + "grad_norm": 1.7998111248016357, + "learning_rate": 5e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7596451044082642, + "num_tokens": 311825473.0, + "step": 12053 + }, + { + "epoch": 1.3237425873050737, + "grad_norm": 1.9620918035507202, + "learning_rate": 5e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7288329601287842, + "num_tokens": 311849741.0, + "step": 12054 + }, + { + "epoch": 1.3238524050076872, + "grad_norm": 1.9747440814971924, + "learning_rate": 5e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7517240047454834, + "num_tokens": 311870367.0, + "step": 12055 + }, + { + "epoch": 1.323962222710301, + "grad_norm": 1.7662547826766968, + "learning_rate": 5e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7352231740951538, + "num_tokens": 311898612.0, + "step": 12056 + }, + { + "epoch": 1.3240720404129145, + "grad_norm": 1.957732081413269, + "learning_rate": 5e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7254456281661987, + "num_tokens": 311923835.0, + "step": 12057 + }, + { + "epoch": 1.3241818581155282, + "grad_norm": 1.8474946022033691, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7326086163520813, + "num_tokens": 311954010.0, + "step": 12058 + }, + { + "epoch": 1.324291675818142, + "grad_norm": 1.9959022998809814, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7300969362258911, + "num_tokens": 311979011.0, + "step": 12059 + }, + { + "epoch": 1.3244014935207555, + "grad_norm": 1.9693958759307861, + "learning_rate": 5e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7343738079071045, + "num_tokens": 312002185.0, + "step": 12060 + }, + { + "epoch": 1.324511311223369, + "grad_norm": 1.8207199573516846, + "learning_rate": 5e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7271897792816162, + "num_tokens": 312028269.0, + "step": 12061 + }, + { + "epoch": 1.3246211289259828, + "grad_norm": 1.7090266942977905, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7106513977050781, + "num_tokens": 312061920.0, + "step": 12062 + }, + { + "epoch": 1.3247309466285966, + "grad_norm": 1.757675051689148, + "learning_rate": 5e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7200727462768555, + "num_tokens": 312091634.0, + "step": 12063 + }, + { + "epoch": 1.3248407643312101, + "grad_norm": 1.748782753944397, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7204935550689697, + "num_tokens": 312120392.0, + "step": 12064 + }, + { + "epoch": 1.3249505820338239, + "grad_norm": 1.933503270149231, + "learning_rate": 5e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7272258400917053, + "num_tokens": 312144125.0, + "step": 12065 + }, + { + "epoch": 1.3250603997364374, + "grad_norm": 1.822735071182251, + "learning_rate": 5e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7374057769775391, + "num_tokens": 312169838.0, + "step": 12066 + }, + { + "epoch": 1.3251702174390512, + "grad_norm": 1.8202229738235474, + "learning_rate": 5e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7472277879714966, + "num_tokens": 312194209.0, + "step": 12067 + }, + { + "epoch": 1.325280035141665, + "grad_norm": 1.8563776016235352, + "learning_rate": 5e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7495173215866089, + "num_tokens": 312216789.0, + "step": 12068 + }, + { + "epoch": 1.3253898528442785, + "grad_norm": 1.9738211631774902, + "learning_rate": 5e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7521709203720093, + "num_tokens": 312238712.0, + "step": 12069 + }, + { + "epoch": 1.3254996705468922, + "grad_norm": 1.8442604541778564, + "learning_rate": 5e-06, + "loss": 0.7944, + "mean_token_accuracy": 0.7484784126281738, + "num_tokens": 312264939.0, + "step": 12070 + }, + { + "epoch": 1.3256094882495058, + "grad_norm": 2.0350325107574463, + "learning_rate": 5e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7461832761764526, + "num_tokens": 312286719.0, + "step": 12071 + }, + { + "epoch": 1.3257193059521195, + "grad_norm": 1.8394852876663208, + "learning_rate": 5e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7113356590270996, + "num_tokens": 312315925.0, + "step": 12072 + }, + { + "epoch": 1.3258291236547333, + "grad_norm": 1.928796410560608, + "learning_rate": 5e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7290823459625244, + "num_tokens": 312341547.0, + "step": 12073 + }, + { + "epoch": 1.3259389413573468, + "grad_norm": 1.7211706638336182, + "learning_rate": 5e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7338522672653198, + "num_tokens": 312371050.0, + "step": 12074 + }, + { + "epoch": 1.3260487590599603, + "grad_norm": 1.8271868228912354, + "learning_rate": 5e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7187768816947937, + "num_tokens": 312399751.0, + "step": 12075 + }, + { + "epoch": 1.326158576762574, + "grad_norm": 2.0718719959259033, + "learning_rate": 5e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7491768002510071, + "num_tokens": 312421019.0, + "step": 12076 + }, + { + "epoch": 1.3262683944651878, + "grad_norm": 2.144463300704956, + "learning_rate": 5e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7143638134002686, + "num_tokens": 312444156.0, + "step": 12077 + }, + { + "epoch": 1.3263782121678014, + "grad_norm": 2.1337053775787354, + "learning_rate": 5e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7465326189994812, + "num_tokens": 312464987.0, + "step": 12078 + }, + { + "epoch": 1.3264880298704151, + "grad_norm": 2.2236382961273193, + "learning_rate": 5e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7498807907104492, + "num_tokens": 312484620.0, + "step": 12079 + }, + { + "epoch": 1.3265978475730287, + "grad_norm": 1.7243632078170776, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7203208208084106, + "num_tokens": 312514543.0, + "step": 12080 + }, + { + "epoch": 1.3267076652756424, + "grad_norm": 1.9236323833465576, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7229898571968079, + "num_tokens": 312538854.0, + "step": 12081 + }, + { + "epoch": 1.3268174829782562, + "grad_norm": 1.9582819938659668, + "learning_rate": 5e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7401970624923706, + "num_tokens": 312559801.0, + "step": 12082 + }, + { + "epoch": 1.3269273006808697, + "grad_norm": 2.1036858558654785, + "learning_rate": 5e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7356578707695007, + "num_tokens": 312580987.0, + "step": 12083 + }, + { + "epoch": 1.3270371183834835, + "grad_norm": 1.7757445573806763, + "learning_rate": 5e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7208783626556396, + "num_tokens": 312610075.0, + "step": 12084 + }, + { + "epoch": 1.327146936086097, + "grad_norm": 1.8335886001586914, + "learning_rate": 5e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7403180599212646, + "num_tokens": 312636417.0, + "step": 12085 + }, + { + "epoch": 1.3272567537887108, + "grad_norm": 1.645432710647583, + "learning_rate": 5e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7020421624183655, + "num_tokens": 312670585.0, + "step": 12086 + }, + { + "epoch": 1.3273665714913245, + "grad_norm": 1.8828022480010986, + "learning_rate": 5e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7255256772041321, + "num_tokens": 312696121.0, + "step": 12087 + }, + { + "epoch": 1.327476389193938, + "grad_norm": 1.9939749240875244, + "learning_rate": 5e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7194108366966248, + "num_tokens": 312720415.0, + "step": 12088 + }, + { + "epoch": 1.3275862068965516, + "grad_norm": 1.8060603141784668, + "learning_rate": 5e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7265529632568359, + "num_tokens": 312747326.0, + "step": 12089 + }, + { + "epoch": 1.3276960245991654, + "grad_norm": 2.007164716720581, + "learning_rate": 5e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7382773160934448, + "num_tokens": 312772126.0, + "step": 12090 + }, + { + "epoch": 1.3278058423017791, + "grad_norm": 2.0676395893096924, + "learning_rate": 5e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7074042558670044, + "num_tokens": 312793727.0, + "step": 12091 + }, + { + "epoch": 1.3279156600043927, + "grad_norm": 1.9890588521957397, + "learning_rate": 5e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7345502376556396, + "num_tokens": 312817028.0, + "step": 12092 + }, + { + "epoch": 1.3280254777070064, + "grad_norm": 1.6844035387039185, + "learning_rate": 5e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7207369804382324, + "num_tokens": 312848157.0, + "step": 12093 + }, + { + "epoch": 1.32813529540962, + "grad_norm": 1.851658821105957, + "learning_rate": 5e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7145752906799316, + "num_tokens": 312877038.0, + "step": 12094 + }, + { + "epoch": 1.3282451131122337, + "grad_norm": 1.8087307214736938, + "learning_rate": 5e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7367873191833496, + "num_tokens": 312905988.0, + "step": 12095 + }, + { + "epoch": 1.3283549308148475, + "grad_norm": 1.806606650352478, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7373059391975403, + "num_tokens": 312933843.0, + "step": 12096 + }, + { + "epoch": 1.328464748517461, + "grad_norm": 1.6774070262908936, + "learning_rate": 5e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7101278901100159, + "num_tokens": 312964521.0, + "step": 12097 + }, + { + "epoch": 1.3285745662200747, + "grad_norm": 1.6866883039474487, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7298880815505981, + "num_tokens": 312995522.0, + "step": 12098 + }, + { + "epoch": 1.3286843839226883, + "grad_norm": 2.0571415424346924, + "learning_rate": 5e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7369211912155151, + "num_tokens": 313017186.0, + "step": 12099 + }, + { + "epoch": 1.328794201625302, + "grad_norm": 1.7388619184494019, + "learning_rate": 5e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7576388120651245, + "num_tokens": 313045148.0, + "step": 12100 + }, + { + "epoch": 1.3289040193279158, + "grad_norm": 1.8341325521469116, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7406657338142395, + "num_tokens": 313071484.0, + "step": 12101 + }, + { + "epoch": 1.3290138370305293, + "grad_norm": 1.8291221857070923, + "learning_rate": 5e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7398130893707275, + "num_tokens": 313099948.0, + "step": 12102 + }, + { + "epoch": 1.3291236547331429, + "grad_norm": 1.949112892150879, + "learning_rate": 5e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7262860536575317, + "num_tokens": 313123572.0, + "step": 12103 + }, + { + "epoch": 1.3292334724357566, + "grad_norm": 1.9515665769577026, + "learning_rate": 5e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7166790962219238, + "num_tokens": 313147564.0, + "step": 12104 + }, + { + "epoch": 1.3293432901383704, + "grad_norm": 2.174659013748169, + "learning_rate": 5e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.7466089725494385, + "num_tokens": 313167978.0, + "step": 12105 + }, + { + "epoch": 1.329453107840984, + "grad_norm": 1.9708645343780518, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.728463888168335, + "num_tokens": 313190501.0, + "step": 12106 + }, + { + "epoch": 1.3295629255435977, + "grad_norm": 1.8286892175674438, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7124899625778198, + "num_tokens": 313215313.0, + "step": 12107 + }, + { + "epoch": 1.3296727432462112, + "grad_norm": 1.732961654663086, + "learning_rate": 5e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7176494598388672, + "num_tokens": 313246295.0, + "step": 12108 + }, + { + "epoch": 1.329782560948825, + "grad_norm": 1.8765969276428223, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7443332672119141, + "num_tokens": 313273573.0, + "step": 12109 + }, + { + "epoch": 1.3298923786514387, + "grad_norm": 1.941542625427246, + "learning_rate": 5e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7271779775619507, + "num_tokens": 313297285.0, + "step": 12110 + }, + { + "epoch": 1.3300021963540523, + "grad_norm": 1.970201015472412, + "learning_rate": 5e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7295058965682983, + "num_tokens": 313322055.0, + "step": 12111 + }, + { + "epoch": 1.330112014056666, + "grad_norm": 1.8870844841003418, + "learning_rate": 5e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7376145720481873, + "num_tokens": 313345672.0, + "step": 12112 + }, + { + "epoch": 1.3302218317592795, + "grad_norm": 1.6310126781463623, + "learning_rate": 5e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7244099974632263, + "num_tokens": 313379372.0, + "step": 12113 + }, + { + "epoch": 1.3303316494618933, + "grad_norm": 1.8741450309753418, + "learning_rate": 5e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7312272787094116, + "num_tokens": 313405682.0, + "step": 12114 + }, + { + "epoch": 1.3304414671645068, + "grad_norm": 1.7160032987594604, + "learning_rate": 5e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7450623512268066, + "num_tokens": 313434001.0, + "step": 12115 + }, + { + "epoch": 1.3305512848671206, + "grad_norm": 1.6543914079666138, + "learning_rate": 5e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7501413822174072, + "num_tokens": 313463652.0, + "step": 12116 + }, + { + "epoch": 1.3306611025697341, + "grad_norm": 1.758178472518921, + "learning_rate": 5e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7091288566589355, + "num_tokens": 313491016.0, + "step": 12117 + }, + { + "epoch": 1.330770920272348, + "grad_norm": 1.9325257539749146, + "learning_rate": 5e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7658675312995911, + "num_tokens": 313511900.0, + "step": 12118 + }, + { + "epoch": 1.3308807379749616, + "grad_norm": 1.6310185194015503, + "learning_rate": 5e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7635193467140198, + "num_tokens": 313542675.0, + "step": 12119 + }, + { + "epoch": 1.3309905556775752, + "grad_norm": 2.085235834121704, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7408241033554077, + "num_tokens": 313566664.0, + "step": 12120 + }, + { + "epoch": 1.331100373380189, + "grad_norm": 1.9552767276763916, + "learning_rate": 5e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7362627983093262, + "num_tokens": 313591416.0, + "step": 12121 + }, + { + "epoch": 1.3312101910828025, + "grad_norm": 1.7105261087417603, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7184727787971497, + "num_tokens": 313622306.0, + "step": 12122 + }, + { + "epoch": 1.3313200087854162, + "grad_norm": 1.9685009717941284, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7313255071640015, + "num_tokens": 313646557.0, + "step": 12123 + }, + { + "epoch": 1.33142982648803, + "grad_norm": 1.972112774848938, + "learning_rate": 5e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7060456275939941, + "num_tokens": 313671938.0, + "step": 12124 + }, + { + "epoch": 1.3315396441906435, + "grad_norm": 2.027117967605591, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7531919479370117, + "num_tokens": 313694623.0, + "step": 12125 + }, + { + "epoch": 1.331649461893257, + "grad_norm": 1.9847931861877441, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7558237910270691, + "num_tokens": 313715158.0, + "step": 12126 + }, + { + "epoch": 1.3317592795958708, + "grad_norm": 1.6616641283035278, + "learning_rate": 5e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.729637622833252, + "num_tokens": 313746831.0, + "step": 12127 + }, + { + "epoch": 1.3318690972984846, + "grad_norm": 1.6457078456878662, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7287328243255615, + "num_tokens": 313779844.0, + "step": 12128 + }, + { + "epoch": 1.331978915001098, + "grad_norm": 1.8534317016601562, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7174059152603149, + "num_tokens": 313810705.0, + "step": 12129 + }, + { + "epoch": 1.3320887327037119, + "grad_norm": 1.887218952178955, + "learning_rate": 5e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7269587516784668, + "num_tokens": 313837364.0, + "step": 12130 + }, + { + "epoch": 1.3321985504063254, + "grad_norm": 1.5682518482208252, + "learning_rate": 5e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7031393647193909, + "num_tokens": 313873257.0, + "step": 12131 + }, + { + "epoch": 1.3323083681089392, + "grad_norm": 1.8827780485153198, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7387775778770447, + "num_tokens": 313897875.0, + "step": 12132 + }, + { + "epoch": 1.332418185811553, + "grad_norm": 1.7437341213226318, + "learning_rate": 5e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7209212779998779, + "num_tokens": 313925475.0, + "step": 12133 + }, + { + "epoch": 1.3325280035141664, + "grad_norm": 2.036738395690918, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7336410880088806, + "num_tokens": 313949755.0, + "step": 12134 + }, + { + "epoch": 1.3326378212167802, + "grad_norm": 2.155986785888672, + "learning_rate": 5e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.78110671043396, + "num_tokens": 313966708.0, + "step": 12135 + }, + { + "epoch": 1.3327476389193937, + "grad_norm": 2.1972217559814453, + "learning_rate": 5e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7427588701248169, + "num_tokens": 313986577.0, + "step": 12136 + }, + { + "epoch": 1.3328574566220075, + "grad_norm": 1.9910916090011597, + "learning_rate": 5e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7347567677497864, + "num_tokens": 314011114.0, + "step": 12137 + }, + { + "epoch": 1.3329672743246213, + "grad_norm": 1.9202637672424316, + "learning_rate": 5e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7170383334159851, + "num_tokens": 314036411.0, + "step": 12138 + }, + { + "epoch": 1.3330770920272348, + "grad_norm": 1.9369242191314697, + "learning_rate": 5e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7558950185775757, + "num_tokens": 314060225.0, + "step": 12139 + }, + { + "epoch": 1.3331869097298483, + "grad_norm": 1.9731014966964722, + "learning_rate": 5e-06, + "loss": 0.7898, + "mean_token_accuracy": 0.7483096122741699, + "num_tokens": 314083397.0, + "step": 12140 + }, + { + "epoch": 1.333296727432462, + "grad_norm": 2.106121063232422, + "learning_rate": 5e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7450491786003113, + "num_tokens": 314103936.0, + "step": 12141 + }, + { + "epoch": 1.3334065451350758, + "grad_norm": 1.8229374885559082, + "learning_rate": 5e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.729736328125, + "num_tokens": 314130723.0, + "step": 12142 + }, + { + "epoch": 1.3335163628376894, + "grad_norm": 2.0509963035583496, + "learning_rate": 5e-06, + "loss": 0.819, + "mean_token_accuracy": 0.742707371711731, + "num_tokens": 314152110.0, + "step": 12143 + }, + { + "epoch": 1.3336261805403031, + "grad_norm": 1.60452401638031, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.735567569732666, + "num_tokens": 314183700.0, + "step": 12144 + }, + { + "epoch": 1.3337359982429167, + "grad_norm": 1.8818683624267578, + "learning_rate": 5e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7311339974403381, + "num_tokens": 314211238.0, + "step": 12145 + }, + { + "epoch": 1.3338458159455304, + "grad_norm": 1.8593626022338867, + "learning_rate": 5e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7386643290519714, + "num_tokens": 314234840.0, + "step": 12146 + }, + { + "epoch": 1.3339556336481442, + "grad_norm": 1.7871383428573608, + "learning_rate": 5e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7453404664993286, + "num_tokens": 314261590.0, + "step": 12147 + }, + { + "epoch": 1.3340654513507577, + "grad_norm": 1.8862818479537964, + "learning_rate": 5e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7230842113494873, + "num_tokens": 314288707.0, + "step": 12148 + }, + { + "epoch": 1.3341752690533715, + "grad_norm": 1.6915552616119385, + "learning_rate": 5e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7304179072380066, + "num_tokens": 314318114.0, + "step": 12149 + }, + { + "epoch": 1.334285086755985, + "grad_norm": 2.069256544113159, + "learning_rate": 5e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7382205724716187, + "num_tokens": 314339140.0, + "step": 12150 + }, + { + "epoch": 1.3343949044585988, + "grad_norm": 2.046036720275879, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7235808372497559, + "num_tokens": 314362400.0, + "step": 12151 + }, + { + "epoch": 1.3345047221612125, + "grad_norm": 1.7874151468276978, + "learning_rate": 5e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7239499092102051, + "num_tokens": 314392311.0, + "step": 12152 + }, + { + "epoch": 1.334614539863826, + "grad_norm": 1.8312065601348877, + "learning_rate": 5e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7047224044799805, + "num_tokens": 314422909.0, + "step": 12153 + }, + { + "epoch": 1.3347243575664396, + "grad_norm": 1.870612621307373, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7461737394332886, + "num_tokens": 314448637.0, + "step": 12154 + }, + { + "epoch": 1.3348341752690533, + "grad_norm": 1.8698762655258179, + "learning_rate": 5e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7098434567451477, + "num_tokens": 314476297.0, + "step": 12155 + }, + { + "epoch": 1.334943992971667, + "grad_norm": 1.8555692434310913, + "learning_rate": 5e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7251490354537964, + "num_tokens": 314503799.0, + "step": 12156 + }, + { + "epoch": 1.3350538106742806, + "grad_norm": 2.273904800415039, + "learning_rate": 5e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7507364749908447, + "num_tokens": 314523475.0, + "step": 12157 + }, + { + "epoch": 1.3351636283768944, + "grad_norm": 1.6788417100906372, + "learning_rate": 5e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7289661765098572, + "num_tokens": 314552806.0, + "step": 12158 + }, + { + "epoch": 1.335273446079508, + "grad_norm": 1.7605615854263306, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7431696057319641, + "num_tokens": 314579296.0, + "step": 12159 + }, + { + "epoch": 1.3353832637821217, + "grad_norm": 1.8208932876586914, + "learning_rate": 5e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7635750770568848, + "num_tokens": 314605421.0, + "step": 12160 + }, + { + "epoch": 1.3354930814847354, + "grad_norm": 1.9412630796432495, + "learning_rate": 5e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7352260947227478, + "num_tokens": 314629772.0, + "step": 12161 + }, + { + "epoch": 1.335602899187349, + "grad_norm": 1.8915393352508545, + "learning_rate": 5e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7504702806472778, + "num_tokens": 314655012.0, + "step": 12162 + }, + { + "epoch": 1.3357127168899627, + "grad_norm": 1.9922832250595093, + "learning_rate": 5e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.74153733253479, + "num_tokens": 314679201.0, + "step": 12163 + }, + { + "epoch": 1.3358225345925763, + "grad_norm": 1.9934108257293701, + "learning_rate": 5e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7368552684783936, + "num_tokens": 314701135.0, + "step": 12164 + }, + { + "epoch": 1.33593235229519, + "grad_norm": 1.9014830589294434, + "learning_rate": 5e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7456550002098083, + "num_tokens": 314726013.0, + "step": 12165 + }, + { + "epoch": 1.3360421699978036, + "grad_norm": 1.8514128923416138, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7528143525123596, + "num_tokens": 314754088.0, + "step": 12166 + }, + { + "epoch": 1.3361519877004173, + "grad_norm": 1.6383415460586548, + "learning_rate": 5e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7312254905700684, + "num_tokens": 314787624.0, + "step": 12167 + }, + { + "epoch": 1.3362618054030309, + "grad_norm": 2.1594231128692627, + "learning_rate": 5e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7455782890319824, + "num_tokens": 314808942.0, + "step": 12168 + }, + { + "epoch": 1.3363716231056446, + "grad_norm": 1.7023729085922241, + "learning_rate": 5e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.742546796798706, + "num_tokens": 314837019.0, + "step": 12169 + }, + { + "epoch": 1.3364814408082584, + "grad_norm": 1.8856176137924194, + "learning_rate": 5e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7184597253799438, + "num_tokens": 314861718.0, + "step": 12170 + }, + { + "epoch": 1.336591258510872, + "grad_norm": 1.6350212097167969, + "learning_rate": 5e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7600229978561401, + "num_tokens": 314890067.0, + "step": 12171 + }, + { + "epoch": 1.3367010762134857, + "grad_norm": 1.6957112550735474, + "learning_rate": 5e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.72828608751297, + "num_tokens": 314917838.0, + "step": 12172 + }, + { + "epoch": 1.3368108939160992, + "grad_norm": 1.848445177078247, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7276991605758667, + "num_tokens": 314943474.0, + "step": 12173 + }, + { + "epoch": 1.336920711618713, + "grad_norm": 1.6367511749267578, + "learning_rate": 5e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7152334451675415, + "num_tokens": 314977056.0, + "step": 12174 + }, + { + "epoch": 1.3370305293213267, + "grad_norm": 1.6635661125183105, + "learning_rate": 5e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7432993650436401, + "num_tokens": 315007394.0, + "step": 12175 + }, + { + "epoch": 1.3371403470239402, + "grad_norm": 1.5739037990570068, + "learning_rate": 5e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7243830561637878, + "num_tokens": 315041867.0, + "step": 12176 + }, + { + "epoch": 1.337250164726554, + "grad_norm": 1.9535914659500122, + "learning_rate": 5e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7241484522819519, + "num_tokens": 315068776.0, + "step": 12177 + }, + { + "epoch": 1.3373599824291675, + "grad_norm": 1.7976385354995728, + "learning_rate": 5e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7405648827552795, + "num_tokens": 315094727.0, + "step": 12178 + }, + { + "epoch": 1.3374698001317813, + "grad_norm": 1.805909514427185, + "learning_rate": 5e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.715723991394043, + "num_tokens": 315120475.0, + "step": 12179 + }, + { + "epoch": 1.3375796178343948, + "grad_norm": 1.6615906953811646, + "learning_rate": 5e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.735461950302124, + "num_tokens": 315148869.0, + "step": 12180 + }, + { + "epoch": 1.3376894355370086, + "grad_norm": 1.9281452894210815, + "learning_rate": 5e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.735133945941925, + "num_tokens": 315171899.0, + "step": 12181 + }, + { + "epoch": 1.3377992532396221, + "grad_norm": 1.6744279861450195, + "learning_rate": 5e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7157450318336487, + "num_tokens": 315204744.0, + "step": 12182 + }, + { + "epoch": 1.3379090709422359, + "grad_norm": 1.668763279914856, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7263244986534119, + "num_tokens": 315234258.0, + "step": 12183 + }, + { + "epoch": 1.3380188886448496, + "grad_norm": 1.6640832424163818, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7439075112342834, + "num_tokens": 315263309.0, + "step": 12184 + }, + { + "epoch": 1.3381287063474632, + "grad_norm": 1.9661520719528198, + "learning_rate": 5e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7535421848297119, + "num_tokens": 315286868.0, + "step": 12185 + }, + { + "epoch": 1.338238524050077, + "grad_norm": 2.0720901489257812, + "learning_rate": 5e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7344975471496582, + "num_tokens": 315309232.0, + "step": 12186 + }, + { + "epoch": 1.3383483417526905, + "grad_norm": 2.166177749633789, + "learning_rate": 5e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7476259469985962, + "num_tokens": 315329425.0, + "step": 12187 + }, + { + "epoch": 1.3384581594553042, + "grad_norm": 1.8430485725402832, + "learning_rate": 5e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7486997246742249, + "num_tokens": 315352939.0, + "step": 12188 + }, + { + "epoch": 1.338567977157918, + "grad_norm": 2.144376277923584, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7318682074546814, + "num_tokens": 315373337.0, + "step": 12189 + }, + { + "epoch": 1.3386777948605315, + "grad_norm": 1.8553965091705322, + "learning_rate": 5e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7317829132080078, + "num_tokens": 315399954.0, + "step": 12190 + }, + { + "epoch": 1.338787612563145, + "grad_norm": 1.7262786626815796, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7247947454452515, + "num_tokens": 315431249.0, + "step": 12191 + }, + { + "epoch": 1.3388974302657588, + "grad_norm": 2.0714168548583984, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7393814325332642, + "num_tokens": 315453280.0, + "step": 12192 + }, + { + "epoch": 1.3390072479683726, + "grad_norm": 1.7895885705947876, + "learning_rate": 5e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7271592617034912, + "num_tokens": 315482300.0, + "step": 12193 + }, + { + "epoch": 1.339117065670986, + "grad_norm": 1.8532353639602661, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7073975205421448, + "num_tokens": 315510335.0, + "step": 12194 + }, + { + "epoch": 1.3392268833735999, + "grad_norm": 1.7253235578536987, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7130279541015625, + "num_tokens": 315540742.0, + "step": 12195 + }, + { + "epoch": 1.3393367010762134, + "grad_norm": 1.8510346412658691, + "learning_rate": 5e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7344828844070435, + "num_tokens": 315566418.0, + "step": 12196 + }, + { + "epoch": 1.3394465187788271, + "grad_norm": 1.9143446683883667, + "learning_rate": 5e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7310888767242432, + "num_tokens": 315590483.0, + "step": 12197 + }, + { + "epoch": 1.339556336481441, + "grad_norm": 1.9189412593841553, + "learning_rate": 5e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7244927287101746, + "num_tokens": 315616055.0, + "step": 12198 + }, + { + "epoch": 1.3396661541840544, + "grad_norm": 1.8395843505859375, + "learning_rate": 5e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7444342970848083, + "num_tokens": 315640606.0, + "step": 12199 + }, + { + "epoch": 1.3397759718866682, + "grad_norm": 1.7734123468399048, + "learning_rate": 5e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7467219829559326, + "num_tokens": 315666725.0, + "step": 12200 + }, + { + "epoch": 1.3398857895892817, + "grad_norm": 1.7523839473724365, + "learning_rate": 5e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7249243855476379, + "num_tokens": 315694870.0, + "step": 12201 + }, + { + "epoch": 1.3399956072918955, + "grad_norm": 1.9381519556045532, + "learning_rate": 5e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7278626561164856, + "num_tokens": 315718403.0, + "step": 12202 + }, + { + "epoch": 1.3401054249945092, + "grad_norm": 1.7802067995071411, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7428395748138428, + "num_tokens": 315745998.0, + "step": 12203 + }, + { + "epoch": 1.3402152426971228, + "grad_norm": 1.5824453830718994, + "learning_rate": 5e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7151848077774048, + "num_tokens": 315780026.0, + "step": 12204 + }, + { + "epoch": 1.3403250603997363, + "grad_norm": 1.751863956451416, + "learning_rate": 5e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.7412853837013245, + "num_tokens": 315808331.0, + "step": 12205 + }, + { + "epoch": 1.34043487810235, + "grad_norm": 1.9507051706314087, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7338792085647583, + "num_tokens": 315831777.0, + "step": 12206 + }, + { + "epoch": 1.3405446958049638, + "grad_norm": 1.7390416860580444, + "learning_rate": 5e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7275280952453613, + "num_tokens": 315861726.0, + "step": 12207 + }, + { + "epoch": 1.3406545135075774, + "grad_norm": 1.613662838935852, + "learning_rate": 5e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7232463955879211, + "num_tokens": 315893140.0, + "step": 12208 + }, + { + "epoch": 1.3407643312101911, + "grad_norm": 1.7259612083435059, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7304883003234863, + "num_tokens": 315922895.0, + "step": 12209 + }, + { + "epoch": 1.3408741489128047, + "grad_norm": 1.6880497932434082, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7087183594703674, + "num_tokens": 315954822.0, + "step": 12210 + }, + { + "epoch": 1.3409839666154184, + "grad_norm": 1.7504940032958984, + "learning_rate": 5e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7329635620117188, + "num_tokens": 315983542.0, + "step": 12211 + }, + { + "epoch": 1.3410937843180322, + "grad_norm": 1.6573823690414429, + "learning_rate": 5e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7283527851104736, + "num_tokens": 316013293.0, + "step": 12212 + }, + { + "epoch": 1.3412036020206457, + "grad_norm": 1.6963984966278076, + "learning_rate": 5e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7322702407836914, + "num_tokens": 316044664.0, + "step": 12213 + }, + { + "epoch": 1.3413134197232595, + "grad_norm": 1.805975079536438, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.745837926864624, + "num_tokens": 316070846.0, + "step": 12214 + }, + { + "epoch": 1.341423237425873, + "grad_norm": 2.1204912662506104, + "learning_rate": 5e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7453383803367615, + "num_tokens": 316092735.0, + "step": 12215 + }, + { + "epoch": 1.3415330551284868, + "grad_norm": 2.0950663089752197, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.73054039478302, + "num_tokens": 316116026.0, + "step": 12216 + }, + { + "epoch": 1.3416428728311005, + "grad_norm": 1.5616120100021362, + "learning_rate": 5e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7047497034072876, + "num_tokens": 316155869.0, + "step": 12217 + }, + { + "epoch": 1.341752690533714, + "grad_norm": 2.343668222427368, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7313466668128967, + "num_tokens": 316174391.0, + "step": 12218 + }, + { + "epoch": 1.3418625082363276, + "grad_norm": 1.7497056722640991, + "learning_rate": 5e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7301775217056274, + "num_tokens": 316202459.0, + "step": 12219 + }, + { + "epoch": 1.3419723259389413, + "grad_norm": 1.6739041805267334, + "learning_rate": 5e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7686379551887512, + "num_tokens": 316231054.0, + "step": 12220 + }, + { + "epoch": 1.342082143641555, + "grad_norm": 1.9897992610931396, + "learning_rate": 5e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7180014848709106, + "num_tokens": 316255027.0, + "step": 12221 + }, + { + "epoch": 1.3421919613441686, + "grad_norm": 1.7305091619491577, + "learning_rate": 5e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7127547860145569, + "num_tokens": 316286625.0, + "step": 12222 + }, + { + "epoch": 1.3423017790467824, + "grad_norm": 1.8560726642608643, + "learning_rate": 5e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7367777824401855, + "num_tokens": 316311792.0, + "step": 12223 + }, + { + "epoch": 1.342411596749396, + "grad_norm": 1.8700060844421387, + "learning_rate": 5e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.732905387878418, + "num_tokens": 316339882.0, + "step": 12224 + }, + { + "epoch": 1.3425214144520097, + "grad_norm": 1.891225814819336, + "learning_rate": 5e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.765590250492096, + "num_tokens": 316362333.0, + "step": 12225 + }, + { + "epoch": 1.3426312321546234, + "grad_norm": 2.041912078857422, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7545329928398132, + "num_tokens": 316383304.0, + "step": 12226 + }, + { + "epoch": 1.342741049857237, + "grad_norm": 2.340350866317749, + "learning_rate": 5e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7562041282653809, + "num_tokens": 316401423.0, + "step": 12227 + }, + { + "epoch": 1.3428508675598507, + "grad_norm": 1.8144638538360596, + "learning_rate": 5e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7347632050514221, + "num_tokens": 316428570.0, + "step": 12228 + }, + { + "epoch": 1.3429606852624643, + "grad_norm": 1.7515147924423218, + "learning_rate": 5e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7102333307266235, + "num_tokens": 316458512.0, + "step": 12229 + }, + { + "epoch": 1.343070502965078, + "grad_norm": 1.6844141483306885, + "learning_rate": 5e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7224947214126587, + "num_tokens": 316492708.0, + "step": 12230 + }, + { + "epoch": 1.3431803206676916, + "grad_norm": 1.747747778892517, + "learning_rate": 5e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.722115695476532, + "num_tokens": 316520672.0, + "step": 12231 + }, + { + "epoch": 1.3432901383703053, + "grad_norm": 1.981713056564331, + "learning_rate": 5e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7252436876296997, + "num_tokens": 316544954.0, + "step": 12232 + }, + { + "epoch": 1.3433999560729188, + "grad_norm": 1.9475994110107422, + "learning_rate": 5e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7522737979888916, + "num_tokens": 316567024.0, + "step": 12233 + }, + { + "epoch": 1.3435097737755326, + "grad_norm": 1.7594826221466064, + "learning_rate": 5e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7288490533828735, + "num_tokens": 316596812.0, + "step": 12234 + }, + { + "epoch": 1.3436195914781464, + "grad_norm": 1.8826154470443726, + "learning_rate": 5e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7237659692764282, + "num_tokens": 316622264.0, + "step": 12235 + }, + { + "epoch": 1.34372940918076, + "grad_norm": 1.9260083436965942, + "learning_rate": 5e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7249215841293335, + "num_tokens": 316648423.0, + "step": 12236 + }, + { + "epoch": 1.3438392268833736, + "grad_norm": 1.7674014568328857, + "learning_rate": 5e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7069658041000366, + "num_tokens": 316675359.0, + "step": 12237 + }, + { + "epoch": 1.3439490445859872, + "grad_norm": 1.9368267059326172, + "learning_rate": 5e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7341410517692566, + "num_tokens": 316698348.0, + "step": 12238 + }, + { + "epoch": 1.344058862288601, + "grad_norm": 1.934460163116455, + "learning_rate": 5e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7503806352615356, + "num_tokens": 316721723.0, + "step": 12239 + }, + { + "epoch": 1.3441686799912147, + "grad_norm": 2.1568877696990967, + "learning_rate": 5e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7548943758010864, + "num_tokens": 316741041.0, + "step": 12240 + }, + { + "epoch": 1.3442784976938282, + "grad_norm": 1.8527095317840576, + "learning_rate": 5e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7200113534927368, + "num_tokens": 316765326.0, + "step": 12241 + }, + { + "epoch": 1.3443883153964418, + "grad_norm": 1.7809205055236816, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7471438646316528, + "num_tokens": 316792049.0, + "step": 12242 + }, + { + "epoch": 1.3444981330990555, + "grad_norm": 1.8811416625976562, + "learning_rate": 5e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.759279727935791, + "num_tokens": 316818982.0, + "step": 12243 + }, + { + "epoch": 1.3446079508016693, + "grad_norm": 1.6634628772735596, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7290661334991455, + "num_tokens": 316849642.0, + "step": 12244 + }, + { + "epoch": 1.3447177685042828, + "grad_norm": 1.9590933322906494, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7346611022949219, + "num_tokens": 316873420.0, + "step": 12245 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 1.7342183589935303, + "learning_rate": 5e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7160265445709229, + "num_tokens": 316902220.0, + "step": 12246 + }, + { + "epoch": 1.34493740390951, + "grad_norm": 1.751367211341858, + "learning_rate": 5e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7544100284576416, + "num_tokens": 316927724.0, + "step": 12247 + }, + { + "epoch": 1.3450472216121239, + "grad_norm": 1.7589993476867676, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7263858318328857, + "num_tokens": 316953080.0, + "step": 12248 + }, + { + "epoch": 1.3451570393147376, + "grad_norm": 1.9700924158096313, + "learning_rate": 5e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7238965034484863, + "num_tokens": 316976235.0, + "step": 12249 + }, + { + "epoch": 1.3452668570173512, + "grad_norm": 1.806540608406067, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.720659613609314, + "num_tokens": 317002640.0, + "step": 12250 + }, + { + "epoch": 1.345376674719965, + "grad_norm": 1.615925908088684, + "learning_rate": 5e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7028236389160156, + "num_tokens": 317040579.0, + "step": 12251 + }, + { + "epoch": 1.3454864924225785, + "grad_norm": 1.8831015825271606, + "learning_rate": 5e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7409666776657104, + "num_tokens": 317066476.0, + "step": 12252 + }, + { + "epoch": 1.3455963101251922, + "grad_norm": 1.6538689136505127, + "learning_rate": 5e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7306091785430908, + "num_tokens": 317096982.0, + "step": 12253 + }, + { + "epoch": 1.345706127827806, + "grad_norm": 1.8256345987319946, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7488688230514526, + "num_tokens": 317122742.0, + "step": 12254 + }, + { + "epoch": 1.3458159455304195, + "grad_norm": 1.7091110944747925, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7140436172485352, + "num_tokens": 317152179.0, + "step": 12255 + }, + { + "epoch": 1.345925763233033, + "grad_norm": 1.6582754850387573, + "learning_rate": 5e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7279579639434814, + "num_tokens": 317184489.0, + "step": 12256 + }, + { + "epoch": 1.3460355809356468, + "grad_norm": 1.7897439002990723, + "learning_rate": 5e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7156084775924683, + "num_tokens": 317214684.0, + "step": 12257 + }, + { + "epoch": 1.3461453986382605, + "grad_norm": 1.7338284254074097, + "learning_rate": 5e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7360466718673706, + "num_tokens": 317243459.0, + "step": 12258 + }, + { + "epoch": 1.346255216340874, + "grad_norm": 1.7584656476974487, + "learning_rate": 5e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7613643407821655, + "num_tokens": 317267655.0, + "step": 12259 + }, + { + "epoch": 1.3463650340434878, + "grad_norm": 1.7171686887741089, + "learning_rate": 5e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7284034490585327, + "num_tokens": 317299271.0, + "step": 12260 + }, + { + "epoch": 1.3464748517461014, + "grad_norm": 1.9035564661026, + "learning_rate": 5e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7271045446395874, + "num_tokens": 317323327.0, + "step": 12261 + }, + { + "epoch": 1.3465846694487151, + "grad_norm": 1.858484148979187, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7496419548988342, + "num_tokens": 317346424.0, + "step": 12262 + }, + { + "epoch": 1.346694487151329, + "grad_norm": 1.6824655532836914, + "learning_rate": 5e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.72576904296875, + "num_tokens": 317375161.0, + "step": 12263 + }, + { + "epoch": 1.3468043048539424, + "grad_norm": 1.895681619644165, + "learning_rate": 5e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7412217855453491, + "num_tokens": 317399434.0, + "step": 12264 + }, + { + "epoch": 1.3469141225565562, + "grad_norm": 1.8174697160720825, + "learning_rate": 5e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7406133413314819, + "num_tokens": 317426824.0, + "step": 12265 + }, + { + "epoch": 1.3470239402591697, + "grad_norm": 2.010953903198242, + "learning_rate": 5e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7373441457748413, + "num_tokens": 317452034.0, + "step": 12266 + }, + { + "epoch": 1.3471337579617835, + "grad_norm": 1.813398838043213, + "learning_rate": 5e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7215147614479065, + "num_tokens": 317482501.0, + "step": 12267 + }, + { + "epoch": 1.3472435756643972, + "grad_norm": 1.9815512895584106, + "learning_rate": 5e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7419416308403015, + "num_tokens": 317506040.0, + "step": 12268 + }, + { + "epoch": 1.3473533933670108, + "grad_norm": 1.6602851152420044, + "learning_rate": 5e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7415223121643066, + "num_tokens": 317536682.0, + "step": 12269 + }, + { + "epoch": 1.3474632110696243, + "grad_norm": 1.8341842889785767, + "learning_rate": 5e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7008277773857117, + "num_tokens": 317565645.0, + "step": 12270 + }, + { + "epoch": 1.347573028772238, + "grad_norm": 1.9034888744354248, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7323222160339355, + "num_tokens": 317590186.0, + "step": 12271 + }, + { + "epoch": 1.3476828464748518, + "grad_norm": 1.998443603515625, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7147212028503418, + "num_tokens": 317614867.0, + "step": 12272 + }, + { + "epoch": 1.3477926641774653, + "grad_norm": 1.9142876863479614, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7142394185066223, + "num_tokens": 317640724.0, + "step": 12273 + }, + { + "epoch": 1.347902481880079, + "grad_norm": 1.9425498247146606, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7460745573043823, + "num_tokens": 317663331.0, + "step": 12274 + }, + { + "epoch": 1.3480122995826926, + "grad_norm": 1.7222723960876465, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7304767370223999, + "num_tokens": 317690623.0, + "step": 12275 + }, + { + "epoch": 1.3481221172853064, + "grad_norm": 1.679715633392334, + "learning_rate": 5e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7219710350036621, + "num_tokens": 317722023.0, + "step": 12276 + }, + { + "epoch": 1.3482319349879202, + "grad_norm": 1.8100615739822388, + "learning_rate": 5e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7209024429321289, + "num_tokens": 317749424.0, + "step": 12277 + }, + { + "epoch": 1.3483417526905337, + "grad_norm": 1.9148027896881104, + "learning_rate": 5e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7464947700500488, + "num_tokens": 317772060.0, + "step": 12278 + }, + { + "epoch": 1.3484515703931474, + "grad_norm": 1.8982174396514893, + "learning_rate": 5e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.749809741973877, + "num_tokens": 317794830.0, + "step": 12279 + }, + { + "epoch": 1.348561388095761, + "grad_norm": 1.6783140897750854, + "learning_rate": 5e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7433991432189941, + "num_tokens": 317824203.0, + "step": 12280 + }, + { + "epoch": 1.3486712057983747, + "grad_norm": 1.723159909248352, + "learning_rate": 5e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7171558141708374, + "num_tokens": 317853320.0, + "step": 12281 + }, + { + "epoch": 1.3487810235009885, + "grad_norm": 1.9492498636245728, + "learning_rate": 5e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7346851825714111, + "num_tokens": 317877162.0, + "step": 12282 + }, + { + "epoch": 1.348890841203602, + "grad_norm": 1.784354567527771, + "learning_rate": 5e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7209876775741577, + "num_tokens": 317903801.0, + "step": 12283 + }, + { + "epoch": 1.3490006589062156, + "grad_norm": 2.0839312076568604, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7495564222335815, + "num_tokens": 317924505.0, + "step": 12284 + }, + { + "epoch": 1.3491104766088293, + "grad_norm": 1.574719786643982, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7433713674545288, + "num_tokens": 317957230.0, + "step": 12285 + }, + { + "epoch": 1.349220294311443, + "grad_norm": 1.9157601594924927, + "learning_rate": 5e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7329472899436951, + "num_tokens": 317984265.0, + "step": 12286 + }, + { + "epoch": 1.3493301120140566, + "grad_norm": 1.7483429908752441, + "learning_rate": 5e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7039014101028442, + "num_tokens": 318015382.0, + "step": 12287 + }, + { + "epoch": 1.3494399297166704, + "grad_norm": 2.0272011756896973, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.758660078048706, + "num_tokens": 318038596.0, + "step": 12288 + }, + { + "epoch": 1.349549747419284, + "grad_norm": 1.7163960933685303, + "learning_rate": 5e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.761408269405365, + "num_tokens": 318065907.0, + "step": 12289 + }, + { + "epoch": 1.3496595651218977, + "grad_norm": 2.063364267349243, + "learning_rate": 5e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7440096139907837, + "num_tokens": 318085381.0, + "step": 12290 + }, + { + "epoch": 1.3497693828245114, + "grad_norm": 2.046738386154175, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7353390455245972, + "num_tokens": 318106773.0, + "step": 12291 + }, + { + "epoch": 1.349879200527125, + "grad_norm": 1.9494518041610718, + "learning_rate": 5e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7499560713768005, + "num_tokens": 318129776.0, + "step": 12292 + }, + { + "epoch": 1.3499890182297387, + "grad_norm": 2.023653745651245, + "learning_rate": 5e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.735336422920227, + "num_tokens": 318151362.0, + "step": 12293 + }, + { + "epoch": 1.3500988359323522, + "grad_norm": 1.8936312198638916, + "learning_rate": 5e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7260256409645081, + "num_tokens": 318177225.0, + "step": 12294 + }, + { + "epoch": 1.350208653634966, + "grad_norm": 2.016524314880371, + "learning_rate": 5e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7256163954734802, + "num_tokens": 318199484.0, + "step": 12295 + }, + { + "epoch": 1.3503184713375795, + "grad_norm": 2.0423660278320312, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7313617467880249, + "num_tokens": 318220094.0, + "step": 12296 + }, + { + "epoch": 1.3504282890401933, + "grad_norm": 1.8032275438308716, + "learning_rate": 5e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7105514407157898, + "num_tokens": 318247876.0, + "step": 12297 + }, + { + "epoch": 1.3505381067428068, + "grad_norm": 1.897933840751648, + "learning_rate": 5e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.74173903465271, + "num_tokens": 318273256.0, + "step": 12298 + }, + { + "epoch": 1.3506479244454206, + "grad_norm": 2.0480637550354004, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7271929979324341, + "num_tokens": 318297023.0, + "step": 12299 + }, + { + "epoch": 1.3507577421480343, + "grad_norm": 1.879520058631897, + "learning_rate": 5e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7443441152572632, + "num_tokens": 318322463.0, + "step": 12300 + }, + { + "epoch": 1.3508675598506479, + "grad_norm": 1.6913727521896362, + "learning_rate": 5e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7366293668746948, + "num_tokens": 318351631.0, + "step": 12301 + }, + { + "epoch": 1.3509773775532616, + "grad_norm": 1.9519598484039307, + "learning_rate": 5e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7469378709793091, + "num_tokens": 318374131.0, + "step": 12302 + }, + { + "epoch": 1.3510871952558752, + "grad_norm": 1.6892290115356445, + "learning_rate": 5e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7339819669723511, + "num_tokens": 318403942.0, + "step": 12303 + }, + { + "epoch": 1.351197012958489, + "grad_norm": 1.8606969118118286, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7319884896278381, + "num_tokens": 318430442.0, + "step": 12304 + }, + { + "epoch": 1.3513068306611027, + "grad_norm": 2.169541358947754, + "learning_rate": 5e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7673386335372925, + "num_tokens": 318449219.0, + "step": 12305 + }, + { + "epoch": 1.3514166483637162, + "grad_norm": 1.8611948490142822, + "learning_rate": 5e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7420367002487183, + "num_tokens": 318475476.0, + "step": 12306 + }, + { + "epoch": 1.3515264660663298, + "grad_norm": 2.0046017169952393, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7284752130508423, + "num_tokens": 318500443.0, + "step": 12307 + }, + { + "epoch": 1.3516362837689435, + "grad_norm": 1.8434690237045288, + "learning_rate": 5e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.729787290096283, + "num_tokens": 318526963.0, + "step": 12308 + }, + { + "epoch": 1.3517461014715573, + "grad_norm": 1.917870044708252, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.737500011920929, + "num_tokens": 318549741.0, + "step": 12309 + }, + { + "epoch": 1.3518559191741708, + "grad_norm": 1.920719027519226, + "learning_rate": 5e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7036051750183105, + "num_tokens": 318577501.0, + "step": 12310 + }, + { + "epoch": 1.3519657368767846, + "grad_norm": 1.72263765335083, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7560264468193054, + "num_tokens": 318606057.0, + "step": 12311 + }, + { + "epoch": 1.352075554579398, + "grad_norm": 1.678572177886963, + "learning_rate": 5e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7416170835494995, + "num_tokens": 318636799.0, + "step": 12312 + }, + { + "epoch": 1.3521853722820119, + "grad_norm": 1.716132640838623, + "learning_rate": 5e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.704686164855957, + "num_tokens": 318667741.0, + "step": 12313 + }, + { + "epoch": 1.3522951899846256, + "grad_norm": 2.010005235671997, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7553123831748962, + "num_tokens": 318689846.0, + "step": 12314 + }, + { + "epoch": 1.3524050076872391, + "grad_norm": 1.7984540462493896, + "learning_rate": 5e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7360295057296753, + "num_tokens": 318714361.0, + "step": 12315 + }, + { + "epoch": 1.352514825389853, + "grad_norm": 1.88022780418396, + "learning_rate": 5e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7306460738182068, + "num_tokens": 318738806.0, + "step": 12316 + }, + { + "epoch": 1.3526246430924664, + "grad_norm": 1.794616460800171, + "learning_rate": 5e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7301067113876343, + "num_tokens": 318765817.0, + "step": 12317 + }, + { + "epoch": 1.3527344607950802, + "grad_norm": 1.825939655303955, + "learning_rate": 5e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7201235294342041, + "num_tokens": 318793814.0, + "step": 12318 + }, + { + "epoch": 1.352844278497694, + "grad_norm": 1.9390008449554443, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7464855909347534, + "num_tokens": 318816746.0, + "step": 12319 + }, + { + "epoch": 1.3529540962003075, + "grad_norm": 1.7104123830795288, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7240686416625977, + "num_tokens": 318846957.0, + "step": 12320 + }, + { + "epoch": 1.353063913902921, + "grad_norm": 1.6057252883911133, + "learning_rate": 5e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7202571630477905, + "num_tokens": 318878218.0, + "step": 12321 + }, + { + "epoch": 1.3531737316055348, + "grad_norm": 1.7340564727783203, + "learning_rate": 5e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.712462842464447, + "num_tokens": 318906838.0, + "step": 12322 + }, + { + "epoch": 1.3532835493081485, + "grad_norm": 1.7744616270065308, + "learning_rate": 5e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7309475541114807, + "num_tokens": 318935674.0, + "step": 12323 + }, + { + "epoch": 1.353393367010762, + "grad_norm": 2.0020298957824707, + "learning_rate": 5e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7263427972793579, + "num_tokens": 318958498.0, + "step": 12324 + }, + { + "epoch": 1.3535031847133758, + "grad_norm": 2.125765323638916, + "learning_rate": 5e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7464274168014526, + "num_tokens": 318978692.0, + "step": 12325 + }, + { + "epoch": 1.3536130024159894, + "grad_norm": 1.8323420286178589, + "learning_rate": 5e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7284740209579468, + "num_tokens": 319002986.0, + "step": 12326 + }, + { + "epoch": 1.3537228201186031, + "grad_norm": 1.6434578895568848, + "learning_rate": 5e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7030289173126221, + "num_tokens": 319035778.0, + "step": 12327 + }, + { + "epoch": 1.3538326378212169, + "grad_norm": 1.7715816497802734, + "learning_rate": 5e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.74644535779953, + "num_tokens": 319061770.0, + "step": 12328 + }, + { + "epoch": 1.3539424555238304, + "grad_norm": 1.8527556657791138, + "learning_rate": 5e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7339110970497131, + "num_tokens": 319087768.0, + "step": 12329 + }, + { + "epoch": 1.3540522732264442, + "grad_norm": 2.1520919799804688, + "learning_rate": 5e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.748275876045227, + "num_tokens": 319109306.0, + "step": 12330 + }, + { + "epoch": 1.3541620909290577, + "grad_norm": 1.9717531204223633, + "learning_rate": 5e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7345714569091797, + "num_tokens": 319133102.0, + "step": 12331 + }, + { + "epoch": 1.3542719086316715, + "grad_norm": 1.8345725536346436, + "learning_rate": 5e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7235199809074402, + "num_tokens": 319164431.0, + "step": 12332 + }, + { + "epoch": 1.3543817263342852, + "grad_norm": 1.9717110395431519, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7604528069496155, + "num_tokens": 319187765.0, + "step": 12333 + }, + { + "epoch": 1.3544915440368988, + "grad_norm": 1.8511477708816528, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.743558406829834, + "num_tokens": 319214189.0, + "step": 12334 + }, + { + "epoch": 1.3546013617395123, + "grad_norm": 1.9137181043624878, + "learning_rate": 5e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7289917469024658, + "num_tokens": 319240648.0, + "step": 12335 + }, + { + "epoch": 1.354711179442126, + "grad_norm": 1.8915963172912598, + "learning_rate": 5e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7321139574050903, + "num_tokens": 319266360.0, + "step": 12336 + }, + { + "epoch": 1.3548209971447398, + "grad_norm": 1.8435643911361694, + "learning_rate": 5e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7296447157859802, + "num_tokens": 319292578.0, + "step": 12337 + }, + { + "epoch": 1.3549308148473533, + "grad_norm": 1.654771089553833, + "learning_rate": 5e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7117591500282288, + "num_tokens": 319327327.0, + "step": 12338 + }, + { + "epoch": 1.355040632549967, + "grad_norm": 2.100217580795288, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.718515932559967, + "num_tokens": 319349537.0, + "step": 12339 + }, + { + "epoch": 1.3551504502525806, + "grad_norm": 2.004559278488159, + "learning_rate": 5e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7478089332580566, + "num_tokens": 319371952.0, + "step": 12340 + }, + { + "epoch": 1.3552602679551944, + "grad_norm": 1.7276639938354492, + "learning_rate": 5e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7189822196960449, + "num_tokens": 319400851.0, + "step": 12341 + }, + { + "epoch": 1.3553700856578081, + "grad_norm": 1.813408613204956, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7391082048416138, + "num_tokens": 319425673.0, + "step": 12342 + }, + { + "epoch": 1.3554799033604217, + "grad_norm": 1.7827954292297363, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7232248187065125, + "num_tokens": 319453685.0, + "step": 12343 + }, + { + "epoch": 1.3555897210630354, + "grad_norm": 2.053647518157959, + "learning_rate": 5e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.747873067855835, + "num_tokens": 319475190.0, + "step": 12344 + }, + { + "epoch": 1.355699538765649, + "grad_norm": 1.8107948303222656, + "learning_rate": 5e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7129763960838318, + "num_tokens": 319506077.0, + "step": 12345 + }, + { + "epoch": 1.3558093564682627, + "grad_norm": 1.7542502880096436, + "learning_rate": 5e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7057720422744751, + "num_tokens": 319536773.0, + "step": 12346 + }, + { + "epoch": 1.3559191741708763, + "grad_norm": 1.6005669832229614, + "learning_rate": 5e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.745790958404541, + "num_tokens": 319566976.0, + "step": 12347 + }, + { + "epoch": 1.35602899187349, + "grad_norm": 1.9065572023391724, + "learning_rate": 5e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7538225650787354, + "num_tokens": 319589686.0, + "step": 12348 + }, + { + "epoch": 1.3561388095761036, + "grad_norm": 1.8546829223632812, + "learning_rate": 5e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7133989334106445, + "num_tokens": 319617705.0, + "step": 12349 + }, + { + "epoch": 1.3562486272787173, + "grad_norm": 1.9131498336791992, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7344714999198914, + "num_tokens": 319641280.0, + "step": 12350 + }, + { + "epoch": 1.356358444981331, + "grad_norm": 2.092068910598755, + "learning_rate": 5e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7566425800323486, + "num_tokens": 319661668.0, + "step": 12351 + }, + { + "epoch": 1.3564682626839446, + "grad_norm": 1.9174705743789673, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7521474957466125, + "num_tokens": 319683916.0, + "step": 12352 + }, + { + "epoch": 1.3565780803865584, + "grad_norm": 2.0159733295440674, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7320830225944519, + "num_tokens": 319707076.0, + "step": 12353 + }, + { + "epoch": 1.356687898089172, + "grad_norm": 1.7704832553863525, + "learning_rate": 5e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7349363565444946, + "num_tokens": 319734545.0, + "step": 12354 + }, + { + "epoch": 1.3567977157917857, + "grad_norm": 2.1758601665496826, + "learning_rate": 5e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.756380558013916, + "num_tokens": 319754064.0, + "step": 12355 + }, + { + "epoch": 1.3569075334943994, + "grad_norm": 1.852779507637024, + "learning_rate": 5e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7283267378807068, + "num_tokens": 319782895.0, + "step": 12356 + }, + { + "epoch": 1.357017351197013, + "grad_norm": 1.812064290046692, + "learning_rate": 5e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.744655191898346, + "num_tokens": 319807824.0, + "step": 12357 + }, + { + "epoch": 1.3571271688996267, + "grad_norm": 1.8485010862350464, + "learning_rate": 5e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7224420309066772, + "num_tokens": 319835124.0, + "step": 12358 + }, + { + "epoch": 1.3572369866022402, + "grad_norm": 1.816025733947754, + "learning_rate": 5e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.6942007541656494, + "num_tokens": 319865012.0, + "step": 12359 + }, + { + "epoch": 1.357346804304854, + "grad_norm": 1.8323523998260498, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7163280248641968, + "num_tokens": 319891816.0, + "step": 12360 + }, + { + "epoch": 1.3574566220074675, + "grad_norm": 1.866499662399292, + "learning_rate": 5e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.763446033000946, + "num_tokens": 319916401.0, + "step": 12361 + }, + { + "epoch": 1.3575664397100813, + "grad_norm": 1.825113296508789, + "learning_rate": 5e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7221852540969849, + "num_tokens": 319944425.0, + "step": 12362 + }, + { + "epoch": 1.3576762574126948, + "grad_norm": 1.9412791728973389, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.726641058921814, + "num_tokens": 319969792.0, + "step": 12363 + }, + { + "epoch": 1.3577860751153086, + "grad_norm": 2.0569722652435303, + "learning_rate": 5e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7254998087882996, + "num_tokens": 319990958.0, + "step": 12364 + }, + { + "epoch": 1.3578958928179223, + "grad_norm": 1.8017675876617432, + "learning_rate": 5e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7431160807609558, + "num_tokens": 320017611.0, + "step": 12365 + }, + { + "epoch": 1.3580057105205359, + "grad_norm": 1.9290889501571655, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7338306903839111, + "num_tokens": 320041875.0, + "step": 12366 + }, + { + "epoch": 1.3581155282231496, + "grad_norm": 1.8415825366973877, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.745058536529541, + "num_tokens": 320067430.0, + "step": 12367 + }, + { + "epoch": 1.3582253459257632, + "grad_norm": 1.9439952373504639, + "learning_rate": 5e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7496087551116943, + "num_tokens": 320090999.0, + "step": 12368 + }, + { + "epoch": 1.358335163628377, + "grad_norm": 1.716626763343811, + "learning_rate": 5e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7419151067733765, + "num_tokens": 320118501.0, + "step": 12369 + }, + { + "epoch": 1.3584449813309907, + "grad_norm": 1.9236328601837158, + "learning_rate": 5e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7094197273254395, + "num_tokens": 320145050.0, + "step": 12370 + }, + { + "epoch": 1.3585547990336042, + "grad_norm": 1.8461875915527344, + "learning_rate": 5e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7192620635032654, + "num_tokens": 320174065.0, + "step": 12371 + }, + { + "epoch": 1.3586646167362177, + "grad_norm": 1.9495559930801392, + "learning_rate": 5e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7296807169914246, + "num_tokens": 320195701.0, + "step": 12372 + }, + { + "epoch": 1.3587744344388315, + "grad_norm": 2.0060460567474365, + "learning_rate": 5e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7445813417434692, + "num_tokens": 320216644.0, + "step": 12373 + }, + { + "epoch": 1.3588842521414453, + "grad_norm": 2.0078108310699463, + "learning_rate": 5e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7328733205795288, + "num_tokens": 320239617.0, + "step": 12374 + }, + { + "epoch": 1.3589940698440588, + "grad_norm": 1.7346998453140259, + "learning_rate": 5e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7345001697540283, + "num_tokens": 320268955.0, + "step": 12375 + }, + { + "epoch": 1.3591038875466726, + "grad_norm": 1.7579361200332642, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7212310433387756, + "num_tokens": 320299273.0, + "step": 12376 + }, + { + "epoch": 1.359213705249286, + "grad_norm": 1.9602652788162231, + "learning_rate": 5e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7048068046569824, + "num_tokens": 320325783.0, + "step": 12377 + }, + { + "epoch": 1.3593235229518998, + "grad_norm": 1.8818848133087158, + "learning_rate": 5e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7378486394882202, + "num_tokens": 320351617.0, + "step": 12378 + }, + { + "epoch": 1.3594333406545136, + "grad_norm": 2.0849595069885254, + "learning_rate": 5e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7437744140625, + "num_tokens": 320372751.0, + "step": 12379 + }, + { + "epoch": 1.3595431583571271, + "grad_norm": 2.119126796722412, + "learning_rate": 5e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7223347425460815, + "num_tokens": 320396512.0, + "step": 12380 + }, + { + "epoch": 1.359652976059741, + "grad_norm": 2.1784870624542236, + "learning_rate": 5e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7457996606826782, + "num_tokens": 320416398.0, + "step": 12381 + }, + { + "epoch": 1.3597627937623544, + "grad_norm": 1.7505923509597778, + "learning_rate": 5e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7203683257102966, + "num_tokens": 320444261.0, + "step": 12382 + }, + { + "epoch": 1.3598726114649682, + "grad_norm": 1.8702152967453003, + "learning_rate": 5e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7126648426055908, + "num_tokens": 320469617.0, + "step": 12383 + }, + { + "epoch": 1.359982429167582, + "grad_norm": 1.828450322151184, + "learning_rate": 5e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7391022443771362, + "num_tokens": 320495548.0, + "step": 12384 + }, + { + "epoch": 1.3600922468701955, + "grad_norm": 1.4628652334213257, + "learning_rate": 5e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.704323410987854, + "num_tokens": 320533365.0, + "step": 12385 + }, + { + "epoch": 1.360202064572809, + "grad_norm": 1.8047072887420654, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7423779964447021, + "num_tokens": 320560159.0, + "step": 12386 + }, + { + "epoch": 1.3603118822754228, + "grad_norm": 1.735635757446289, + "learning_rate": 5e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7519916296005249, + "num_tokens": 320587599.0, + "step": 12387 + }, + { + "epoch": 1.3604216999780365, + "grad_norm": 1.9443999528884888, + "learning_rate": 5e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7436143159866333, + "num_tokens": 320613095.0, + "step": 12388 + }, + { + "epoch": 1.36053151768065, + "grad_norm": 1.6473548412322998, + "learning_rate": 5e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7211396098136902, + "num_tokens": 320643530.0, + "step": 12389 + }, + { + "epoch": 1.3606413353832638, + "grad_norm": 1.7224215269088745, + "learning_rate": 5e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7454946041107178, + "num_tokens": 320672152.0, + "step": 12390 + }, + { + "epoch": 1.3607511530858774, + "grad_norm": 1.8005528450012207, + "learning_rate": 5e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.6970964074134827, + "num_tokens": 320700997.0, + "step": 12391 + }, + { + "epoch": 1.360860970788491, + "grad_norm": 1.6881295442581177, + "learning_rate": 5e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7223920822143555, + "num_tokens": 320734398.0, + "step": 12392 + }, + { + "epoch": 1.3609707884911049, + "grad_norm": 1.6817547082901, + "learning_rate": 5e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.694186806678772, + "num_tokens": 320769226.0, + "step": 12393 + }, + { + "epoch": 1.3610806061937184, + "grad_norm": 1.7984613180160522, + "learning_rate": 5e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7515483498573303, + "num_tokens": 320797021.0, + "step": 12394 + }, + { + "epoch": 1.3611904238963322, + "grad_norm": 2.12249755859375, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7240761518478394, + "num_tokens": 320818466.0, + "step": 12395 + }, + { + "epoch": 1.3613002415989457, + "grad_norm": 2.101675510406494, + "learning_rate": 5e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7511733174324036, + "num_tokens": 320838979.0, + "step": 12396 + }, + { + "epoch": 1.3614100593015594, + "grad_norm": 1.738645315170288, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7390718460083008, + "num_tokens": 320867610.0, + "step": 12397 + }, + { + "epoch": 1.3615198770041732, + "grad_norm": 1.9225839376449585, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7107608914375305, + "num_tokens": 320892890.0, + "step": 12398 + }, + { + "epoch": 1.3616296947067867, + "grad_norm": 1.891302466392517, + "learning_rate": 5e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.754715085029602, + "num_tokens": 320917152.0, + "step": 12399 + }, + { + "epoch": 1.3617395124094003, + "grad_norm": 1.8818325996398926, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7442618608474731, + "num_tokens": 320942750.0, + "step": 12400 + }, + { + "epoch": 1.361849330112014, + "grad_norm": 1.9799147844314575, + "learning_rate": 5e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7384668588638306, + "num_tokens": 320967294.0, + "step": 12401 + }, + { + "epoch": 1.3619591478146278, + "grad_norm": 1.8479410409927368, + "learning_rate": 5e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7242535948753357, + "num_tokens": 320995400.0, + "step": 12402 + }, + { + "epoch": 1.3620689655172413, + "grad_norm": 2.1295523643493652, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7341221570968628, + "num_tokens": 321016683.0, + "step": 12403 + }, + { + "epoch": 1.362178783219855, + "grad_norm": 1.657395601272583, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7302588820457458, + "num_tokens": 321046071.0, + "step": 12404 + }, + { + "epoch": 1.3622886009224686, + "grad_norm": 1.9082585573196411, + "learning_rate": 5e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7370054721832275, + "num_tokens": 321071144.0, + "step": 12405 + }, + { + "epoch": 1.3623984186250824, + "grad_norm": 1.8665492534637451, + "learning_rate": 5e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7016401290893555, + "num_tokens": 321098668.0, + "step": 12406 + }, + { + "epoch": 1.3625082363276961, + "grad_norm": 1.7556750774383545, + "learning_rate": 5e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7545769214630127, + "num_tokens": 321124669.0, + "step": 12407 + }, + { + "epoch": 1.3626180540303097, + "grad_norm": 1.9159677028656006, + "learning_rate": 5e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.71573805809021, + "num_tokens": 321154798.0, + "step": 12408 + }, + { + "epoch": 1.3627278717329234, + "grad_norm": 1.7161983251571655, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7185802459716797, + "num_tokens": 321185271.0, + "step": 12409 + }, + { + "epoch": 1.362837689435537, + "grad_norm": 1.6517871618270874, + "learning_rate": 5e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7363308072090149, + "num_tokens": 321214683.0, + "step": 12410 + }, + { + "epoch": 1.3629475071381507, + "grad_norm": 1.6590447425842285, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7263177633285522, + "num_tokens": 321245310.0, + "step": 12411 + }, + { + "epoch": 1.3630573248407643, + "grad_norm": 1.9836037158966064, + "learning_rate": 5e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7622647285461426, + "num_tokens": 321266848.0, + "step": 12412 + }, + { + "epoch": 1.363167142543378, + "grad_norm": 1.7468271255493164, + "learning_rate": 5e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7310230135917664, + "num_tokens": 321294316.0, + "step": 12413 + }, + { + "epoch": 1.3632769602459915, + "grad_norm": 1.8579925298690796, + "learning_rate": 5e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7197070717811584, + "num_tokens": 321319398.0, + "step": 12414 + }, + { + "epoch": 1.3633867779486053, + "grad_norm": 1.9619808197021484, + "learning_rate": 5e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7190526723861694, + "num_tokens": 321343837.0, + "step": 12415 + }, + { + "epoch": 1.363496595651219, + "grad_norm": 1.9059258699417114, + "learning_rate": 5e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7208415269851685, + "num_tokens": 321369606.0, + "step": 12416 + }, + { + "epoch": 1.3636064133538326, + "grad_norm": 2.102043390274048, + "learning_rate": 5e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7498770952224731, + "num_tokens": 321389583.0, + "step": 12417 + }, + { + "epoch": 1.3637162310564463, + "grad_norm": 1.9197726249694824, + "learning_rate": 5e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7189973592758179, + "num_tokens": 321415057.0, + "step": 12418 + }, + { + "epoch": 1.3638260487590599, + "grad_norm": 1.722259521484375, + "learning_rate": 5e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7128390073776245, + "num_tokens": 321447675.0, + "step": 12419 + }, + { + "epoch": 1.3639358664616736, + "grad_norm": 1.9937591552734375, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7488170862197876, + "num_tokens": 321468958.0, + "step": 12420 + }, + { + "epoch": 1.3640456841642874, + "grad_norm": 1.9026455879211426, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7572932839393616, + "num_tokens": 321489644.0, + "step": 12421 + }, + { + "epoch": 1.364155501866901, + "grad_norm": 1.8530325889587402, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7213187217712402, + "num_tokens": 321518080.0, + "step": 12422 + }, + { + "epoch": 1.3642653195695145, + "grad_norm": 2.045121669769287, + "learning_rate": 5e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.747134268283844, + "num_tokens": 321539533.0, + "step": 12423 + }, + { + "epoch": 1.3643751372721282, + "grad_norm": 1.882331132888794, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7287979125976562, + "num_tokens": 321570333.0, + "step": 12424 + }, + { + "epoch": 1.364484954974742, + "grad_norm": 1.831083059310913, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7180627584457397, + "num_tokens": 321597259.0, + "step": 12425 + }, + { + "epoch": 1.3645947726773555, + "grad_norm": 1.8097394704818726, + "learning_rate": 5e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7148492932319641, + "num_tokens": 321624086.0, + "step": 12426 + }, + { + "epoch": 1.3647045903799693, + "grad_norm": 2.0644209384918213, + "learning_rate": 5e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7395834922790527, + "num_tokens": 321646442.0, + "step": 12427 + }, + { + "epoch": 1.3648144080825828, + "grad_norm": 1.9702470302581787, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7343809604644775, + "num_tokens": 321670209.0, + "step": 12428 + }, + { + "epoch": 1.3649242257851966, + "grad_norm": 1.7105846405029297, + "learning_rate": 5e-06, + "loss": 0.774, + "mean_token_accuracy": 0.7573012113571167, + "num_tokens": 321695447.0, + "step": 12429 + }, + { + "epoch": 1.3650340434878103, + "grad_norm": 1.8228230476379395, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7394128441810608, + "num_tokens": 321723712.0, + "step": 12430 + }, + { + "epoch": 1.3651438611904239, + "grad_norm": 1.7855981588363647, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7467693090438843, + "num_tokens": 321749076.0, + "step": 12431 + }, + { + "epoch": 1.3652536788930376, + "grad_norm": 1.8216239213943481, + "learning_rate": 5e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7459228038787842, + "num_tokens": 321774827.0, + "step": 12432 + }, + { + "epoch": 1.3653634965956511, + "grad_norm": 1.942043423652649, + "learning_rate": 5e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7324018478393555, + "num_tokens": 321798315.0, + "step": 12433 + }, + { + "epoch": 1.365473314298265, + "grad_norm": 1.7285014390945435, + "learning_rate": 5e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7313182353973389, + "num_tokens": 321825243.0, + "step": 12434 + }, + { + "epoch": 1.3655831320008787, + "grad_norm": 1.9387657642364502, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7329158782958984, + "num_tokens": 321850752.0, + "step": 12435 + }, + { + "epoch": 1.3656929497034922, + "grad_norm": 1.8327085971832275, + "learning_rate": 5e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7075369358062744, + "num_tokens": 321879382.0, + "step": 12436 + }, + { + "epoch": 1.3658027674061057, + "grad_norm": 1.8467968702316284, + "learning_rate": 5e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7525668144226074, + "num_tokens": 321906852.0, + "step": 12437 + }, + { + "epoch": 1.3659125851087195, + "grad_norm": 1.8611257076263428, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7373141646385193, + "num_tokens": 321935182.0, + "step": 12438 + }, + { + "epoch": 1.3660224028113332, + "grad_norm": 1.9371798038482666, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7261182069778442, + "num_tokens": 321960672.0, + "step": 12439 + }, + { + "epoch": 1.3661322205139468, + "grad_norm": 1.79192054271698, + "learning_rate": 5e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7378058433532715, + "num_tokens": 321985891.0, + "step": 12440 + }, + { + "epoch": 1.3662420382165605, + "grad_norm": 1.7947230339050293, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7417114973068237, + "num_tokens": 322013238.0, + "step": 12441 + }, + { + "epoch": 1.366351855919174, + "grad_norm": 2.151840925216675, + "learning_rate": 5e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7548567056655884, + "num_tokens": 322034825.0, + "step": 12442 + }, + { + "epoch": 1.3664616736217878, + "grad_norm": 1.9092624187469482, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.744195818901062, + "num_tokens": 322057561.0, + "step": 12443 + }, + { + "epoch": 1.3665714913244016, + "grad_norm": 1.9495511054992676, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7205551862716675, + "num_tokens": 322081855.0, + "step": 12444 + }, + { + "epoch": 1.3666813090270151, + "grad_norm": 1.9180465936660767, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7553092241287231, + "num_tokens": 322103217.0, + "step": 12445 + }, + { + "epoch": 1.3667911267296289, + "grad_norm": 1.707746148109436, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7498272657394409, + "num_tokens": 322130619.0, + "step": 12446 + }, + { + "epoch": 1.3669009444322424, + "grad_norm": 1.9548479318618774, + "learning_rate": 5e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7377187013626099, + "num_tokens": 322153392.0, + "step": 12447 + }, + { + "epoch": 1.3670107621348562, + "grad_norm": 2.111422300338745, + "learning_rate": 5e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7361790537834167, + "num_tokens": 322176077.0, + "step": 12448 + }, + { + "epoch": 1.36712057983747, + "grad_norm": 1.9090726375579834, + "learning_rate": 5e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7218996286392212, + "num_tokens": 322203386.0, + "step": 12449 + }, + { + "epoch": 1.3672303975400835, + "grad_norm": 1.9643374681472778, + "learning_rate": 5e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.735625684261322, + "num_tokens": 322230513.0, + "step": 12450 + }, + { + "epoch": 1.367340215242697, + "grad_norm": 1.98796546459198, + "learning_rate": 5e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7345829606056213, + "num_tokens": 322257119.0, + "step": 12451 + }, + { + "epoch": 1.3674500329453108, + "grad_norm": 1.8249422311782837, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7345400452613831, + "num_tokens": 322285264.0, + "step": 12452 + }, + { + "epoch": 1.3675598506479245, + "grad_norm": 1.996768593788147, + "learning_rate": 5e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7308394908905029, + "num_tokens": 322307227.0, + "step": 12453 + }, + { + "epoch": 1.367669668350538, + "grad_norm": 1.8313801288604736, + "learning_rate": 5e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7261635065078735, + "num_tokens": 322331916.0, + "step": 12454 + }, + { + "epoch": 1.3677794860531518, + "grad_norm": 2.0316362380981445, + "learning_rate": 5e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7438962459564209, + "num_tokens": 322352236.0, + "step": 12455 + }, + { + "epoch": 1.3678893037557653, + "grad_norm": 2.036123514175415, + "learning_rate": 5e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.740983247756958, + "num_tokens": 322373194.0, + "step": 12456 + }, + { + "epoch": 1.367999121458379, + "grad_norm": 2.0574164390563965, + "learning_rate": 5e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7405003309249878, + "num_tokens": 322393727.0, + "step": 12457 + }, + { + "epoch": 1.3681089391609929, + "grad_norm": 2.040904998779297, + "learning_rate": 5e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7315924167633057, + "num_tokens": 322416154.0, + "step": 12458 + }, + { + "epoch": 1.3682187568636064, + "grad_norm": 2.1209981441497803, + "learning_rate": 5e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7497193217277527, + "num_tokens": 322434213.0, + "step": 12459 + }, + { + "epoch": 1.3683285745662201, + "grad_norm": 1.879102110862732, + "learning_rate": 5e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7420384287834167, + "num_tokens": 322457747.0, + "step": 12460 + }, + { + "epoch": 1.3684383922688337, + "grad_norm": 1.6035548448562622, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7163325548171997, + "num_tokens": 322491233.0, + "step": 12461 + }, + { + "epoch": 1.3685482099714474, + "grad_norm": 1.8420206308364868, + "learning_rate": 5e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7238724827766418, + "num_tokens": 322520532.0, + "step": 12462 + }, + { + "epoch": 1.3686580276740612, + "grad_norm": 1.8374419212341309, + "learning_rate": 5e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7389786243438721, + "num_tokens": 322548798.0, + "step": 12463 + }, + { + "epoch": 1.3687678453766747, + "grad_norm": 1.7450768947601318, + "learning_rate": 5e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7229534387588501, + "num_tokens": 322578422.0, + "step": 12464 + }, + { + "epoch": 1.3688776630792883, + "grad_norm": 1.7353731393814087, + "learning_rate": 5e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7187540531158447, + "num_tokens": 322608286.0, + "step": 12465 + }, + { + "epoch": 1.368987480781902, + "grad_norm": 1.9138379096984863, + "learning_rate": 5e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7430185675621033, + "num_tokens": 322633077.0, + "step": 12466 + }, + { + "epoch": 1.3690972984845158, + "grad_norm": 1.7391605377197266, + "learning_rate": 5e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7381293773651123, + "num_tokens": 322662558.0, + "step": 12467 + }, + { + "epoch": 1.3692071161871293, + "grad_norm": 1.950061321258545, + "learning_rate": 5e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7365083694458008, + "num_tokens": 322689755.0, + "step": 12468 + }, + { + "epoch": 1.369316933889743, + "grad_norm": 1.944746732711792, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7403532266616821, + "num_tokens": 322712651.0, + "step": 12469 + }, + { + "epoch": 1.3694267515923566, + "grad_norm": 1.7189816236495972, + "learning_rate": 5e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7072913646697998, + "num_tokens": 322745420.0, + "step": 12470 + }, + { + "epoch": 1.3695365692949704, + "grad_norm": 2.0462052822113037, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.727096438407898, + "num_tokens": 322767440.0, + "step": 12471 + }, + { + "epoch": 1.3696463869975841, + "grad_norm": 1.6653213500976562, + "learning_rate": 5e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7181717753410339, + "num_tokens": 322801879.0, + "step": 12472 + }, + { + "epoch": 1.3697562047001977, + "grad_norm": 1.9042799472808838, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7200161814689636, + "num_tokens": 322826587.0, + "step": 12473 + }, + { + "epoch": 1.3698660224028114, + "grad_norm": 1.908279299736023, + "learning_rate": 5e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7290968894958496, + "num_tokens": 322852561.0, + "step": 12474 + }, + { + "epoch": 1.369975840105425, + "grad_norm": 2.1457247734069824, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7340957522392273, + "num_tokens": 322874350.0, + "step": 12475 + }, + { + "epoch": 1.3700856578080387, + "grad_norm": 1.8832429647445679, + "learning_rate": 5e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7424644231796265, + "num_tokens": 322898604.0, + "step": 12476 + }, + { + "epoch": 1.3701954755106522, + "grad_norm": 1.9493299722671509, + "learning_rate": 5e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7561662197113037, + "num_tokens": 322922082.0, + "step": 12477 + }, + { + "epoch": 1.370305293213266, + "grad_norm": 1.9524809122085571, + "learning_rate": 5e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7613307237625122, + "num_tokens": 322944188.0, + "step": 12478 + }, + { + "epoch": 1.3704151109158795, + "grad_norm": 2.0074760913848877, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7434529066085815, + "num_tokens": 322966675.0, + "step": 12479 + }, + { + "epoch": 1.3705249286184933, + "grad_norm": 1.8611193895339966, + "learning_rate": 5e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7300635576248169, + "num_tokens": 322992410.0, + "step": 12480 + }, + { + "epoch": 1.370634746321107, + "grad_norm": 1.9813807010650635, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7191148996353149, + "num_tokens": 323017439.0, + "step": 12481 + }, + { + "epoch": 1.3707445640237206, + "grad_norm": 1.9213330745697021, + "learning_rate": 5e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7404482364654541, + "num_tokens": 323042097.0, + "step": 12482 + }, + { + "epoch": 1.3708543817263343, + "grad_norm": 2.014364719390869, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7267164587974548, + "num_tokens": 323064813.0, + "step": 12483 + }, + { + "epoch": 1.3709641994289479, + "grad_norm": 1.8687855005264282, + "learning_rate": 5e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7476971745491028, + "num_tokens": 323088558.0, + "step": 12484 + }, + { + "epoch": 1.3710740171315616, + "grad_norm": 1.8948936462402344, + "learning_rate": 5e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7471498250961304, + "num_tokens": 323112679.0, + "step": 12485 + }, + { + "epoch": 1.3711838348341754, + "grad_norm": 1.6562026739120483, + "learning_rate": 5e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7134619951248169, + "num_tokens": 323144851.0, + "step": 12486 + }, + { + "epoch": 1.371293652536789, + "grad_norm": 2.188009262084961, + "learning_rate": 5e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7740428447723389, + "num_tokens": 323163048.0, + "step": 12487 + }, + { + "epoch": 1.3714034702394025, + "grad_norm": 1.8116123676300049, + "learning_rate": 5e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7208088636398315, + "num_tokens": 323192478.0, + "step": 12488 + }, + { + "epoch": 1.3715132879420162, + "grad_norm": 1.898490071296692, + "learning_rate": 5e-06, + "loss": 0.814, + "mean_token_accuracy": 0.7322497367858887, + "num_tokens": 323215981.0, + "step": 12489 + }, + { + "epoch": 1.37162310564463, + "grad_norm": 1.9564992189407349, + "learning_rate": 5e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7551237344741821, + "num_tokens": 323238209.0, + "step": 12490 + }, + { + "epoch": 1.3717329233472435, + "grad_norm": 1.9110709428787231, + "learning_rate": 5e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7226738929748535, + "num_tokens": 323263667.0, + "step": 12491 + }, + { + "epoch": 1.3718427410498573, + "grad_norm": 1.8051838874816895, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7318399548530579, + "num_tokens": 323291785.0, + "step": 12492 + }, + { + "epoch": 1.3719525587524708, + "grad_norm": 1.9161216020584106, + "learning_rate": 5e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7383396625518799, + "num_tokens": 323317206.0, + "step": 12493 + }, + { + "epoch": 1.3720623764550846, + "grad_norm": 1.9779025316238403, + "learning_rate": 5e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7276086807250977, + "num_tokens": 323343841.0, + "step": 12494 + }, + { + "epoch": 1.3721721941576983, + "grad_norm": 2.071150541305542, + "learning_rate": 5e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7462189197540283, + "num_tokens": 323364773.0, + "step": 12495 + }, + { + "epoch": 1.3722820118603118, + "grad_norm": 1.811477780342102, + "learning_rate": 5e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7344803810119629, + "num_tokens": 323391031.0, + "step": 12496 + }, + { + "epoch": 1.3723918295629256, + "grad_norm": 1.7209917306900024, + "learning_rate": 5e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7461499571800232, + "num_tokens": 323420233.0, + "step": 12497 + }, + { + "epoch": 1.3725016472655391, + "grad_norm": 1.7197239398956299, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7294995784759521, + "num_tokens": 323449019.0, + "step": 12498 + }, + { + "epoch": 1.372611464968153, + "grad_norm": 2.006542682647705, + "learning_rate": 5e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7448067665100098, + "num_tokens": 323470182.0, + "step": 12499 + }, + { + "epoch": 1.3727212826707667, + "grad_norm": 2.1903207302093506, + "learning_rate": 5e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7477757930755615, + "num_tokens": 323489776.0, + "step": 12500 + }, + { + "epoch": 1.3728311003733802, + "grad_norm": 1.8335152864456177, + "learning_rate": 5e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7062471508979797, + "num_tokens": 323518801.0, + "step": 12501 + }, + { + "epoch": 1.3729409180759937, + "grad_norm": 1.8311716318130493, + "learning_rate": 5e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7446373701095581, + "num_tokens": 323543488.0, + "step": 12502 + }, + { + "epoch": 1.3730507357786075, + "grad_norm": 1.6642624139785767, + "learning_rate": 5e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7630892992019653, + "num_tokens": 323574019.0, + "step": 12503 + }, + { + "epoch": 1.3731605534812212, + "grad_norm": 1.6863558292388916, + "learning_rate": 5e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.739704966545105, + "num_tokens": 323603476.0, + "step": 12504 + }, + { + "epoch": 1.3732703711838348, + "grad_norm": 2.0261898040771484, + "learning_rate": 5e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.73970627784729, + "num_tokens": 323626935.0, + "step": 12505 + }, + { + "epoch": 1.3733801888864485, + "grad_norm": 1.9008334875106812, + "learning_rate": 5e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7551136612892151, + "num_tokens": 323650977.0, + "step": 12506 + }, + { + "epoch": 1.373490006589062, + "grad_norm": 2.046900749206543, + "learning_rate": 5e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7222493886947632, + "num_tokens": 323673108.0, + "step": 12507 + }, + { + "epoch": 1.3735998242916758, + "grad_norm": 1.8523510694503784, + "learning_rate": 5e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7450915575027466, + "num_tokens": 323698156.0, + "step": 12508 + }, + { + "epoch": 1.3737096419942896, + "grad_norm": 2.0718212127685547, + "learning_rate": 5e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7433620691299438, + "num_tokens": 323718550.0, + "step": 12509 + }, + { + "epoch": 1.3738194596969031, + "grad_norm": 1.554124116897583, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7271742820739746, + "num_tokens": 323752755.0, + "step": 12510 + }, + { + "epoch": 1.3739292773995169, + "grad_norm": 2.0060770511627197, + "learning_rate": 5e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.6983646154403687, + "num_tokens": 323782183.0, + "step": 12511 + }, + { + "epoch": 1.3740390951021304, + "grad_norm": 1.96808660030365, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.744255781173706, + "num_tokens": 323807315.0, + "step": 12512 + }, + { + "epoch": 1.3741489128047442, + "grad_norm": 1.7813136577606201, + "learning_rate": 5e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7382180094718933, + "num_tokens": 323831667.0, + "step": 12513 + }, + { + "epoch": 1.374258730507358, + "grad_norm": 1.6707005500793457, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7340764999389648, + "num_tokens": 323862246.0, + "step": 12514 + }, + { + "epoch": 1.3743685482099715, + "grad_norm": 1.9339261054992676, + "learning_rate": 5e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7361997365951538, + "num_tokens": 323884530.0, + "step": 12515 + }, + { + "epoch": 1.374478365912585, + "grad_norm": 1.895599126815796, + "learning_rate": 5e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7075271010398865, + "num_tokens": 323910096.0, + "step": 12516 + }, + { + "epoch": 1.3745881836151987, + "grad_norm": 1.9429887533187866, + "learning_rate": 5e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7333111763000488, + "num_tokens": 323932960.0, + "step": 12517 + }, + { + "epoch": 1.3746980013178125, + "grad_norm": 2.153451442718506, + "learning_rate": 5e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7100684642791748, + "num_tokens": 323958047.0, + "step": 12518 + }, + { + "epoch": 1.374807819020426, + "grad_norm": 1.86574387550354, + "learning_rate": 5e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7462548017501831, + "num_tokens": 323981615.0, + "step": 12519 + }, + { + "epoch": 1.3749176367230398, + "grad_norm": 1.8118832111358643, + "learning_rate": 5e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7211259603500366, + "num_tokens": 324010589.0, + "step": 12520 + }, + { + "epoch": 1.3750274544256533, + "grad_norm": 1.9253511428833008, + "learning_rate": 5e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7196606993675232, + "num_tokens": 324035084.0, + "step": 12521 + }, + { + "epoch": 1.375137272128267, + "grad_norm": 1.7492536306381226, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7342233657836914, + "num_tokens": 324062415.0, + "step": 12522 + }, + { + "epoch": 1.3752470898308808, + "grad_norm": 1.80166494846344, + "learning_rate": 5e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.748241662979126, + "num_tokens": 324089036.0, + "step": 12523 + }, + { + "epoch": 1.3753569075334944, + "grad_norm": 1.7559499740600586, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.74233478307724, + "num_tokens": 324115967.0, + "step": 12524 + }, + { + "epoch": 1.3754667252361081, + "grad_norm": 1.806251883506775, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7342312335968018, + "num_tokens": 324141085.0, + "step": 12525 + }, + { + "epoch": 1.3755765429387217, + "grad_norm": 1.7247148752212524, + "learning_rate": 5e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7262063026428223, + "num_tokens": 324171066.0, + "step": 12526 + }, + { + "epoch": 1.3756863606413354, + "grad_norm": 1.8847107887268066, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7422543168067932, + "num_tokens": 324195415.0, + "step": 12527 + }, + { + "epoch": 1.3757961783439492, + "grad_norm": 1.8805298805236816, + "learning_rate": 5e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.736111044883728, + "num_tokens": 324220397.0, + "step": 12528 + }, + { + "epoch": 1.3759059960465627, + "grad_norm": 1.7781789302825928, + "learning_rate": 5e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7287986278533936, + "num_tokens": 324246321.0, + "step": 12529 + }, + { + "epoch": 1.3760158137491763, + "grad_norm": 1.848881483078003, + "learning_rate": 5e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7493736743927002, + "num_tokens": 324270819.0, + "step": 12530 + }, + { + "epoch": 1.37612563145179, + "grad_norm": 1.8911445140838623, + "learning_rate": 5e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7269195318222046, + "num_tokens": 324295359.0, + "step": 12531 + }, + { + "epoch": 1.3762354491544038, + "grad_norm": 1.993688702583313, + "learning_rate": 5e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7406247854232788, + "num_tokens": 324317972.0, + "step": 12532 + }, + { + "epoch": 1.3763452668570173, + "grad_norm": 1.8524061441421509, + "learning_rate": 5e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.726067304611206, + "num_tokens": 324343919.0, + "step": 12533 + }, + { + "epoch": 1.376455084559631, + "grad_norm": 2.1580188274383545, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7448037266731262, + "num_tokens": 324362156.0, + "step": 12534 + }, + { + "epoch": 1.3765649022622446, + "grad_norm": 1.6202939748764038, + "learning_rate": 5e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7238768935203552, + "num_tokens": 324394525.0, + "step": 12535 + }, + { + "epoch": 1.3766747199648584, + "grad_norm": 1.7392475605010986, + "learning_rate": 5e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7277502417564392, + "num_tokens": 324424014.0, + "step": 12536 + }, + { + "epoch": 1.376784537667472, + "grad_norm": 1.867655873298645, + "learning_rate": 5e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7263708114624023, + "num_tokens": 324451772.0, + "step": 12537 + }, + { + "epoch": 1.3768943553700856, + "grad_norm": 1.789079189300537, + "learning_rate": 5e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.744475245475769, + "num_tokens": 324480821.0, + "step": 12538 + }, + { + "epoch": 1.3770041730726994, + "grad_norm": 1.8362261056900024, + "learning_rate": 5e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7201422452926636, + "num_tokens": 324509541.0, + "step": 12539 + }, + { + "epoch": 1.377113990775313, + "grad_norm": 1.773856520652771, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7258567810058594, + "num_tokens": 324536575.0, + "step": 12540 + }, + { + "epoch": 1.3772238084779267, + "grad_norm": 2.2314581871032715, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7324389815330505, + "num_tokens": 324557991.0, + "step": 12541 + }, + { + "epoch": 1.3773336261805402, + "grad_norm": 1.6316182613372803, + "learning_rate": 5e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.741042971611023, + "num_tokens": 324589718.0, + "step": 12542 + }, + { + "epoch": 1.377443443883154, + "grad_norm": 2.011592149734497, + "learning_rate": 5e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7512697577476501, + "num_tokens": 324612210.0, + "step": 12543 + }, + { + "epoch": 1.3775532615857675, + "grad_norm": 1.7906631231307983, + "learning_rate": 5e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7265307903289795, + "num_tokens": 324637378.0, + "step": 12544 + }, + { + "epoch": 1.3776630792883813, + "grad_norm": 2.146605968475342, + "learning_rate": 5e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7397674322128296, + "num_tokens": 324659260.0, + "step": 12545 + }, + { + "epoch": 1.377772896990995, + "grad_norm": 1.673312783241272, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7126384377479553, + "num_tokens": 324690281.0, + "step": 12546 + }, + { + "epoch": 1.3778827146936086, + "grad_norm": 1.761521339416504, + "learning_rate": 5e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7296321392059326, + "num_tokens": 324717505.0, + "step": 12547 + }, + { + "epoch": 1.3779925323962223, + "grad_norm": 2.019699811935425, + "learning_rate": 5e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7459819316864014, + "num_tokens": 324738801.0, + "step": 12548 + }, + { + "epoch": 1.3781023500988359, + "grad_norm": 1.7195261716842651, + "learning_rate": 5e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7274715900421143, + "num_tokens": 324769060.0, + "step": 12549 + }, + { + "epoch": 1.3782121678014496, + "grad_norm": 1.9910414218902588, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7499030828475952, + "num_tokens": 324792221.0, + "step": 12550 + }, + { + "epoch": 1.3783219855040634, + "grad_norm": 1.974827527999878, + "learning_rate": 5e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7307132482528687, + "num_tokens": 324815897.0, + "step": 12551 + }, + { + "epoch": 1.378431803206677, + "grad_norm": 2.0140552520751953, + "learning_rate": 5e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7257207632064819, + "num_tokens": 324840157.0, + "step": 12552 + }, + { + "epoch": 1.3785416209092904, + "grad_norm": 1.9228979349136353, + "learning_rate": 5e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7336330413818359, + "num_tokens": 324866369.0, + "step": 12553 + }, + { + "epoch": 1.3786514386119042, + "grad_norm": 1.900976300239563, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7498375177383423, + "num_tokens": 324890246.0, + "step": 12554 + }, + { + "epoch": 1.378761256314518, + "grad_norm": 1.7229338884353638, + "learning_rate": 5e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7574199438095093, + "num_tokens": 324918048.0, + "step": 12555 + }, + { + "epoch": 1.3788710740171315, + "grad_norm": 1.9248548746109009, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7203688621520996, + "num_tokens": 324943608.0, + "step": 12556 + }, + { + "epoch": 1.3789808917197452, + "grad_norm": 1.8034964799880981, + "learning_rate": 5e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7216932773590088, + "num_tokens": 324971629.0, + "step": 12557 + }, + { + "epoch": 1.3790907094223588, + "grad_norm": 1.845300316810608, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7282006740570068, + "num_tokens": 324999827.0, + "step": 12558 + }, + { + "epoch": 1.3792005271249725, + "grad_norm": 1.7476915121078491, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7305959463119507, + "num_tokens": 325029438.0, + "step": 12559 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 1.89493727684021, + "learning_rate": 5e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7672482132911682, + "num_tokens": 325050931.0, + "step": 12560 + }, + { + "epoch": 1.3794201625301998, + "grad_norm": 1.8035728931427002, + "learning_rate": 5e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7290222644805908, + "num_tokens": 325077200.0, + "step": 12561 + }, + { + "epoch": 1.3795299802328136, + "grad_norm": 1.8064504861831665, + "learning_rate": 5e-06, + "loss": 0.7809, + "mean_token_accuracy": 0.7443692088127136, + "num_tokens": 325102183.0, + "step": 12562 + }, + { + "epoch": 1.3796397979354271, + "grad_norm": 1.8457471132278442, + "learning_rate": 5e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7218326330184937, + "num_tokens": 325129490.0, + "step": 12563 + }, + { + "epoch": 1.3797496156380409, + "grad_norm": 1.9940580129623413, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7194879055023193, + "num_tokens": 325150955.0, + "step": 12564 + }, + { + "epoch": 1.3798594333406546, + "grad_norm": 2.1288628578186035, + "learning_rate": 5e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7440246343612671, + "num_tokens": 325172690.0, + "step": 12565 + }, + { + "epoch": 1.3799692510432682, + "grad_norm": 1.83817720413208, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7245470285415649, + "num_tokens": 325199552.0, + "step": 12566 + }, + { + "epoch": 1.3800790687458817, + "grad_norm": 2.0146706104278564, + "learning_rate": 5e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7084213495254517, + "num_tokens": 325223232.0, + "step": 12567 + }, + { + "epoch": 1.3801888864484955, + "grad_norm": 1.743270993232727, + "learning_rate": 5e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.694993257522583, + "num_tokens": 325252875.0, + "step": 12568 + }, + { + "epoch": 1.3802987041511092, + "grad_norm": 1.893511176109314, + "learning_rate": 5e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.739885687828064, + "num_tokens": 325278203.0, + "step": 12569 + }, + { + "epoch": 1.3804085218537228, + "grad_norm": 2.11592435836792, + "learning_rate": 5e-06, + "loss": 0.814, + "mean_token_accuracy": 0.7386549711227417, + "num_tokens": 325297381.0, + "step": 12570 + }, + { + "epoch": 1.3805183395563365, + "grad_norm": 1.790653944015503, + "learning_rate": 5e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.75963294506073, + "num_tokens": 325322179.0, + "step": 12571 + }, + { + "epoch": 1.38062815725895, + "grad_norm": 1.8989789485931396, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7475159168243408, + "num_tokens": 325344898.0, + "step": 12572 + }, + { + "epoch": 1.3807379749615638, + "grad_norm": 1.670562982559204, + "learning_rate": 5e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7284420728683472, + "num_tokens": 325373915.0, + "step": 12573 + }, + { + "epoch": 1.3808477926641776, + "grad_norm": 1.8289401531219482, + "learning_rate": 5e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7133520841598511, + "num_tokens": 325399175.0, + "step": 12574 + }, + { + "epoch": 1.380957610366791, + "grad_norm": 1.7357888221740723, + "learning_rate": 5e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7186844348907471, + "num_tokens": 325427777.0, + "step": 12575 + }, + { + "epoch": 1.3810674280694049, + "grad_norm": 1.902259349822998, + "learning_rate": 5e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7109712362289429, + "num_tokens": 325454127.0, + "step": 12576 + }, + { + "epoch": 1.3811772457720184, + "grad_norm": 1.7960983514785767, + "learning_rate": 5e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7500404119491577, + "num_tokens": 325478345.0, + "step": 12577 + }, + { + "epoch": 1.3812870634746321, + "grad_norm": 1.7374780178070068, + "learning_rate": 5e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7086232900619507, + "num_tokens": 325507544.0, + "step": 12578 + }, + { + "epoch": 1.381396881177246, + "grad_norm": 1.8992786407470703, + "learning_rate": 5e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7413098216056824, + "num_tokens": 325532162.0, + "step": 12579 + }, + { + "epoch": 1.3815066988798594, + "grad_norm": 1.8275706768035889, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7214167714118958, + "num_tokens": 325559741.0, + "step": 12580 + }, + { + "epoch": 1.381616516582473, + "grad_norm": 1.8244116306304932, + "learning_rate": 5e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7401111125946045, + "num_tokens": 325585959.0, + "step": 12581 + }, + { + "epoch": 1.3817263342850867, + "grad_norm": 1.9695899486541748, + "learning_rate": 5e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7469027042388916, + "num_tokens": 325609104.0, + "step": 12582 + }, + { + "epoch": 1.3818361519877005, + "grad_norm": 1.7164050340652466, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7256703972816467, + "num_tokens": 325637850.0, + "step": 12583 + }, + { + "epoch": 1.381945969690314, + "grad_norm": 1.6059552431106567, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7382031679153442, + "num_tokens": 325669561.0, + "step": 12584 + }, + { + "epoch": 1.3820557873929278, + "grad_norm": 1.836728572845459, + "learning_rate": 5e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7268610000610352, + "num_tokens": 325694786.0, + "step": 12585 + }, + { + "epoch": 1.3821656050955413, + "grad_norm": 1.7014318704605103, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7372832298278809, + "num_tokens": 325725310.0, + "step": 12586 + }, + { + "epoch": 1.382275422798155, + "grad_norm": 2.093226432800293, + "learning_rate": 5e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7402721643447876, + "num_tokens": 325745335.0, + "step": 12587 + }, + { + "epoch": 1.3823852405007688, + "grad_norm": 2.0074474811553955, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7250001430511475, + "num_tokens": 325771151.0, + "step": 12588 + }, + { + "epoch": 1.3824950582033824, + "grad_norm": 1.7371182441711426, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7306510210037231, + "num_tokens": 325800435.0, + "step": 12589 + }, + { + "epoch": 1.3826048759059961, + "grad_norm": 2.089996814727783, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7409511804580688, + "num_tokens": 325821803.0, + "step": 12590 + }, + { + "epoch": 1.3827146936086097, + "grad_norm": 2.0005593299865723, + "learning_rate": 5e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7630041837692261, + "num_tokens": 325844234.0, + "step": 12591 + }, + { + "epoch": 1.3828245113112234, + "grad_norm": 2.0706586837768555, + "learning_rate": 5e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7308118939399719, + "num_tokens": 325866779.0, + "step": 12592 + }, + { + "epoch": 1.382934329013837, + "grad_norm": 1.5799269676208496, + "learning_rate": 5e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7307038903236389, + "num_tokens": 325900148.0, + "step": 12593 + }, + { + "epoch": 1.3830441467164507, + "grad_norm": 1.9542149305343628, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7255041599273682, + "num_tokens": 325926613.0, + "step": 12594 + }, + { + "epoch": 1.3831539644190642, + "grad_norm": 1.8899133205413818, + "learning_rate": 5e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7194706201553345, + "num_tokens": 325952736.0, + "step": 12595 + }, + { + "epoch": 1.383263782121678, + "grad_norm": 1.8952205181121826, + "learning_rate": 5e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7356078624725342, + "num_tokens": 325977899.0, + "step": 12596 + }, + { + "epoch": 1.3833735998242918, + "grad_norm": 1.9063133001327515, + "learning_rate": 5e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7294518351554871, + "num_tokens": 326002376.0, + "step": 12597 + }, + { + "epoch": 1.3834834175269053, + "grad_norm": 1.7551932334899902, + "learning_rate": 5e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7067815065383911, + "num_tokens": 326032942.0, + "step": 12598 + }, + { + "epoch": 1.383593235229519, + "grad_norm": 1.8151167631149292, + "learning_rate": 5e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7161341905593872, + "num_tokens": 326061862.0, + "step": 12599 + }, + { + "epoch": 1.3837030529321326, + "grad_norm": 1.6945968866348267, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7432194948196411, + "num_tokens": 326089182.0, + "step": 12600 + }, + { + "epoch": 1.3838128706347463, + "grad_norm": 1.9087419509887695, + "learning_rate": 5e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7589204907417297, + "num_tokens": 326110570.0, + "step": 12601 + }, + { + "epoch": 1.38392268833736, + "grad_norm": 1.895871877670288, + "learning_rate": 5e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7197602987289429, + "num_tokens": 326138674.0, + "step": 12602 + }, + { + "epoch": 1.3840325060399736, + "grad_norm": 2.151439666748047, + "learning_rate": 5e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7394818067550659, + "num_tokens": 326158707.0, + "step": 12603 + }, + { + "epoch": 1.3841423237425874, + "grad_norm": 1.758252739906311, + "learning_rate": 5e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7356221675872803, + "num_tokens": 326188238.0, + "step": 12604 + }, + { + "epoch": 1.384252141445201, + "grad_norm": 1.6512017250061035, + "learning_rate": 5e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7124446630477905, + "num_tokens": 326222499.0, + "step": 12605 + }, + { + "epoch": 1.3843619591478147, + "grad_norm": 1.787875771522522, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7273523211479187, + "num_tokens": 326251367.0, + "step": 12606 + }, + { + "epoch": 1.3844717768504282, + "grad_norm": 1.779935359954834, + "learning_rate": 5e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7225055694580078, + "num_tokens": 326276945.0, + "step": 12607 + }, + { + "epoch": 1.384581594553042, + "grad_norm": 1.7860350608825684, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.73377525806427, + "num_tokens": 326305228.0, + "step": 12608 + }, + { + "epoch": 1.3846914122556555, + "grad_norm": 2.037595510482788, + "learning_rate": 5e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7372314929962158, + "num_tokens": 326325802.0, + "step": 12609 + }, + { + "epoch": 1.3848012299582693, + "grad_norm": 1.9448437690734863, + "learning_rate": 5e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7417216300964355, + "num_tokens": 326350090.0, + "step": 12610 + }, + { + "epoch": 1.384911047660883, + "grad_norm": 1.7206721305847168, + "learning_rate": 5e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7152328491210938, + "num_tokens": 326382222.0, + "step": 12611 + }, + { + "epoch": 1.3850208653634966, + "grad_norm": 1.8379418849945068, + "learning_rate": 5e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7322624921798706, + "num_tokens": 326407163.0, + "step": 12612 + }, + { + "epoch": 1.3851306830661103, + "grad_norm": 1.7148557901382446, + "learning_rate": 5e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7568725347518921, + "num_tokens": 326431928.0, + "step": 12613 + }, + { + "epoch": 1.3852405007687238, + "grad_norm": 1.80509614944458, + "learning_rate": 5e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7366639971733093, + "num_tokens": 326460283.0, + "step": 12614 + }, + { + "epoch": 1.3853503184713376, + "grad_norm": 1.8144149780273438, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7176679372787476, + "num_tokens": 326488117.0, + "step": 12615 + }, + { + "epoch": 1.3854601361739514, + "grad_norm": 1.7292083501815796, + "learning_rate": 5e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7273784279823303, + "num_tokens": 326516623.0, + "step": 12616 + }, + { + "epoch": 1.385569953876565, + "grad_norm": 2.157167434692383, + "learning_rate": 5e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7383964657783508, + "num_tokens": 326536528.0, + "step": 12617 + }, + { + "epoch": 1.3856797715791784, + "grad_norm": 1.9105591773986816, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7319309711456299, + "num_tokens": 326560988.0, + "step": 12618 + }, + { + "epoch": 1.3857895892817922, + "grad_norm": 1.822208046913147, + "learning_rate": 5e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7218747138977051, + "num_tokens": 326588020.0, + "step": 12619 + }, + { + "epoch": 1.385899406984406, + "grad_norm": 1.8866816759109497, + "learning_rate": 5e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7177739143371582, + "num_tokens": 326614556.0, + "step": 12620 + }, + { + "epoch": 1.3860092246870195, + "grad_norm": 1.8715721368789673, + "learning_rate": 5e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7084730863571167, + "num_tokens": 326642011.0, + "step": 12621 + }, + { + "epoch": 1.3861190423896332, + "grad_norm": 2.1051900386810303, + "learning_rate": 5e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7482194900512695, + "num_tokens": 326662955.0, + "step": 12622 + }, + { + "epoch": 1.3862288600922468, + "grad_norm": 1.7467912435531616, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7197483777999878, + "num_tokens": 326689759.0, + "step": 12623 + }, + { + "epoch": 1.3863386777948605, + "grad_norm": 1.7552192211151123, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7309725284576416, + "num_tokens": 326717468.0, + "step": 12624 + }, + { + "epoch": 1.3864484954974743, + "grad_norm": 2.087738513946533, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.732846736907959, + "num_tokens": 326740588.0, + "step": 12625 + }, + { + "epoch": 1.3865583132000878, + "grad_norm": 1.8646565675735474, + "learning_rate": 5e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.770538330078125, + "num_tokens": 326763249.0, + "step": 12626 + }, + { + "epoch": 1.3866681309027016, + "grad_norm": 1.7385038137435913, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7253023982048035, + "num_tokens": 326791774.0, + "step": 12627 + }, + { + "epoch": 1.3867779486053151, + "grad_norm": 1.9020055532455444, + "learning_rate": 5e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7543927431106567, + "num_tokens": 326813837.0, + "step": 12628 + }, + { + "epoch": 1.3868877663079289, + "grad_norm": 1.787902593612671, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7352343201637268, + "num_tokens": 326841804.0, + "step": 12629 + }, + { + "epoch": 1.3869975840105426, + "grad_norm": 1.8348809480667114, + "learning_rate": 5e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7294888496398926, + "num_tokens": 326870020.0, + "step": 12630 + }, + { + "epoch": 1.3871074017131562, + "grad_norm": 2.0430004596710205, + "learning_rate": 5e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7362778186798096, + "num_tokens": 326894748.0, + "step": 12631 + }, + { + "epoch": 1.3872172194157697, + "grad_norm": 1.951069951057434, + "learning_rate": 5e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7131193280220032, + "num_tokens": 326920229.0, + "step": 12632 + }, + { + "epoch": 1.3873270371183835, + "grad_norm": 1.6575113534927368, + "learning_rate": 5e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.6895842552185059, + "num_tokens": 326953082.0, + "step": 12633 + }, + { + "epoch": 1.3874368548209972, + "grad_norm": 1.6228443384170532, + "learning_rate": 5e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.719052791595459, + "num_tokens": 326989663.0, + "step": 12634 + }, + { + "epoch": 1.3875466725236107, + "grad_norm": 1.9246678352355957, + "learning_rate": 5e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.743424654006958, + "num_tokens": 327015344.0, + "step": 12635 + }, + { + "epoch": 1.3876564902262245, + "grad_norm": 1.8841890096664429, + "learning_rate": 5e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7326653003692627, + "num_tokens": 327041790.0, + "step": 12636 + }, + { + "epoch": 1.387766307928838, + "grad_norm": 1.8404234647750854, + "learning_rate": 5e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7444852590560913, + "num_tokens": 327067389.0, + "step": 12637 + }, + { + "epoch": 1.3878761256314518, + "grad_norm": 1.715722680091858, + "learning_rate": 5e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.732437252998352, + "num_tokens": 327097895.0, + "step": 12638 + }, + { + "epoch": 1.3879859433340656, + "grad_norm": 1.7589305639266968, + "learning_rate": 5e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7237352728843689, + "num_tokens": 327125106.0, + "step": 12639 + }, + { + "epoch": 1.388095761036679, + "grad_norm": 1.6560219526290894, + "learning_rate": 5e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7232844233512878, + "num_tokens": 327157484.0, + "step": 12640 + }, + { + "epoch": 1.3882055787392928, + "grad_norm": 1.8665695190429688, + "learning_rate": 5e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7310439348220825, + "num_tokens": 327184735.0, + "step": 12641 + }, + { + "epoch": 1.3883153964419064, + "grad_norm": 2.010040760040283, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.72868812084198, + "num_tokens": 327206677.0, + "step": 12642 + }, + { + "epoch": 1.3884252141445201, + "grad_norm": 2.1772377490997314, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7355716824531555, + "num_tokens": 327225440.0, + "step": 12643 + }, + { + "epoch": 1.388535031847134, + "grad_norm": 1.8706003427505493, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7474700808525085, + "num_tokens": 327252190.0, + "step": 12644 + }, + { + "epoch": 1.3886448495497474, + "grad_norm": 1.8350499868392944, + "learning_rate": 5e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7221293449401855, + "num_tokens": 327278409.0, + "step": 12645 + }, + { + "epoch": 1.388754667252361, + "grad_norm": 1.8830318450927734, + "learning_rate": 5e-06, + "loss": 0.927, + "mean_token_accuracy": 0.711197555065155, + "num_tokens": 327307785.0, + "step": 12646 + }, + { + "epoch": 1.3888644849549747, + "grad_norm": 1.8284441232681274, + "learning_rate": 5e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7190653085708618, + "num_tokens": 327336867.0, + "step": 12647 + }, + { + "epoch": 1.3889743026575885, + "grad_norm": 2.002255916595459, + "learning_rate": 5e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7303359508514404, + "num_tokens": 327358657.0, + "step": 12648 + }, + { + "epoch": 1.389084120360202, + "grad_norm": 1.7972426414489746, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7613110542297363, + "num_tokens": 327383792.0, + "step": 12649 + }, + { + "epoch": 1.3891939380628158, + "grad_norm": 1.8187122344970703, + "learning_rate": 5e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7323678135871887, + "num_tokens": 327410242.0, + "step": 12650 + }, + { + "epoch": 1.3893037557654293, + "grad_norm": 1.763751745223999, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7163230776786804, + "num_tokens": 327440161.0, + "step": 12651 + }, + { + "epoch": 1.389413573468043, + "grad_norm": 1.8165568113327026, + "learning_rate": 5e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7454068660736084, + "num_tokens": 327465652.0, + "step": 12652 + }, + { + "epoch": 1.3895233911706568, + "grad_norm": 1.8733075857162476, + "learning_rate": 5e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7412919998168945, + "num_tokens": 327488606.0, + "step": 12653 + }, + { + "epoch": 1.3896332088732704, + "grad_norm": 1.8975791931152344, + "learning_rate": 5e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7203836441040039, + "num_tokens": 327513850.0, + "step": 12654 + }, + { + "epoch": 1.389743026575884, + "grad_norm": 1.8616340160369873, + "learning_rate": 5e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7220311164855957, + "num_tokens": 327540490.0, + "step": 12655 + }, + { + "epoch": 1.3898528442784976, + "grad_norm": 1.747451663017273, + "learning_rate": 5e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7202199101448059, + "num_tokens": 327569799.0, + "step": 12656 + }, + { + "epoch": 1.3899626619811114, + "grad_norm": 1.764162302017212, + "learning_rate": 5e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7321867942810059, + "num_tokens": 327595265.0, + "step": 12657 + }, + { + "epoch": 1.390072479683725, + "grad_norm": 1.6508982181549072, + "learning_rate": 5e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7246724367141724, + "num_tokens": 327625998.0, + "step": 12658 + }, + { + "epoch": 1.3901822973863387, + "grad_norm": 1.7888911962509155, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7663379907608032, + "num_tokens": 327650648.0, + "step": 12659 + }, + { + "epoch": 1.3902921150889522, + "grad_norm": 2.0704846382141113, + "learning_rate": 5e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.7471219301223755, + "num_tokens": 327672765.0, + "step": 12660 + }, + { + "epoch": 1.390401932791566, + "grad_norm": 1.9695067405700684, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7356505393981934, + "num_tokens": 327698137.0, + "step": 12661 + }, + { + "epoch": 1.3905117504941797, + "grad_norm": 1.8142434358596802, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7310874462127686, + "num_tokens": 327724222.0, + "step": 12662 + }, + { + "epoch": 1.3906215681967933, + "grad_norm": 1.7533483505249023, + "learning_rate": 5e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7282913327217102, + "num_tokens": 327753404.0, + "step": 12663 + }, + { + "epoch": 1.390731385899407, + "grad_norm": 2.192028522491455, + "learning_rate": 5e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7203391790390015, + "num_tokens": 327774654.0, + "step": 12664 + }, + { + "epoch": 1.3908412036020206, + "grad_norm": 2.0183651447296143, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7530208826065063, + "num_tokens": 327795482.0, + "step": 12665 + }, + { + "epoch": 1.3909510213046343, + "grad_norm": 1.7758201360702515, + "learning_rate": 5e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7449971437454224, + "num_tokens": 327822111.0, + "step": 12666 + }, + { + "epoch": 1.391060839007248, + "grad_norm": 1.8567719459533691, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7448675632476807, + "num_tokens": 327847125.0, + "step": 12667 + }, + { + "epoch": 1.3911706567098616, + "grad_norm": 1.6596702337265015, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7405635118484497, + "num_tokens": 327877037.0, + "step": 12668 + }, + { + "epoch": 1.3912804744124752, + "grad_norm": 1.9815905094146729, + "learning_rate": 5e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7336145043373108, + "num_tokens": 327900061.0, + "step": 12669 + }, + { + "epoch": 1.391390292115089, + "grad_norm": 1.8964155912399292, + "learning_rate": 5e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7357480525970459, + "num_tokens": 327924423.0, + "step": 12670 + }, + { + "epoch": 1.3915001098177027, + "grad_norm": 1.6548209190368652, + "learning_rate": 5e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7004234790802002, + "num_tokens": 327960638.0, + "step": 12671 + }, + { + "epoch": 1.3916099275203162, + "grad_norm": 2.007901191711426, + "learning_rate": 5e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7241361141204834, + "num_tokens": 327983493.0, + "step": 12672 + }, + { + "epoch": 1.39171974522293, + "grad_norm": 2.0261940956115723, + "learning_rate": 5e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7554581165313721, + "num_tokens": 328007096.0, + "step": 12673 + }, + { + "epoch": 1.3918295629255435, + "grad_norm": 1.7533146142959595, + "learning_rate": 5e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7389432191848755, + "num_tokens": 328034214.0, + "step": 12674 + }, + { + "epoch": 1.3919393806281573, + "grad_norm": 1.769668459892273, + "learning_rate": 5e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7261448502540588, + "num_tokens": 328062404.0, + "step": 12675 + }, + { + "epoch": 1.392049198330771, + "grad_norm": 1.7181198596954346, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7314988374710083, + "num_tokens": 328090510.0, + "step": 12676 + }, + { + "epoch": 1.3921590160333845, + "grad_norm": 2.110515594482422, + "learning_rate": 5e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7243720889091492, + "num_tokens": 328113570.0, + "step": 12677 + }, + { + "epoch": 1.3922688337359983, + "grad_norm": 2.1602530479431152, + "learning_rate": 5e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7568395137786865, + "num_tokens": 328132873.0, + "step": 12678 + }, + { + "epoch": 1.3923786514386118, + "grad_norm": 1.8970787525177002, + "learning_rate": 5e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7250444889068604, + "num_tokens": 328158547.0, + "step": 12679 + }, + { + "epoch": 1.3924884691412256, + "grad_norm": 1.8977464437484741, + "learning_rate": 5e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7325229644775391, + "num_tokens": 328184205.0, + "step": 12680 + }, + { + "epoch": 1.3925982868438394, + "grad_norm": 1.9032700061798096, + "learning_rate": 5e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7468510866165161, + "num_tokens": 328209482.0, + "step": 12681 + }, + { + "epoch": 1.3927081045464529, + "grad_norm": 1.726163625717163, + "learning_rate": 5e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7013422250747681, + "num_tokens": 328240229.0, + "step": 12682 + }, + { + "epoch": 1.3928179222490664, + "grad_norm": 1.7428333759307861, + "learning_rate": 5e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7176764011383057, + "num_tokens": 328271432.0, + "step": 12683 + }, + { + "epoch": 1.3929277399516802, + "grad_norm": 1.9721081256866455, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7372778654098511, + "num_tokens": 328293800.0, + "step": 12684 + }, + { + "epoch": 1.393037557654294, + "grad_norm": 1.9411284923553467, + "learning_rate": 5e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7251236438751221, + "num_tokens": 328319026.0, + "step": 12685 + }, + { + "epoch": 1.3931473753569075, + "grad_norm": 2.0391786098480225, + "learning_rate": 5e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7372414469718933, + "num_tokens": 328344164.0, + "step": 12686 + }, + { + "epoch": 1.3932571930595212, + "grad_norm": 1.8266056776046753, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7373498678207397, + "num_tokens": 328372588.0, + "step": 12687 + }, + { + "epoch": 1.3933670107621348, + "grad_norm": 2.015726327896118, + "learning_rate": 5e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7306604385375977, + "num_tokens": 328396647.0, + "step": 12688 + }, + { + "epoch": 1.3934768284647485, + "grad_norm": 1.8877288103103638, + "learning_rate": 5e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7269315719604492, + "num_tokens": 328422750.0, + "step": 12689 + }, + { + "epoch": 1.3935866461673623, + "grad_norm": 2.05534029006958, + "learning_rate": 5e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7394039630889893, + "num_tokens": 328445758.0, + "step": 12690 + }, + { + "epoch": 1.3936964638699758, + "grad_norm": 1.8656690120697021, + "learning_rate": 5e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7077478170394897, + "num_tokens": 328472264.0, + "step": 12691 + }, + { + "epoch": 1.3938062815725896, + "grad_norm": 1.830907940864563, + "learning_rate": 5e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7112180590629578, + "num_tokens": 328499949.0, + "step": 12692 + }, + { + "epoch": 1.393916099275203, + "grad_norm": 1.8263970613479614, + "learning_rate": 5e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7315082550048828, + "num_tokens": 328527062.0, + "step": 12693 + }, + { + "epoch": 1.3940259169778169, + "grad_norm": 1.7677165269851685, + "learning_rate": 5e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7473018169403076, + "num_tokens": 328553912.0, + "step": 12694 + }, + { + "epoch": 1.3941357346804306, + "grad_norm": 1.9623786211013794, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7393628358840942, + "num_tokens": 328578800.0, + "step": 12695 + }, + { + "epoch": 1.3942455523830442, + "grad_norm": 2.013673782348633, + "learning_rate": 5e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7467385530471802, + "num_tokens": 328599927.0, + "step": 12696 + }, + { + "epoch": 1.3943553700856577, + "grad_norm": 1.9939152002334595, + "learning_rate": 5e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7422910928726196, + "num_tokens": 328623333.0, + "step": 12697 + }, + { + "epoch": 1.3944651877882714, + "grad_norm": 1.6222723722457886, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7117184996604919, + "num_tokens": 328656044.0, + "step": 12698 + }, + { + "epoch": 1.3945750054908852, + "grad_norm": 1.7562761306762695, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.719694197177887, + "num_tokens": 328684809.0, + "step": 12699 + }, + { + "epoch": 1.3946848231934987, + "grad_norm": 2.1147024631500244, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7300002574920654, + "num_tokens": 328707523.0, + "step": 12700 + }, + { + "epoch": 1.3947946408961125, + "grad_norm": 1.83424973487854, + "learning_rate": 5e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.6998454928398132, + "num_tokens": 328735946.0, + "step": 12701 + }, + { + "epoch": 1.394904458598726, + "grad_norm": 1.8928574323654175, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7337703108787537, + "num_tokens": 328763239.0, + "step": 12702 + }, + { + "epoch": 1.3950142763013398, + "grad_norm": 2.102003335952759, + "learning_rate": 5e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7427330613136292, + "num_tokens": 328783972.0, + "step": 12703 + }, + { + "epoch": 1.3951240940039535, + "grad_norm": 1.7555527687072754, + "learning_rate": 5e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7368912100791931, + "num_tokens": 328811976.0, + "step": 12704 + }, + { + "epoch": 1.395233911706567, + "grad_norm": 2.0373082160949707, + "learning_rate": 5e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7125125527381897, + "num_tokens": 328837334.0, + "step": 12705 + }, + { + "epoch": 1.3953437294091808, + "grad_norm": 2.207056760787964, + "learning_rate": 5e-06, + "loss": 0.713, + "mean_token_accuracy": 0.766187846660614, + "num_tokens": 328854108.0, + "step": 12706 + }, + { + "epoch": 1.3954535471117944, + "grad_norm": 2.0381879806518555, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7278028726577759, + "num_tokens": 328875976.0, + "step": 12707 + }, + { + "epoch": 1.3955633648144081, + "grad_norm": 2.159667730331421, + "learning_rate": 5e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7534347772598267, + "num_tokens": 328895882.0, + "step": 12708 + }, + { + "epoch": 1.3956731825170219, + "grad_norm": 1.8499720096588135, + "learning_rate": 5e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7360490560531616, + "num_tokens": 328920609.0, + "step": 12709 + }, + { + "epoch": 1.3957830002196354, + "grad_norm": 1.9667671918869019, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7276744842529297, + "num_tokens": 328943409.0, + "step": 12710 + }, + { + "epoch": 1.395892817922249, + "grad_norm": 2.1340696811676025, + "learning_rate": 5e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7457579374313354, + "num_tokens": 328963996.0, + "step": 12711 + }, + { + "epoch": 1.3960026356248627, + "grad_norm": 1.8574204444885254, + "learning_rate": 5e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.749820351600647, + "num_tokens": 328989157.0, + "step": 12712 + }, + { + "epoch": 1.3961124533274765, + "grad_norm": 1.7244164943695068, + "learning_rate": 5e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7401412129402161, + "num_tokens": 329019488.0, + "step": 12713 + }, + { + "epoch": 1.39622227103009, + "grad_norm": 2.269083261489868, + "learning_rate": 5e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7166422605514526, + "num_tokens": 329041062.0, + "step": 12714 + }, + { + "epoch": 1.3963320887327038, + "grad_norm": 2.0362308025360107, + "learning_rate": 5e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7238789796829224, + "num_tokens": 329065080.0, + "step": 12715 + }, + { + "epoch": 1.3964419064353173, + "grad_norm": 1.8449536561965942, + "learning_rate": 5e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7400492429733276, + "num_tokens": 329089795.0, + "step": 12716 + }, + { + "epoch": 1.396551724137931, + "grad_norm": 1.7640081644058228, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7111014723777771, + "num_tokens": 329118587.0, + "step": 12717 + }, + { + "epoch": 1.3966615418405448, + "grad_norm": 1.748401165008545, + "learning_rate": 5e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7273055911064148, + "num_tokens": 329147312.0, + "step": 12718 + }, + { + "epoch": 1.3967713595431583, + "grad_norm": 2.0157318115234375, + "learning_rate": 5e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.733439028263092, + "num_tokens": 329169702.0, + "step": 12719 + }, + { + "epoch": 1.396881177245772, + "grad_norm": 1.6484521627426147, + "learning_rate": 5e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.738505482673645, + "num_tokens": 329199123.0, + "step": 12720 + }, + { + "epoch": 1.3969909949483856, + "grad_norm": 1.8497545719146729, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.721579372882843, + "num_tokens": 329227387.0, + "step": 12721 + }, + { + "epoch": 1.3971008126509994, + "grad_norm": 2.0207889080047607, + "learning_rate": 5e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7125244736671448, + "num_tokens": 329251844.0, + "step": 12722 + }, + { + "epoch": 1.397210630353613, + "grad_norm": 2.020303964614868, + "learning_rate": 5e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.741515576839447, + "num_tokens": 329272358.0, + "step": 12723 + }, + { + "epoch": 1.3973204480562267, + "grad_norm": 2.1143229007720947, + "learning_rate": 5e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7279607057571411, + "num_tokens": 329294740.0, + "step": 12724 + }, + { + "epoch": 1.3974302657588402, + "grad_norm": 1.8699769973754883, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7365169525146484, + "num_tokens": 329318435.0, + "step": 12725 + }, + { + "epoch": 1.397540083461454, + "grad_norm": 1.9266752004623413, + "learning_rate": 5e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7468171715736389, + "num_tokens": 329343144.0, + "step": 12726 + }, + { + "epoch": 1.3976499011640677, + "grad_norm": 1.9280248880386353, + "learning_rate": 5e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7264262437820435, + "num_tokens": 329366642.0, + "step": 12727 + }, + { + "epoch": 1.3977597188666813, + "grad_norm": 1.6698811054229736, + "learning_rate": 5e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7213051915168762, + "num_tokens": 329399846.0, + "step": 12728 + }, + { + "epoch": 1.397869536569295, + "grad_norm": 1.841022253036499, + "learning_rate": 5e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.731147289276123, + "num_tokens": 329426578.0, + "step": 12729 + }, + { + "epoch": 1.3979793542719086, + "grad_norm": 1.5767135620117188, + "learning_rate": 5e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7193964719772339, + "num_tokens": 329462760.0, + "step": 12730 + }, + { + "epoch": 1.3980891719745223, + "grad_norm": 1.9502434730529785, + "learning_rate": 5e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7198585867881775, + "num_tokens": 329489671.0, + "step": 12731 + }, + { + "epoch": 1.398198989677136, + "grad_norm": 1.907979965209961, + "learning_rate": 5e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7480687499046326, + "num_tokens": 329514756.0, + "step": 12732 + }, + { + "epoch": 1.3983088073797496, + "grad_norm": 1.6125794649124146, + "learning_rate": 5e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7162604331970215, + "num_tokens": 329548220.0, + "step": 12733 + }, + { + "epoch": 1.3984186250823631, + "grad_norm": 1.8565106391906738, + "learning_rate": 5e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.737704873085022, + "num_tokens": 329574673.0, + "step": 12734 + }, + { + "epoch": 1.398528442784977, + "grad_norm": 1.807050347328186, + "learning_rate": 5e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7202998399734497, + "num_tokens": 329602255.0, + "step": 12735 + }, + { + "epoch": 1.3986382604875907, + "grad_norm": 1.6463018655776978, + "learning_rate": 5e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7404839396476746, + "num_tokens": 329634919.0, + "step": 12736 + }, + { + "epoch": 1.3987480781902042, + "grad_norm": 2.04358172416687, + "learning_rate": 5e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7229217290878296, + "num_tokens": 329657669.0, + "step": 12737 + }, + { + "epoch": 1.398857895892818, + "grad_norm": 1.7707496881484985, + "learning_rate": 5e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7382506728172302, + "num_tokens": 329686505.0, + "step": 12738 + }, + { + "epoch": 1.3989677135954315, + "grad_norm": 1.7672686576843262, + "learning_rate": 5e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7339319586753845, + "num_tokens": 329712722.0, + "step": 12739 + }, + { + "epoch": 1.3990775312980452, + "grad_norm": 1.8808550834655762, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7276188731193542, + "num_tokens": 329737562.0, + "step": 12740 + }, + { + "epoch": 1.399187349000659, + "grad_norm": 1.978363275527954, + "learning_rate": 5e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7345839738845825, + "num_tokens": 329760591.0, + "step": 12741 + }, + { + "epoch": 1.3992971667032725, + "grad_norm": 1.7862474918365479, + "learning_rate": 5e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7377060055732727, + "num_tokens": 329788131.0, + "step": 12742 + }, + { + "epoch": 1.3994069844058863, + "grad_norm": 1.6489477157592773, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7420653700828552, + "num_tokens": 329818754.0, + "step": 12743 + }, + { + "epoch": 1.3995168021084998, + "grad_norm": 1.9938814640045166, + "learning_rate": 5e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7487403154373169, + "num_tokens": 329839108.0, + "step": 12744 + }, + { + "epoch": 1.3996266198111136, + "grad_norm": 2.0453908443450928, + "learning_rate": 5e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.752601146697998, + "num_tokens": 329860027.0, + "step": 12745 + }, + { + "epoch": 1.3997364375137273, + "grad_norm": 1.9025272130966187, + "learning_rate": 5e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7179105877876282, + "num_tokens": 329887398.0, + "step": 12746 + }, + { + "epoch": 1.3998462552163409, + "grad_norm": 1.954416275024414, + "learning_rate": 5e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7216130495071411, + "num_tokens": 329911914.0, + "step": 12747 + }, + { + "epoch": 1.3999560729189544, + "grad_norm": 1.943069577217102, + "learning_rate": 5e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7246878743171692, + "num_tokens": 329936363.0, + "step": 12748 + }, + { + "epoch": 1.4000658906215682, + "grad_norm": 1.7646859884262085, + "learning_rate": 5e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7174584865570068, + "num_tokens": 329962503.0, + "step": 12749 + }, + { + "epoch": 1.400175708324182, + "grad_norm": 1.5958058834075928, + "learning_rate": 5e-06, + "loss": 0.928, + "mean_token_accuracy": 0.710432767868042, + "num_tokens": 329998957.0, + "step": 12750 + }, + { + "epoch": 1.4002855260267955, + "grad_norm": 1.8352102041244507, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7436491250991821, + "num_tokens": 330021284.0, + "step": 12751 + }, + { + "epoch": 1.4003953437294092, + "grad_norm": 1.98280668258667, + "learning_rate": 5e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7399383187294006, + "num_tokens": 330045334.0, + "step": 12752 + }, + { + "epoch": 1.4005051614320227, + "grad_norm": 1.946035385131836, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7269342541694641, + "num_tokens": 330068449.0, + "step": 12753 + }, + { + "epoch": 1.4006149791346365, + "grad_norm": 1.819034457206726, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7328377962112427, + "num_tokens": 330093436.0, + "step": 12754 + }, + { + "epoch": 1.4007247968372503, + "grad_norm": 1.9562186002731323, + "learning_rate": 5e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7418888211250305, + "num_tokens": 330118186.0, + "step": 12755 + }, + { + "epoch": 1.4008346145398638, + "grad_norm": 1.700736165046692, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.739940881729126, + "num_tokens": 330150459.0, + "step": 12756 + }, + { + "epoch": 1.4009444322424776, + "grad_norm": 1.8155548572540283, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7308793663978577, + "num_tokens": 330178026.0, + "step": 12757 + }, + { + "epoch": 1.401054249945091, + "grad_norm": 2.0087296962738037, + "learning_rate": 5e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.736598551273346, + "num_tokens": 330200408.0, + "step": 12758 + }, + { + "epoch": 1.4011640676477048, + "grad_norm": 1.7363793849945068, + "learning_rate": 5e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7279399633407593, + "num_tokens": 330231567.0, + "step": 12759 + }, + { + "epoch": 1.4012738853503186, + "grad_norm": 1.9166438579559326, + "learning_rate": 5e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7600642442703247, + "num_tokens": 330255722.0, + "step": 12760 + }, + { + "epoch": 1.4013837030529321, + "grad_norm": 1.9240646362304688, + "learning_rate": 5e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7411112189292908, + "num_tokens": 330281718.0, + "step": 12761 + }, + { + "epoch": 1.4014935207555457, + "grad_norm": 1.857775330543518, + "learning_rate": 5e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7290040254592896, + "num_tokens": 330307967.0, + "step": 12762 + }, + { + "epoch": 1.4016033384581594, + "grad_norm": 1.997825264930725, + "learning_rate": 5e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7155165672302246, + "num_tokens": 330333902.0, + "step": 12763 + }, + { + "epoch": 1.4017131561607732, + "grad_norm": 1.7536444664001465, + "learning_rate": 5e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7438716888427734, + "num_tokens": 330362859.0, + "step": 12764 + }, + { + "epoch": 1.4018229738633867, + "grad_norm": 1.7067461013793945, + "learning_rate": 5e-06, + "loss": 0.7368, + "mean_token_accuracy": 0.7549861669540405, + "num_tokens": 330390677.0, + "step": 12765 + }, + { + "epoch": 1.4019327915660005, + "grad_norm": 1.8087071180343628, + "learning_rate": 5e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7355695962905884, + "num_tokens": 330417015.0, + "step": 12766 + }, + { + "epoch": 1.402042609268614, + "grad_norm": 2.014799118041992, + "learning_rate": 5e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7447179555892944, + "num_tokens": 330437670.0, + "step": 12767 + }, + { + "epoch": 1.4021524269712278, + "grad_norm": 1.694630742073059, + "learning_rate": 5e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7361456751823425, + "num_tokens": 330464312.0, + "step": 12768 + }, + { + "epoch": 1.4022622446738415, + "grad_norm": 1.8641542196273804, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7446424961090088, + "num_tokens": 330488995.0, + "step": 12769 + }, + { + "epoch": 1.402372062376455, + "grad_norm": 1.7084259986877441, + "learning_rate": 5e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.725975751876831, + "num_tokens": 330521018.0, + "step": 12770 + }, + { + "epoch": 1.4024818800790688, + "grad_norm": 1.7905246019363403, + "learning_rate": 5e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7247724533081055, + "num_tokens": 330547156.0, + "step": 12771 + }, + { + "epoch": 1.4025916977816824, + "grad_norm": 1.745666742324829, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7379238605499268, + "num_tokens": 330575930.0, + "step": 12772 + }, + { + "epoch": 1.4027015154842961, + "grad_norm": 1.8853260278701782, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7505512237548828, + "num_tokens": 330601375.0, + "step": 12773 + }, + { + "epoch": 1.4028113331869096, + "grad_norm": 1.7642043828964233, + "learning_rate": 5e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7261437773704529, + "num_tokens": 330630723.0, + "step": 12774 + }, + { + "epoch": 1.4029211508895234, + "grad_norm": 1.9121112823486328, + "learning_rate": 5e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7430949807167053, + "num_tokens": 330653767.0, + "step": 12775 + }, + { + "epoch": 1.403030968592137, + "grad_norm": 2.0377585887908936, + "learning_rate": 5e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7404149770736694, + "num_tokens": 330676071.0, + "step": 12776 + }, + { + "epoch": 1.4031407862947507, + "grad_norm": 1.758692741394043, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7495860457420349, + "num_tokens": 330704925.0, + "step": 12777 + }, + { + "epoch": 1.4032506039973645, + "grad_norm": 1.9231277704238892, + "learning_rate": 5e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7399691343307495, + "num_tokens": 330729550.0, + "step": 12778 + }, + { + "epoch": 1.403360421699978, + "grad_norm": 1.901401162147522, + "learning_rate": 5e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7313368916511536, + "num_tokens": 330752888.0, + "step": 12779 + }, + { + "epoch": 1.4034702394025917, + "grad_norm": 1.7826169729232788, + "learning_rate": 5e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7421700358390808, + "num_tokens": 330779396.0, + "step": 12780 + }, + { + "epoch": 1.4035800571052053, + "grad_norm": 1.7634297609329224, + "learning_rate": 5e-06, + "loss": 0.964, + "mean_token_accuracy": 0.6982788443565369, + "num_tokens": 330806988.0, + "step": 12781 + }, + { + "epoch": 1.403689874807819, + "grad_norm": 1.7597657442092896, + "learning_rate": 5e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7296094298362732, + "num_tokens": 330836082.0, + "step": 12782 + }, + { + "epoch": 1.4037996925104328, + "grad_norm": 1.6749211549758911, + "learning_rate": 5e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7339056730270386, + "num_tokens": 330867745.0, + "step": 12783 + }, + { + "epoch": 1.4039095102130463, + "grad_norm": 1.7691470384597778, + "learning_rate": 5e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7322864532470703, + "num_tokens": 330898478.0, + "step": 12784 + }, + { + "epoch": 1.40401932791566, + "grad_norm": 1.8453131914138794, + "learning_rate": 5e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7350002527236938, + "num_tokens": 330926144.0, + "step": 12785 + }, + { + "epoch": 1.4041291456182736, + "grad_norm": 1.9427921772003174, + "learning_rate": 5e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7336030602455139, + "num_tokens": 330949483.0, + "step": 12786 + }, + { + "epoch": 1.4042389633208874, + "grad_norm": 1.8655353784561157, + "learning_rate": 5e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.722718358039856, + "num_tokens": 330976238.0, + "step": 12787 + }, + { + "epoch": 1.404348781023501, + "grad_norm": 1.8477160930633545, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7381404042243958, + "num_tokens": 331001380.0, + "step": 12788 + }, + { + "epoch": 1.4044585987261147, + "grad_norm": 1.9378087520599365, + "learning_rate": 5e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7455247640609741, + "num_tokens": 331023797.0, + "step": 12789 + }, + { + "epoch": 1.4045684164287282, + "grad_norm": 1.9000202417373657, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7309300303459167, + "num_tokens": 331047938.0, + "step": 12790 + }, + { + "epoch": 1.404678234131342, + "grad_norm": 2.010481595993042, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7299115657806396, + "num_tokens": 331070583.0, + "step": 12791 + }, + { + "epoch": 1.4047880518339557, + "grad_norm": 2.0820302963256836, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7269112467765808, + "num_tokens": 331093258.0, + "step": 12792 + }, + { + "epoch": 1.4048978695365693, + "grad_norm": 2.0211474895477295, + "learning_rate": 5e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7460398077964783, + "num_tokens": 331113729.0, + "step": 12793 + }, + { + "epoch": 1.405007687239183, + "grad_norm": 1.7809468507766724, + "learning_rate": 5e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.7526665925979614, + "num_tokens": 331138044.0, + "step": 12794 + }, + { + "epoch": 1.4051175049417965, + "grad_norm": 1.8330413103103638, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.709403932094574, + "num_tokens": 331166418.0, + "step": 12795 + }, + { + "epoch": 1.4052273226444103, + "grad_norm": 1.745097279548645, + "learning_rate": 5e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7076547145843506, + "num_tokens": 331196150.0, + "step": 12796 + }, + { + "epoch": 1.405337140347024, + "grad_norm": 2.0366134643554688, + "learning_rate": 5e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7353670597076416, + "num_tokens": 331222471.0, + "step": 12797 + }, + { + "epoch": 1.4054469580496376, + "grad_norm": 1.878157615661621, + "learning_rate": 5e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7585320472717285, + "num_tokens": 331244929.0, + "step": 12798 + }, + { + "epoch": 1.4055567757522511, + "grad_norm": 1.9575119018554688, + "learning_rate": 5e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7344205975532532, + "num_tokens": 331267478.0, + "step": 12799 + }, + { + "epoch": 1.4056665934548649, + "grad_norm": 1.8562037944793701, + "learning_rate": 5e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7416500449180603, + "num_tokens": 331293315.0, + "step": 12800 + }, + { + "epoch": 1.4057764111574786, + "grad_norm": 2.18869948387146, + "learning_rate": 5e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7128784656524658, + "num_tokens": 331314198.0, + "step": 12801 + }, + { + "epoch": 1.4058862288600922, + "grad_norm": 2.0580689907073975, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7306523323059082, + "num_tokens": 331338147.0, + "step": 12802 + }, + { + "epoch": 1.405996046562706, + "grad_norm": 1.863682508468628, + "learning_rate": 5e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.737832248210907, + "num_tokens": 331364101.0, + "step": 12803 + }, + { + "epoch": 1.4061058642653195, + "grad_norm": 1.9031575918197632, + "learning_rate": 5e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7197481989860535, + "num_tokens": 331390644.0, + "step": 12804 + }, + { + "epoch": 1.4062156819679332, + "grad_norm": 1.8376109600067139, + "learning_rate": 5e-06, + "loss": 0.836, + "mean_token_accuracy": 0.733394980430603, + "num_tokens": 331416246.0, + "step": 12805 + }, + { + "epoch": 1.406325499670547, + "grad_norm": 1.7628389596939087, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7317478656768799, + "num_tokens": 331446751.0, + "step": 12806 + }, + { + "epoch": 1.4064353173731605, + "grad_norm": 2.0347304344177246, + "learning_rate": 5e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7199455499649048, + "num_tokens": 331468853.0, + "step": 12807 + }, + { + "epoch": 1.4065451350757743, + "grad_norm": 1.7089096307754517, + "learning_rate": 5e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7364323139190674, + "num_tokens": 331497448.0, + "step": 12808 + }, + { + "epoch": 1.4066549527783878, + "grad_norm": 1.807830810546875, + "learning_rate": 5e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7553547620773315, + "num_tokens": 331521627.0, + "step": 12809 + }, + { + "epoch": 1.4067647704810016, + "grad_norm": 2.1735265254974365, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7138680219650269, + "num_tokens": 331544071.0, + "step": 12810 + }, + { + "epoch": 1.4068745881836153, + "grad_norm": 1.8691364526748657, + "learning_rate": 5e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.742045521736145, + "num_tokens": 331569665.0, + "step": 12811 + }, + { + "epoch": 1.4069844058862289, + "grad_norm": 1.7243754863739014, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7496861219406128, + "num_tokens": 331595860.0, + "step": 12812 + }, + { + "epoch": 1.4070942235888424, + "grad_norm": 2.015362024307251, + "learning_rate": 5e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.742591142654419, + "num_tokens": 331618159.0, + "step": 12813 + }, + { + "epoch": 1.4072040412914562, + "grad_norm": 1.7750351428985596, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7429783344268799, + "num_tokens": 331643527.0, + "step": 12814 + }, + { + "epoch": 1.40731385899407, + "grad_norm": 1.8443018198013306, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7455909252166748, + "num_tokens": 331669243.0, + "step": 12815 + }, + { + "epoch": 1.4074236766966834, + "grad_norm": 1.9146020412445068, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7322683334350586, + "num_tokens": 331693222.0, + "step": 12816 + }, + { + "epoch": 1.4075334943992972, + "grad_norm": 1.9070771932601929, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7355223298072815, + "num_tokens": 331716791.0, + "step": 12817 + }, + { + "epoch": 1.4076433121019107, + "grad_norm": 2.217700719833374, + "learning_rate": 5e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7592334151268005, + "num_tokens": 331734982.0, + "step": 12818 + }, + { + "epoch": 1.4077531298045245, + "grad_norm": 1.8888884782791138, + "learning_rate": 5e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7478795051574707, + "num_tokens": 331760627.0, + "step": 12819 + }, + { + "epoch": 1.4078629475071383, + "grad_norm": 1.8178517818450928, + "learning_rate": 5e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7058305144309998, + "num_tokens": 331789884.0, + "step": 12820 + }, + { + "epoch": 1.4079727652097518, + "grad_norm": 1.8193938732147217, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7382920384407043, + "num_tokens": 331816526.0, + "step": 12821 + }, + { + "epoch": 1.4080825829123655, + "grad_norm": 1.8548685312271118, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7251667380332947, + "num_tokens": 331843384.0, + "step": 12822 + }, + { + "epoch": 1.408192400614979, + "grad_norm": 1.9336806535720825, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7614910006523132, + "num_tokens": 331868532.0, + "step": 12823 + }, + { + "epoch": 1.4083022183175928, + "grad_norm": 1.8253635168075562, + "learning_rate": 5e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7315003871917725, + "num_tokens": 331895340.0, + "step": 12824 + }, + { + "epoch": 1.4084120360202066, + "grad_norm": 1.8729063272476196, + "learning_rate": 5e-06, + "loss": 0.872, + "mean_token_accuracy": 0.725250780582428, + "num_tokens": 331922770.0, + "step": 12825 + }, + { + "epoch": 1.4085218537228201, + "grad_norm": 2.107194185256958, + "learning_rate": 5e-06, + "loss": 0.79, + "mean_token_accuracy": 0.7439290285110474, + "num_tokens": 331942915.0, + "step": 12826 + }, + { + "epoch": 1.4086316714254337, + "grad_norm": 1.6675846576690674, + "learning_rate": 5e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7327033281326294, + "num_tokens": 331973293.0, + "step": 12827 + }, + { + "epoch": 1.4087414891280474, + "grad_norm": 1.9039716720581055, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7317624688148499, + "num_tokens": 331997790.0, + "step": 12828 + }, + { + "epoch": 1.4088513068306612, + "grad_norm": 1.9003682136535645, + "learning_rate": 5e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7417474389076233, + "num_tokens": 332020714.0, + "step": 12829 + }, + { + "epoch": 1.4089611245332747, + "grad_norm": 1.607806921005249, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7121133804321289, + "num_tokens": 332056516.0, + "step": 12830 + }, + { + "epoch": 1.4090709422358885, + "grad_norm": 1.9297149181365967, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7269124984741211, + "num_tokens": 332082470.0, + "step": 12831 + }, + { + "epoch": 1.409180759938502, + "grad_norm": 1.7954461574554443, + "learning_rate": 5e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7185046076774597, + "num_tokens": 332110218.0, + "step": 12832 + }, + { + "epoch": 1.4092905776411158, + "grad_norm": 1.713942050933838, + "learning_rate": 5e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7127071619033813, + "num_tokens": 332139730.0, + "step": 12833 + }, + { + "epoch": 1.4094003953437295, + "grad_norm": 1.8917865753173828, + "learning_rate": 5e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7257652878761292, + "num_tokens": 332165837.0, + "step": 12834 + }, + { + "epoch": 1.409510213046343, + "grad_norm": 1.699862003326416, + "learning_rate": 5e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7569593191146851, + "num_tokens": 332193387.0, + "step": 12835 + }, + { + "epoch": 1.4096200307489568, + "grad_norm": 2.0161240100860596, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7384762167930603, + "num_tokens": 332215387.0, + "step": 12836 + }, + { + "epoch": 1.4097298484515703, + "grad_norm": 1.7186081409454346, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7475094795227051, + "num_tokens": 332241723.0, + "step": 12837 + }, + { + "epoch": 1.409839666154184, + "grad_norm": 1.835902214050293, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7175761461257935, + "num_tokens": 332266887.0, + "step": 12838 + }, + { + "epoch": 1.4099494838567976, + "grad_norm": 1.8847233057022095, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.759260892868042, + "num_tokens": 332290012.0, + "step": 12839 + }, + { + "epoch": 1.4100593015594114, + "grad_norm": 1.808693289756775, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7544867992401123, + "num_tokens": 332314021.0, + "step": 12840 + }, + { + "epoch": 1.410169119262025, + "grad_norm": 1.8167850971221924, + "learning_rate": 5e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7236728668212891, + "num_tokens": 332339840.0, + "step": 12841 + }, + { + "epoch": 1.4102789369646387, + "grad_norm": 1.7578322887420654, + "learning_rate": 5e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7287418842315674, + "num_tokens": 332370079.0, + "step": 12842 + }, + { + "epoch": 1.4103887546672524, + "grad_norm": 1.9887197017669678, + "learning_rate": 5e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7311979532241821, + "num_tokens": 332395000.0, + "step": 12843 + }, + { + "epoch": 1.410498572369866, + "grad_norm": 1.69084632396698, + "learning_rate": 5e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.740687370300293, + "num_tokens": 332425061.0, + "step": 12844 + }, + { + "epoch": 1.4106083900724797, + "grad_norm": 2.0252087116241455, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7721767425537109, + "num_tokens": 332445258.0, + "step": 12845 + }, + { + "epoch": 1.4107182077750933, + "grad_norm": 1.7303673028945923, + "learning_rate": 5e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7124965786933899, + "num_tokens": 332474147.0, + "step": 12846 + }, + { + "epoch": 1.410828025477707, + "grad_norm": 1.6898382902145386, + "learning_rate": 5e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7174521684646606, + "num_tokens": 332505935.0, + "step": 12847 + }, + { + "epoch": 1.4109378431803208, + "grad_norm": 1.8361960649490356, + "learning_rate": 5e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7340438365936279, + "num_tokens": 332535581.0, + "step": 12848 + }, + { + "epoch": 1.4110476608829343, + "grad_norm": 1.6868987083435059, + "learning_rate": 5e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.745873212814331, + "num_tokens": 332564685.0, + "step": 12849 + }, + { + "epoch": 1.4111574785855479, + "grad_norm": 1.8317204713821411, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7466607093811035, + "num_tokens": 332588380.0, + "step": 12850 + }, + { + "epoch": 1.4112672962881616, + "grad_norm": 1.5759247541427612, + "learning_rate": 5e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7159126996994019, + "num_tokens": 332624729.0, + "step": 12851 + }, + { + "epoch": 1.4113771139907754, + "grad_norm": 1.7063664197921753, + "learning_rate": 5e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7459107041358948, + "num_tokens": 332652422.0, + "step": 12852 + }, + { + "epoch": 1.411486931693389, + "grad_norm": 1.9718670845031738, + "learning_rate": 5e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7522496581077576, + "num_tokens": 332674742.0, + "step": 12853 + }, + { + "epoch": 1.4115967493960027, + "grad_norm": 1.8381654024124146, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7241740226745605, + "num_tokens": 332703166.0, + "step": 12854 + }, + { + "epoch": 1.4117065670986162, + "grad_norm": 1.782117486000061, + "learning_rate": 5e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7286281585693359, + "num_tokens": 332730765.0, + "step": 12855 + }, + { + "epoch": 1.41181638480123, + "grad_norm": 1.677297830581665, + "learning_rate": 5e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7218533754348755, + "num_tokens": 332764521.0, + "step": 12856 + }, + { + "epoch": 1.4119262025038437, + "grad_norm": 1.7554394006729126, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7045035362243652, + "num_tokens": 332792609.0, + "step": 12857 + }, + { + "epoch": 1.4120360202064572, + "grad_norm": 1.9481539726257324, + "learning_rate": 5e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7199273109436035, + "num_tokens": 332817614.0, + "step": 12858 + }, + { + "epoch": 1.412145837909071, + "grad_norm": 1.757855772972107, + "learning_rate": 5e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.7374351620674133, + "num_tokens": 332843457.0, + "step": 12859 + }, + { + "epoch": 1.4122556556116845, + "grad_norm": 1.998724102973938, + "learning_rate": 5e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7311921119689941, + "num_tokens": 332865794.0, + "step": 12860 + }, + { + "epoch": 1.4123654733142983, + "grad_norm": 2.08327054977417, + "learning_rate": 5e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7428566813468933, + "num_tokens": 332885923.0, + "step": 12861 + }, + { + "epoch": 1.412475291016912, + "grad_norm": 1.747159481048584, + "learning_rate": 5e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7321721315383911, + "num_tokens": 332915687.0, + "step": 12862 + }, + { + "epoch": 1.4125851087195256, + "grad_norm": 1.7976839542388916, + "learning_rate": 5e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7205462455749512, + "num_tokens": 332945211.0, + "step": 12863 + }, + { + "epoch": 1.4126949264221391, + "grad_norm": 1.8095263242721558, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7402167320251465, + "num_tokens": 332970802.0, + "step": 12864 + }, + { + "epoch": 1.4128047441247529, + "grad_norm": 1.5785783529281616, + "learning_rate": 5e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7210544943809509, + "num_tokens": 333003813.0, + "step": 12865 + }, + { + "epoch": 1.4129145618273666, + "grad_norm": 2.043701648712158, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7163379788398743, + "num_tokens": 333028243.0, + "step": 12866 + }, + { + "epoch": 1.4130243795299802, + "grad_norm": 1.8061158657073975, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7355836033821106, + "num_tokens": 333054030.0, + "step": 12867 + }, + { + "epoch": 1.413134197232594, + "grad_norm": 1.823968768119812, + "learning_rate": 5e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7462469339370728, + "num_tokens": 333081051.0, + "step": 12868 + }, + { + "epoch": 1.4132440149352075, + "grad_norm": 1.8331977128982544, + "learning_rate": 5e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7380682229995728, + "num_tokens": 333107833.0, + "step": 12869 + }, + { + "epoch": 1.4133538326378212, + "grad_norm": 1.9931097030639648, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7259584665298462, + "num_tokens": 333131414.0, + "step": 12870 + }, + { + "epoch": 1.413463650340435, + "grad_norm": 2.192284107208252, + "learning_rate": 5e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7440456748008728, + "num_tokens": 333150847.0, + "step": 12871 + }, + { + "epoch": 1.4135734680430485, + "grad_norm": 1.8456882238388062, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7511759996414185, + "num_tokens": 333173680.0, + "step": 12872 + }, + { + "epoch": 1.4136832857456623, + "grad_norm": 1.8947445154190063, + "learning_rate": 5e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7260158061981201, + "num_tokens": 333198074.0, + "step": 12873 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 2.0991146564483643, + "learning_rate": 5e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7376948595046997, + "num_tokens": 333217717.0, + "step": 12874 + }, + { + "epoch": 1.4139029211508896, + "grad_norm": 1.8948742151260376, + "learning_rate": 5e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7506340742111206, + "num_tokens": 333242557.0, + "step": 12875 + }, + { + "epoch": 1.4140127388535033, + "grad_norm": 1.7764033079147339, + "learning_rate": 5e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7403039932250977, + "num_tokens": 333271406.0, + "step": 12876 + }, + { + "epoch": 1.4141225565561168, + "grad_norm": 2.107264280319214, + "learning_rate": 5e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7406986355781555, + "num_tokens": 333293137.0, + "step": 12877 + }, + { + "epoch": 1.4142323742587304, + "grad_norm": 1.9279435873031616, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7339816093444824, + "num_tokens": 333316733.0, + "step": 12878 + }, + { + "epoch": 1.4143421919613441, + "grad_norm": 2.0537734031677246, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7276438474655151, + "num_tokens": 333338336.0, + "step": 12879 + }, + { + "epoch": 1.414452009663958, + "grad_norm": 1.7818564176559448, + "learning_rate": 5e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7266191244125366, + "num_tokens": 333366678.0, + "step": 12880 + }, + { + "epoch": 1.4145618273665714, + "grad_norm": 2.063817024230957, + "learning_rate": 5e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7477709650993347, + "num_tokens": 333387197.0, + "step": 12881 + }, + { + "epoch": 1.4146716450691852, + "grad_norm": 2.23576021194458, + "learning_rate": 5e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.7481502294540405, + "num_tokens": 333405526.0, + "step": 12882 + }, + { + "epoch": 1.4147814627717987, + "grad_norm": 1.828657627105713, + "learning_rate": 5e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7271536588668823, + "num_tokens": 333432750.0, + "step": 12883 + }, + { + "epoch": 1.4148912804744125, + "grad_norm": 2.048124313354492, + "learning_rate": 5e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7549000382423401, + "num_tokens": 333454221.0, + "step": 12884 + }, + { + "epoch": 1.4150010981770262, + "grad_norm": 1.9959945678710938, + "learning_rate": 5e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7371414303779602, + "num_tokens": 333477776.0, + "step": 12885 + }, + { + "epoch": 1.4151109158796398, + "grad_norm": 1.8268109560012817, + "learning_rate": 5e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7216610908508301, + "num_tokens": 333504504.0, + "step": 12886 + }, + { + "epoch": 1.4152207335822535, + "grad_norm": 1.9136070013046265, + "learning_rate": 5e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7170168161392212, + "num_tokens": 333532146.0, + "step": 12887 + }, + { + "epoch": 1.415330551284867, + "grad_norm": 1.529117465019226, + "learning_rate": 5e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7169135212898254, + "num_tokens": 333565470.0, + "step": 12888 + }, + { + "epoch": 1.4154403689874808, + "grad_norm": 1.772132158279419, + "learning_rate": 5e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7182233929634094, + "num_tokens": 333593636.0, + "step": 12889 + }, + { + "epoch": 1.4155501866900946, + "grad_norm": 1.6823352575302124, + "learning_rate": 5e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7349435091018677, + "num_tokens": 333624128.0, + "step": 12890 + }, + { + "epoch": 1.4156600043927081, + "grad_norm": 1.6990149021148682, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.729207456111908, + "num_tokens": 333652414.0, + "step": 12891 + }, + { + "epoch": 1.4157698220953217, + "grad_norm": 1.9009864330291748, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7244857549667358, + "num_tokens": 333676860.0, + "step": 12892 + }, + { + "epoch": 1.4158796397979354, + "grad_norm": 1.9035049676895142, + "learning_rate": 5e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7301575541496277, + "num_tokens": 333700231.0, + "step": 12893 + }, + { + "epoch": 1.4159894575005492, + "grad_norm": 1.838009238243103, + "learning_rate": 5e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7267618179321289, + "num_tokens": 333728700.0, + "step": 12894 + }, + { + "epoch": 1.4160992752031627, + "grad_norm": 1.9406262636184692, + "learning_rate": 5e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7263780832290649, + "num_tokens": 333757005.0, + "step": 12895 + }, + { + "epoch": 1.4162090929057765, + "grad_norm": 1.949945092201233, + "learning_rate": 5e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.720313549041748, + "num_tokens": 333782371.0, + "step": 12896 + }, + { + "epoch": 1.41631891060839, + "grad_norm": 1.9988579750061035, + "learning_rate": 5e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7174508571624756, + "num_tokens": 333806159.0, + "step": 12897 + }, + { + "epoch": 1.4164287283110037, + "grad_norm": 1.8452179431915283, + "learning_rate": 5e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.714005708694458, + "num_tokens": 333833199.0, + "step": 12898 + }, + { + "epoch": 1.4165385460136175, + "grad_norm": 1.6770970821380615, + "learning_rate": 5e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7188299894332886, + "num_tokens": 333866882.0, + "step": 12899 + }, + { + "epoch": 1.416648363716231, + "grad_norm": 1.7664768695831299, + "learning_rate": 5e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7208291292190552, + "num_tokens": 333896344.0, + "step": 12900 + }, + { + "epoch": 1.4167581814188448, + "grad_norm": 2.070953845977783, + "learning_rate": 5e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.738179087638855, + "num_tokens": 333917111.0, + "step": 12901 + }, + { + "epoch": 1.4168679991214583, + "grad_norm": 1.8799223899841309, + "learning_rate": 5e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.6948346495628357, + "num_tokens": 333945943.0, + "step": 12902 + }, + { + "epoch": 1.416977816824072, + "grad_norm": 1.8183282613754272, + "learning_rate": 5e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7276362776756287, + "num_tokens": 333972611.0, + "step": 12903 + }, + { + "epoch": 1.4170876345266856, + "grad_norm": 2.102832078933716, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7590631246566772, + "num_tokens": 333992962.0, + "step": 12904 + }, + { + "epoch": 1.4171974522292994, + "grad_norm": 1.7293275594711304, + "learning_rate": 5e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7209738492965698, + "num_tokens": 334021769.0, + "step": 12905 + }, + { + "epoch": 1.417307269931913, + "grad_norm": 1.935071349143982, + "learning_rate": 5e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7335338592529297, + "num_tokens": 334046102.0, + "step": 12906 + }, + { + "epoch": 1.4174170876345267, + "grad_norm": 1.935750126838684, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7260370254516602, + "num_tokens": 334069633.0, + "step": 12907 + }, + { + "epoch": 1.4175269053371404, + "grad_norm": 1.8862276077270508, + "learning_rate": 5e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7248584032058716, + "num_tokens": 334093415.0, + "step": 12908 + }, + { + "epoch": 1.417636723039754, + "grad_norm": 1.814353108406067, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7456082105636597, + "num_tokens": 334119456.0, + "step": 12909 + }, + { + "epoch": 1.4177465407423677, + "grad_norm": 1.878259539604187, + "learning_rate": 5e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7080340385437012, + "num_tokens": 334146382.0, + "step": 12910 + }, + { + "epoch": 1.4178563584449813, + "grad_norm": 1.8315691947937012, + "learning_rate": 5e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7425180673599243, + "num_tokens": 334174649.0, + "step": 12911 + }, + { + "epoch": 1.417966176147595, + "grad_norm": 1.8719638586044312, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7326028347015381, + "num_tokens": 334201428.0, + "step": 12912 + }, + { + "epoch": 1.4180759938502088, + "grad_norm": 1.6148203611373901, + "learning_rate": 5e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7261087894439697, + "num_tokens": 334234586.0, + "step": 12913 + }, + { + "epoch": 1.4181858115528223, + "grad_norm": 2.0026063919067383, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.755103588104248, + "num_tokens": 334256070.0, + "step": 12914 + }, + { + "epoch": 1.4182956292554358, + "grad_norm": 2.0007410049438477, + "learning_rate": 5e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7239797115325928, + "num_tokens": 334277936.0, + "step": 12915 + }, + { + "epoch": 1.4184054469580496, + "grad_norm": 2.043936014175415, + "learning_rate": 5e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7254093885421753, + "num_tokens": 334300822.0, + "step": 12916 + }, + { + "epoch": 1.4185152646606634, + "grad_norm": 1.8116241693496704, + "learning_rate": 5e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.745333194732666, + "num_tokens": 334326383.0, + "step": 12917 + }, + { + "epoch": 1.418625082363277, + "grad_norm": 1.8305107355117798, + "learning_rate": 5e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7452013492584229, + "num_tokens": 334350786.0, + "step": 12918 + }, + { + "epoch": 1.4187349000658906, + "grad_norm": 1.7819064855575562, + "learning_rate": 5e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7356476187705994, + "num_tokens": 334374849.0, + "step": 12919 + }, + { + "epoch": 1.4188447177685042, + "grad_norm": 1.9520248174667358, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7174508571624756, + "num_tokens": 334398408.0, + "step": 12920 + }, + { + "epoch": 1.418954535471118, + "grad_norm": 1.8447500467300415, + "learning_rate": 5e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7312408685684204, + "num_tokens": 334424037.0, + "step": 12921 + }, + { + "epoch": 1.4190643531737317, + "grad_norm": 1.8207945823669434, + "learning_rate": 5e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7345209121704102, + "num_tokens": 334449800.0, + "step": 12922 + }, + { + "epoch": 1.4191741708763452, + "grad_norm": 2.291936159133911, + "learning_rate": 5e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7052785754203796, + "num_tokens": 334471133.0, + "step": 12923 + }, + { + "epoch": 1.419283988578959, + "grad_norm": 2.005767822265625, + "learning_rate": 5e-06, + "loss": 0.944, + "mean_token_accuracy": 0.709577739238739, + "num_tokens": 334499350.0, + "step": 12924 + }, + { + "epoch": 1.4193938062815725, + "grad_norm": 1.8854529857635498, + "learning_rate": 5e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7422260642051697, + "num_tokens": 334524286.0, + "step": 12925 + }, + { + "epoch": 1.4195036239841863, + "grad_norm": 1.9315614700317383, + "learning_rate": 5e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7465818524360657, + "num_tokens": 334547161.0, + "step": 12926 + }, + { + "epoch": 1.4196134416868, + "grad_norm": 1.7994658946990967, + "learning_rate": 5e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7220616340637207, + "num_tokens": 334572803.0, + "step": 12927 + }, + { + "epoch": 1.4197232593894136, + "grad_norm": 1.8378618955612183, + "learning_rate": 5e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7205008268356323, + "num_tokens": 334600189.0, + "step": 12928 + }, + { + "epoch": 1.419833077092027, + "grad_norm": 2.0326945781707764, + "learning_rate": 5e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7469851970672607, + "num_tokens": 334620510.0, + "step": 12929 + }, + { + "epoch": 1.4199428947946409, + "grad_norm": 2.0601346492767334, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7219094038009644, + "num_tokens": 334645013.0, + "step": 12930 + }, + { + "epoch": 1.4200527124972546, + "grad_norm": 2.018174409866333, + "learning_rate": 5e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7221041917800903, + "num_tokens": 334668246.0, + "step": 12931 + }, + { + "epoch": 1.4201625301998682, + "grad_norm": 1.6187615394592285, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7241024971008301, + "num_tokens": 334697566.0, + "step": 12932 + }, + { + "epoch": 1.420272347902482, + "grad_norm": 1.8919974565505981, + "learning_rate": 5e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7334206104278564, + "num_tokens": 334720888.0, + "step": 12933 + }, + { + "epoch": 1.4203821656050954, + "grad_norm": 1.98304283618927, + "learning_rate": 5e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7393516302108765, + "num_tokens": 334743433.0, + "step": 12934 + }, + { + "epoch": 1.4204919833077092, + "grad_norm": 1.6173537969589233, + "learning_rate": 5e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.732981264591217, + "num_tokens": 334772795.0, + "step": 12935 + }, + { + "epoch": 1.420601801010323, + "grad_norm": 1.972227931022644, + "learning_rate": 5e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7543527483940125, + "num_tokens": 334793925.0, + "step": 12936 + }, + { + "epoch": 1.4207116187129365, + "grad_norm": 1.9096488952636719, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7327244281768799, + "num_tokens": 334818438.0, + "step": 12937 + }, + { + "epoch": 1.4208214364155503, + "grad_norm": 1.918190598487854, + "learning_rate": 5e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7391043305397034, + "num_tokens": 334843577.0, + "step": 12938 + }, + { + "epoch": 1.4209312541181638, + "grad_norm": 1.9308713674545288, + "learning_rate": 5e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7331697940826416, + "num_tokens": 334868379.0, + "step": 12939 + }, + { + "epoch": 1.4210410718207775, + "grad_norm": 1.8766332864761353, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7351924777030945, + "num_tokens": 334894453.0, + "step": 12940 + }, + { + "epoch": 1.4211508895233913, + "grad_norm": 1.746954083442688, + "learning_rate": 5e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.7515590190887451, + "num_tokens": 334919799.0, + "step": 12941 + }, + { + "epoch": 1.4212607072260048, + "grad_norm": 1.8713529109954834, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7253975868225098, + "num_tokens": 334945103.0, + "step": 12942 + }, + { + "epoch": 1.4213705249286184, + "grad_norm": 1.942293643951416, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7481657266616821, + "num_tokens": 334968227.0, + "step": 12943 + }, + { + "epoch": 1.4214803426312321, + "grad_norm": 1.635744571685791, + "learning_rate": 5e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7486136555671692, + "num_tokens": 334997126.0, + "step": 12944 + }, + { + "epoch": 1.4215901603338459, + "grad_norm": 1.9147601127624512, + "learning_rate": 5e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.754636287689209, + "num_tokens": 335019988.0, + "step": 12945 + }, + { + "epoch": 1.4216999780364594, + "grad_norm": 2.02486515045166, + "learning_rate": 5e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7307363748550415, + "num_tokens": 335043052.0, + "step": 12946 + }, + { + "epoch": 1.4218097957390732, + "grad_norm": 1.8928102254867554, + "learning_rate": 5e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7399893403053284, + "num_tokens": 335070188.0, + "step": 12947 + }, + { + "epoch": 1.4219196134416867, + "grad_norm": 1.9229949712753296, + "learning_rate": 5e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7340729236602783, + "num_tokens": 335095028.0, + "step": 12948 + }, + { + "epoch": 1.4220294311443005, + "grad_norm": 1.924788475036621, + "learning_rate": 5e-06, + "loss": 0.7974, + "mean_token_accuracy": 0.7474433779716492, + "num_tokens": 335120044.0, + "step": 12949 + }, + { + "epoch": 1.4221392488469142, + "grad_norm": 1.8200502395629883, + "learning_rate": 5e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7372563481330872, + "num_tokens": 335145744.0, + "step": 12950 + }, + { + "epoch": 1.4222490665495278, + "grad_norm": 1.5629359483718872, + "learning_rate": 5e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7105606198310852, + "num_tokens": 335179887.0, + "step": 12951 + }, + { + "epoch": 1.4223588842521415, + "grad_norm": 1.8772387504577637, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7307526469230652, + "num_tokens": 335205258.0, + "step": 12952 + }, + { + "epoch": 1.422468701954755, + "grad_norm": 1.8359272480010986, + "learning_rate": 5e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7260346412658691, + "num_tokens": 335233966.0, + "step": 12953 + }, + { + "epoch": 1.4225785196573688, + "grad_norm": 2.0969431400299072, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7136873602867126, + "num_tokens": 335256125.0, + "step": 12954 + }, + { + "epoch": 1.4226883373599823, + "grad_norm": 1.764862298965454, + "learning_rate": 5e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.731258749961853, + "num_tokens": 335285921.0, + "step": 12955 + }, + { + "epoch": 1.422798155062596, + "grad_norm": 1.8462355136871338, + "learning_rate": 5e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7478678822517395, + "num_tokens": 335312218.0, + "step": 12956 + }, + { + "epoch": 1.4229079727652096, + "grad_norm": 1.8539015054702759, + "learning_rate": 5e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7124208807945251, + "num_tokens": 335339851.0, + "step": 12957 + }, + { + "epoch": 1.4230177904678234, + "grad_norm": 1.940292477607727, + "learning_rate": 5e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6938877105712891, + "num_tokens": 335367221.0, + "step": 12958 + }, + { + "epoch": 1.4231276081704372, + "grad_norm": 1.858466386795044, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7308927774429321, + "num_tokens": 335393231.0, + "step": 12959 + }, + { + "epoch": 1.4232374258730507, + "grad_norm": 2.07342529296875, + "learning_rate": 5e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7402582168579102, + "num_tokens": 335415681.0, + "step": 12960 + }, + { + "epoch": 1.4233472435756644, + "grad_norm": 1.8261758089065552, + "learning_rate": 5e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7286473512649536, + "num_tokens": 335445276.0, + "step": 12961 + }, + { + "epoch": 1.423457061278278, + "grad_norm": 1.900681495666504, + "learning_rate": 5e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7427669763565063, + "num_tokens": 335468144.0, + "step": 12962 + }, + { + "epoch": 1.4235668789808917, + "grad_norm": 1.8130995035171509, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7172859907150269, + "num_tokens": 335494575.0, + "step": 12963 + }, + { + "epoch": 1.4236766966835055, + "grad_norm": 1.8851553201675415, + "learning_rate": 5e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7330421209335327, + "num_tokens": 335518774.0, + "step": 12964 + }, + { + "epoch": 1.423786514386119, + "grad_norm": 2.186168909072876, + "learning_rate": 5e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7464443445205688, + "num_tokens": 335537757.0, + "step": 12965 + }, + { + "epoch": 1.4238963320887328, + "grad_norm": 1.7212355136871338, + "learning_rate": 5e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7166789770126343, + "num_tokens": 335569050.0, + "step": 12966 + }, + { + "epoch": 1.4240061497913463, + "grad_norm": 1.7623590230941772, + "learning_rate": 5e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7274543046951294, + "num_tokens": 335596989.0, + "step": 12967 + }, + { + "epoch": 1.42411596749396, + "grad_norm": 1.725887656211853, + "learning_rate": 5e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7693170309066772, + "num_tokens": 335624760.0, + "step": 12968 + }, + { + "epoch": 1.4242257851965736, + "grad_norm": 1.6098171472549438, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.736479640007019, + "num_tokens": 335656057.0, + "step": 12969 + }, + { + "epoch": 1.4243356028991874, + "grad_norm": 1.9758304357528687, + "learning_rate": 5e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7286901473999023, + "num_tokens": 335678864.0, + "step": 12970 + }, + { + "epoch": 1.424445420601801, + "grad_norm": 1.8389935493469238, + "learning_rate": 5e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7254107594490051, + "num_tokens": 335704073.0, + "step": 12971 + }, + { + "epoch": 1.4245552383044147, + "grad_norm": 1.9130271673202515, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7036457061767578, + "num_tokens": 335730139.0, + "step": 12972 + }, + { + "epoch": 1.4246650560070284, + "grad_norm": 2.056300401687622, + "learning_rate": 5e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7178729176521301, + "num_tokens": 335752124.0, + "step": 12973 + }, + { + "epoch": 1.424774873709642, + "grad_norm": 2.0716030597686768, + "learning_rate": 5e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.755936324596405, + "num_tokens": 335771760.0, + "step": 12974 + }, + { + "epoch": 1.4248846914122557, + "grad_norm": 1.9315078258514404, + "learning_rate": 5e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7410455346107483, + "num_tokens": 335795194.0, + "step": 12975 + }, + { + "epoch": 1.4249945091148692, + "grad_norm": 1.8629893064498901, + "learning_rate": 5e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7003952264785767, + "num_tokens": 335822882.0, + "step": 12976 + }, + { + "epoch": 1.425104326817483, + "grad_norm": 1.816141128540039, + "learning_rate": 5e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.731235921382904, + "num_tokens": 335850606.0, + "step": 12977 + }, + { + "epoch": 1.4252141445200968, + "grad_norm": 1.6548385620117188, + "learning_rate": 5e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7393598556518555, + "num_tokens": 335880412.0, + "step": 12978 + }, + { + "epoch": 1.4253239622227103, + "grad_norm": 1.8109776973724365, + "learning_rate": 5e-06, + "loss": 0.808, + "mean_token_accuracy": 0.7391289472579956, + "num_tokens": 335907205.0, + "step": 12979 + }, + { + "epoch": 1.4254337799253238, + "grad_norm": 1.9492453336715698, + "learning_rate": 5e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7543506622314453, + "num_tokens": 335931816.0, + "step": 12980 + }, + { + "epoch": 1.4255435976279376, + "grad_norm": 1.7364355325698853, + "learning_rate": 5e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7395191192626953, + "num_tokens": 335958019.0, + "step": 12981 + }, + { + "epoch": 1.4256534153305513, + "grad_norm": 1.735870599746704, + "learning_rate": 5e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7094923853874207, + "num_tokens": 335987600.0, + "step": 12982 + }, + { + "epoch": 1.4257632330331649, + "grad_norm": 2.1051669120788574, + "learning_rate": 5e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7122205495834351, + "num_tokens": 336009288.0, + "step": 12983 + }, + { + "epoch": 1.4258730507357786, + "grad_norm": 1.8802374601364136, + "learning_rate": 5e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7381494641304016, + "num_tokens": 336033628.0, + "step": 12984 + }, + { + "epoch": 1.4259828684383922, + "grad_norm": 1.8310843706130981, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7498509883880615, + "num_tokens": 336058228.0, + "step": 12985 + }, + { + "epoch": 1.426092686141006, + "grad_norm": 1.8997650146484375, + "learning_rate": 5e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7292932868003845, + "num_tokens": 336082300.0, + "step": 12986 + }, + { + "epoch": 1.4262025038436197, + "grad_norm": 1.753755807876587, + "learning_rate": 5e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7257006168365479, + "num_tokens": 336112287.0, + "step": 12987 + }, + { + "epoch": 1.4263123215462332, + "grad_norm": 1.6754759550094604, + "learning_rate": 5e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7499023675918579, + "num_tokens": 336139828.0, + "step": 12988 + }, + { + "epoch": 1.426422139248847, + "grad_norm": 1.9205838441848755, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7674906253814697, + "num_tokens": 336163176.0, + "step": 12989 + }, + { + "epoch": 1.4265319569514605, + "grad_norm": 1.730757474899292, + "learning_rate": 5e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7203607559204102, + "num_tokens": 336192216.0, + "step": 12990 + }, + { + "epoch": 1.4266417746540743, + "grad_norm": 1.9843792915344238, + "learning_rate": 5e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7371277213096619, + "num_tokens": 336215600.0, + "step": 12991 + }, + { + "epoch": 1.426751592356688, + "grad_norm": 2.020236015319824, + "learning_rate": 5e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7666340470314026, + "num_tokens": 336235576.0, + "step": 12992 + }, + { + "epoch": 1.4268614100593016, + "grad_norm": 1.7523359060287476, + "learning_rate": 5e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7267963886260986, + "num_tokens": 336262815.0, + "step": 12993 + }, + { + "epoch": 1.426971227761915, + "grad_norm": 1.6750807762145996, + "learning_rate": 5e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7486517429351807, + "num_tokens": 336289898.0, + "step": 12994 + }, + { + "epoch": 1.4270810454645289, + "grad_norm": 1.8276519775390625, + "learning_rate": 5e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7277605533599854, + "num_tokens": 336317178.0, + "step": 12995 + }, + { + "epoch": 1.4271908631671426, + "grad_norm": 1.9594579935073853, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7394428253173828, + "num_tokens": 336342920.0, + "step": 12996 + }, + { + "epoch": 1.4273006808697561, + "grad_norm": 1.8651585578918457, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7408398389816284, + "num_tokens": 336368004.0, + "step": 12997 + }, + { + "epoch": 1.42741049857237, + "grad_norm": 2.2121832370758057, + "learning_rate": 5e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7151702642440796, + "num_tokens": 336389193.0, + "step": 12998 + }, + { + "epoch": 1.4275203162749834, + "grad_norm": 1.7945903539657593, + "learning_rate": 5e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.759894847869873, + "num_tokens": 336412146.0, + "step": 12999 + }, + { + "epoch": 1.4276301339775972, + "grad_norm": 1.744948148727417, + "learning_rate": 5e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.736709713935852, + "num_tokens": 336440015.0, + "step": 13000 + }, + { + "epoch": 1.427739951680211, + "grad_norm": 1.6904710531234741, + "learning_rate": 5e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7251323461532593, + "num_tokens": 336468212.0, + "step": 13001 + }, + { + "epoch": 1.4278497693828245, + "grad_norm": 1.7687300443649292, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.729337751865387, + "num_tokens": 336493943.0, + "step": 13002 + }, + { + "epoch": 1.4279595870854382, + "grad_norm": 1.800901174545288, + "learning_rate": 5e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7287468314170837, + "num_tokens": 336520115.0, + "step": 13003 + }, + { + "epoch": 1.4280694047880518, + "grad_norm": 1.8585031032562256, + "learning_rate": 5e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7230831384658813, + "num_tokens": 336544665.0, + "step": 13004 + }, + { + "epoch": 1.4281792224906655, + "grad_norm": 2.0017285346984863, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7231771945953369, + "num_tokens": 336567808.0, + "step": 13005 + }, + { + "epoch": 1.4282890401932793, + "grad_norm": 1.8769340515136719, + "learning_rate": 5e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7493799924850464, + "num_tokens": 336589342.0, + "step": 13006 + }, + { + "epoch": 1.4283988578958928, + "grad_norm": 1.6590723991394043, + "learning_rate": 5e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7633090615272522, + "num_tokens": 336618677.0, + "step": 13007 + }, + { + "epoch": 1.4285086755985064, + "grad_norm": 2.003094434738159, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.750848650932312, + "num_tokens": 336642314.0, + "step": 13008 + }, + { + "epoch": 1.4286184933011201, + "grad_norm": 2.0697879791259766, + "learning_rate": 5e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7413368225097656, + "num_tokens": 336664050.0, + "step": 13009 + }, + { + "epoch": 1.4287283110037339, + "grad_norm": 1.7280107736587524, + "learning_rate": 5e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.727653980255127, + "num_tokens": 336691964.0, + "step": 13010 + }, + { + "epoch": 1.4288381287063474, + "grad_norm": 1.8974906206130981, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7130672931671143, + "num_tokens": 336718876.0, + "step": 13011 + }, + { + "epoch": 1.4289479464089612, + "grad_norm": 1.884240984916687, + "learning_rate": 5e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7103286385536194, + "num_tokens": 336745677.0, + "step": 13012 + }, + { + "epoch": 1.4290577641115747, + "grad_norm": 1.8237330913543701, + "learning_rate": 5e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7518147230148315, + "num_tokens": 336771117.0, + "step": 13013 + }, + { + "epoch": 1.4291675818141885, + "grad_norm": 1.7423080205917358, + "learning_rate": 5e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7087621092796326, + "num_tokens": 336800280.0, + "step": 13014 + }, + { + "epoch": 1.4292773995168022, + "grad_norm": 1.845647931098938, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7311208844184875, + "num_tokens": 336825894.0, + "step": 13015 + }, + { + "epoch": 1.4293872172194158, + "grad_norm": 2.1031405925750732, + "learning_rate": 5e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7257863283157349, + "num_tokens": 336847637.0, + "step": 13016 + }, + { + "epoch": 1.4294970349220295, + "grad_norm": 2.0022060871124268, + "learning_rate": 5e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7508740425109863, + "num_tokens": 336867506.0, + "step": 13017 + }, + { + "epoch": 1.429606852624643, + "grad_norm": 1.9238837957382202, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7256618142127991, + "num_tokens": 336892244.0, + "step": 13018 + }, + { + "epoch": 1.4297166703272568, + "grad_norm": 1.7704659700393677, + "learning_rate": 5e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7022784948348999, + "num_tokens": 336922827.0, + "step": 13019 + }, + { + "epoch": 1.4298264880298703, + "grad_norm": 1.9544190168380737, + "learning_rate": 5e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7605924606323242, + "num_tokens": 336944212.0, + "step": 13020 + }, + { + "epoch": 1.429936305732484, + "grad_norm": 1.9329075813293457, + "learning_rate": 5e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7218592762947083, + "num_tokens": 336967720.0, + "step": 13021 + }, + { + "epoch": 1.4300461234350976, + "grad_norm": 1.7334903478622437, + "learning_rate": 5e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7401083707809448, + "num_tokens": 336995672.0, + "step": 13022 + }, + { + "epoch": 1.4301559411377114, + "grad_norm": 1.7443482875823975, + "learning_rate": 5e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7101662755012512, + "num_tokens": 337026379.0, + "step": 13023 + }, + { + "epoch": 1.4302657588403251, + "grad_norm": 2.027970552444458, + "learning_rate": 5e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7326219081878662, + "num_tokens": 337048242.0, + "step": 13024 + }, + { + "epoch": 1.4303755765429387, + "grad_norm": 1.7277940511703491, + "learning_rate": 5e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7212430238723755, + "num_tokens": 337080272.0, + "step": 13025 + }, + { + "epoch": 1.4304853942455524, + "grad_norm": 1.7821922302246094, + "learning_rate": 5e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7112975120544434, + "num_tokens": 337111164.0, + "step": 13026 + }, + { + "epoch": 1.430595211948166, + "grad_norm": 1.7591685056686401, + "learning_rate": 5e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.7438527345657349, + "num_tokens": 337139291.0, + "step": 13027 + }, + { + "epoch": 1.4307050296507797, + "grad_norm": 1.7680821418762207, + "learning_rate": 5e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7441586256027222, + "num_tokens": 337165058.0, + "step": 13028 + }, + { + "epoch": 1.4308148473533935, + "grad_norm": 1.8955367803573608, + "learning_rate": 5e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7453902959823608, + "num_tokens": 337187946.0, + "step": 13029 + }, + { + "epoch": 1.430924665056007, + "grad_norm": 1.858046531677246, + "learning_rate": 5e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7375017404556274, + "num_tokens": 337213655.0, + "step": 13030 + }, + { + "epoch": 1.4310344827586206, + "grad_norm": 1.9732000827789307, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7430768609046936, + "num_tokens": 337237239.0, + "step": 13031 + }, + { + "epoch": 1.4311443004612343, + "grad_norm": 1.754516363143921, + "learning_rate": 5e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7353680729866028, + "num_tokens": 337262915.0, + "step": 13032 + }, + { + "epoch": 1.431254118163848, + "grad_norm": 1.758022427558899, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7329214811325073, + "num_tokens": 337290237.0, + "step": 13033 + }, + { + "epoch": 1.4313639358664616, + "grad_norm": 1.809890866279602, + "learning_rate": 5e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7037991285324097, + "num_tokens": 337318817.0, + "step": 13034 + }, + { + "epoch": 1.4314737535690754, + "grad_norm": 1.8607852458953857, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7282860279083252, + "num_tokens": 337347850.0, + "step": 13035 + }, + { + "epoch": 1.431583571271689, + "grad_norm": 1.7204021215438843, + "learning_rate": 5e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7275329828262329, + "num_tokens": 337376932.0, + "step": 13036 + }, + { + "epoch": 1.4316933889743026, + "grad_norm": 1.857621669769287, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7531284689903259, + "num_tokens": 337399975.0, + "step": 13037 + }, + { + "epoch": 1.4318032066769164, + "grad_norm": 1.7089778184890747, + "learning_rate": 5e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.6998765468597412, + "num_tokens": 337431267.0, + "step": 13038 + }, + { + "epoch": 1.43191302437953, + "grad_norm": 1.83856201171875, + "learning_rate": 5e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7249338626861572, + "num_tokens": 337457678.0, + "step": 13039 + }, + { + "epoch": 1.4320228420821437, + "grad_norm": 1.739776611328125, + "learning_rate": 5e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7388004064559937, + "num_tokens": 337483796.0, + "step": 13040 + }, + { + "epoch": 1.4321326597847572, + "grad_norm": 1.7468968629837036, + "learning_rate": 5e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7120571732521057, + "num_tokens": 337512013.0, + "step": 13041 + }, + { + "epoch": 1.432242477487371, + "grad_norm": 1.7726455926895142, + "learning_rate": 5e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7415416240692139, + "num_tokens": 337538724.0, + "step": 13042 + }, + { + "epoch": 1.4323522951899847, + "grad_norm": 1.7936184406280518, + "learning_rate": 5e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7387267351150513, + "num_tokens": 337561722.0, + "step": 13043 + }, + { + "epoch": 1.4324621128925983, + "grad_norm": 1.6129521131515503, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.754510760307312, + "num_tokens": 337589466.0, + "step": 13044 + }, + { + "epoch": 1.4325719305952118, + "grad_norm": 1.781972885131836, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.737919807434082, + "num_tokens": 337617185.0, + "step": 13045 + }, + { + "epoch": 1.4326817482978256, + "grad_norm": 1.9333970546722412, + "learning_rate": 5e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7382565140724182, + "num_tokens": 337642442.0, + "step": 13046 + }, + { + "epoch": 1.4327915660004393, + "grad_norm": 1.9676134586334229, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7262364625930786, + "num_tokens": 337667747.0, + "step": 13047 + }, + { + "epoch": 1.4329013837030529, + "grad_norm": 2.023082733154297, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7328925728797913, + "num_tokens": 337690889.0, + "step": 13048 + }, + { + "epoch": 1.4330112014056666, + "grad_norm": 2.0119407176971436, + "learning_rate": 5e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.758027970790863, + "num_tokens": 337711631.0, + "step": 13049 + }, + { + "epoch": 1.4331210191082802, + "grad_norm": 1.8221144676208496, + "learning_rate": 5e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7379762530326843, + "num_tokens": 337739243.0, + "step": 13050 + }, + { + "epoch": 1.433230836810894, + "grad_norm": 1.6760859489440918, + "learning_rate": 5e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7052812576293945, + "num_tokens": 337771332.0, + "step": 13051 + }, + { + "epoch": 1.4333406545135077, + "grad_norm": 1.8138277530670166, + "learning_rate": 5e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7401139736175537, + "num_tokens": 337795228.0, + "step": 13052 + }, + { + "epoch": 1.4334504722161212, + "grad_norm": 1.826951503753662, + "learning_rate": 5e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.745521068572998, + "num_tokens": 337821086.0, + "step": 13053 + }, + { + "epoch": 1.433560289918735, + "grad_norm": 1.847808599472046, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7268722057342529, + "num_tokens": 337847593.0, + "step": 13054 + }, + { + "epoch": 1.4336701076213485, + "grad_norm": 1.9147241115570068, + "learning_rate": 5e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7387847900390625, + "num_tokens": 337871795.0, + "step": 13055 + }, + { + "epoch": 1.4337799253239623, + "grad_norm": 1.9610905647277832, + "learning_rate": 5e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7423750758171082, + "num_tokens": 337895562.0, + "step": 13056 + }, + { + "epoch": 1.433889743026576, + "grad_norm": 2.065068006515503, + "learning_rate": 5e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7269545793533325, + "num_tokens": 337918121.0, + "step": 13057 + }, + { + "epoch": 1.4339995607291895, + "grad_norm": 1.9577854871749878, + "learning_rate": 5e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7393342852592468, + "num_tokens": 337940943.0, + "step": 13058 + }, + { + "epoch": 1.434109378431803, + "grad_norm": 1.9948710203170776, + "learning_rate": 5e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7296680212020874, + "num_tokens": 337964845.0, + "step": 13059 + }, + { + "epoch": 1.4342191961344168, + "grad_norm": 1.873623013496399, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.738465428352356, + "num_tokens": 337990797.0, + "step": 13060 + }, + { + "epoch": 1.4343290138370306, + "grad_norm": 1.7563345432281494, + "learning_rate": 5e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7142661213874817, + "num_tokens": 338021545.0, + "step": 13061 + }, + { + "epoch": 1.4344388315396441, + "grad_norm": 1.6723512411117554, + "learning_rate": 5e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7303099036216736, + "num_tokens": 338049958.0, + "step": 13062 + }, + { + "epoch": 1.434548649242258, + "grad_norm": 2.2349181175231934, + "learning_rate": 5e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7443987131118774, + "num_tokens": 338070263.0, + "step": 13063 + }, + { + "epoch": 1.4346584669448714, + "grad_norm": 1.7108783721923828, + "learning_rate": 5e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7274396419525146, + "num_tokens": 338100929.0, + "step": 13064 + }, + { + "epoch": 1.4347682846474852, + "grad_norm": 1.7627933025360107, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7088850140571594, + "num_tokens": 338129775.0, + "step": 13065 + }, + { + "epoch": 1.434878102350099, + "grad_norm": 1.8578057289123535, + "learning_rate": 5e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7365589737892151, + "num_tokens": 338154477.0, + "step": 13066 + }, + { + "epoch": 1.4349879200527125, + "grad_norm": 1.7871394157409668, + "learning_rate": 5e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7302576303482056, + "num_tokens": 338180587.0, + "step": 13067 + }, + { + "epoch": 1.4350977377553262, + "grad_norm": 1.9935601949691772, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7367225289344788, + "num_tokens": 338202059.0, + "step": 13068 + }, + { + "epoch": 1.4352075554579398, + "grad_norm": 1.8847109079360962, + "learning_rate": 5e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7572749853134155, + "num_tokens": 338226555.0, + "step": 13069 + }, + { + "epoch": 1.4353173731605535, + "grad_norm": 1.8214783668518066, + "learning_rate": 5e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7057684659957886, + "num_tokens": 338260567.0, + "step": 13070 + }, + { + "epoch": 1.4354271908631673, + "grad_norm": 1.6966444253921509, + "learning_rate": 5e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7013375163078308, + "num_tokens": 338290812.0, + "step": 13071 + }, + { + "epoch": 1.4355370085657808, + "grad_norm": 1.9571090936660767, + "learning_rate": 5e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7407915592193604, + "num_tokens": 338313652.0, + "step": 13072 + }, + { + "epoch": 1.4356468262683943, + "grad_norm": 1.9317634105682373, + "learning_rate": 5e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7337483763694763, + "num_tokens": 338340316.0, + "step": 13073 + }, + { + "epoch": 1.435756643971008, + "grad_norm": 1.9085404872894287, + "learning_rate": 5e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7528841495513916, + "num_tokens": 338362178.0, + "step": 13074 + }, + { + "epoch": 1.4358664616736219, + "grad_norm": 1.6079622507095337, + "learning_rate": 5e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7162759900093079, + "num_tokens": 338395822.0, + "step": 13075 + }, + { + "epoch": 1.4359762793762354, + "grad_norm": 1.7322721481323242, + "learning_rate": 5e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7120239734649658, + "num_tokens": 338426575.0, + "step": 13076 + }, + { + "epoch": 1.4360860970788492, + "grad_norm": 1.911139965057373, + "learning_rate": 5e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7296847701072693, + "num_tokens": 338450941.0, + "step": 13077 + }, + { + "epoch": 1.4361959147814627, + "grad_norm": 2.0236663818359375, + "learning_rate": 5e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7369203567504883, + "num_tokens": 338473133.0, + "step": 13078 + }, + { + "epoch": 1.4363057324840764, + "grad_norm": 1.9050146341323853, + "learning_rate": 5e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7409117221832275, + "num_tokens": 338495862.0, + "step": 13079 + }, + { + "epoch": 1.4364155501866902, + "grad_norm": 1.7948857545852661, + "learning_rate": 5e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7462887167930603, + "num_tokens": 338521534.0, + "step": 13080 + }, + { + "epoch": 1.4365253678893037, + "grad_norm": 1.7945631742477417, + "learning_rate": 5e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7427616119384766, + "num_tokens": 338550186.0, + "step": 13081 + }, + { + "epoch": 1.4366351855919175, + "grad_norm": 2.013155460357666, + "learning_rate": 5e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7242729663848877, + "num_tokens": 338576042.0, + "step": 13082 + }, + { + "epoch": 1.436745003294531, + "grad_norm": 1.8476470708847046, + "learning_rate": 5e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7296701669692993, + "num_tokens": 338600282.0, + "step": 13083 + }, + { + "epoch": 1.4368548209971448, + "grad_norm": 1.7248631715774536, + "learning_rate": 5e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7232596278190613, + "num_tokens": 338632583.0, + "step": 13084 + }, + { + "epoch": 1.4369646386997583, + "grad_norm": 1.9038668870925903, + "learning_rate": 5e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7138767242431641, + "num_tokens": 338658558.0, + "step": 13085 + }, + { + "epoch": 1.437074456402372, + "grad_norm": 1.8090200424194336, + "learning_rate": 5e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7327746152877808, + "num_tokens": 338685535.0, + "step": 13086 + }, + { + "epoch": 1.4371842741049856, + "grad_norm": 1.8750886917114258, + "learning_rate": 5e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7229385375976562, + "num_tokens": 338711280.0, + "step": 13087 + }, + { + "epoch": 1.4372940918075994, + "grad_norm": 1.9790067672729492, + "learning_rate": 5e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7320438623428345, + "num_tokens": 338734642.0, + "step": 13088 + }, + { + "epoch": 1.4374039095102131, + "grad_norm": 1.7531081438064575, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7364224195480347, + "num_tokens": 338764051.0, + "step": 13089 + }, + { + "epoch": 1.4375137272128267, + "grad_norm": 1.7424269914627075, + "learning_rate": 5e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.7827574014663696, + "num_tokens": 338787303.0, + "step": 13090 + }, + { + "epoch": 1.4376235449154404, + "grad_norm": 1.9486360549926758, + "learning_rate": 5e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7477086782455444, + "num_tokens": 338810016.0, + "step": 13091 + }, + { + "epoch": 1.437733362618054, + "grad_norm": 2.0441431999206543, + "learning_rate": 5e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7229413986206055, + "num_tokens": 338834060.0, + "step": 13092 + }, + { + "epoch": 1.4378431803206677, + "grad_norm": 1.8821717500686646, + "learning_rate": 5e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7419913411140442, + "num_tokens": 338859187.0, + "step": 13093 + }, + { + "epoch": 1.4379529980232815, + "grad_norm": 1.715078592300415, + "learning_rate": 5e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7042654156684875, + "num_tokens": 338888134.0, + "step": 13094 + }, + { + "epoch": 1.438062815725895, + "grad_norm": 1.9848970174789429, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.74433434009552, + "num_tokens": 338910849.0, + "step": 13095 + }, + { + "epoch": 1.4381726334285085, + "grad_norm": 1.9193737506866455, + "learning_rate": 5e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7386664748191833, + "num_tokens": 338936517.0, + "step": 13096 + }, + { + "epoch": 1.4382824511311223, + "grad_norm": 1.8470171689987183, + "learning_rate": 5e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7251147031784058, + "num_tokens": 338961788.0, + "step": 13097 + }, + { + "epoch": 1.438392268833736, + "grad_norm": 1.8774645328521729, + "learning_rate": 5e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7179983854293823, + "num_tokens": 338988418.0, + "step": 13098 + }, + { + "epoch": 1.4385020865363496, + "grad_norm": 2.03653621673584, + "learning_rate": 5e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7451948523521423, + "num_tokens": 339009819.0, + "step": 13099 + }, + { + "epoch": 1.4386119042389633, + "grad_norm": 1.764245867729187, + "learning_rate": 5e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7092795372009277, + "num_tokens": 339037021.0, + "step": 13100 + }, + { + "epoch": 1.4387217219415769, + "grad_norm": 1.6416709423065186, + "learning_rate": 5e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7154563665390015, + "num_tokens": 339070199.0, + "step": 13101 + }, + { + "epoch": 1.4388315396441906, + "grad_norm": 1.6981313228607178, + "learning_rate": 5e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7004134654998779, + "num_tokens": 339102881.0, + "step": 13102 + }, + { + "epoch": 1.4389413573468044, + "grad_norm": 2.0599136352539062, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7546605467796326, + "num_tokens": 339122722.0, + "step": 13103 + }, + { + "epoch": 1.439051175049418, + "grad_norm": 1.9046204090118408, + "learning_rate": 5e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7619985938072205, + "num_tokens": 339144871.0, + "step": 13104 + }, + { + "epoch": 1.4391609927520317, + "grad_norm": 1.7886003255844116, + "learning_rate": 5e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7457334399223328, + "num_tokens": 339170747.0, + "step": 13105 + }, + { + "epoch": 1.4392708104546452, + "grad_norm": 1.815299391746521, + "learning_rate": 5e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7397423982620239, + "num_tokens": 339196500.0, + "step": 13106 + }, + { + "epoch": 1.439380628157259, + "grad_norm": 1.8808809518814087, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7387120723724365, + "num_tokens": 339221634.0, + "step": 13107 + }, + { + "epoch": 1.4394904458598727, + "grad_norm": 1.8822190761566162, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7465644478797913, + "num_tokens": 339244241.0, + "step": 13108 + }, + { + "epoch": 1.4396002635624863, + "grad_norm": 1.7816740274429321, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7202523946762085, + "num_tokens": 339270149.0, + "step": 13109 + }, + { + "epoch": 1.4397100812650998, + "grad_norm": 1.6475523710250854, + "learning_rate": 5e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7358964085578918, + "num_tokens": 339299425.0, + "step": 13110 + }, + { + "epoch": 1.4398198989677136, + "grad_norm": 1.722740650177002, + "learning_rate": 5e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7195069789886475, + "num_tokens": 339327999.0, + "step": 13111 + }, + { + "epoch": 1.4399297166703273, + "grad_norm": 1.9083871841430664, + "learning_rate": 5e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.726751983165741, + "num_tokens": 339351328.0, + "step": 13112 + }, + { + "epoch": 1.4400395343729409, + "grad_norm": 1.8732109069824219, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7373488545417786, + "num_tokens": 339375328.0, + "step": 13113 + }, + { + "epoch": 1.4401493520755546, + "grad_norm": 1.619430422782898, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7308375835418701, + "num_tokens": 339409175.0, + "step": 13114 + }, + { + "epoch": 1.4402591697781681, + "grad_norm": 2.0376312732696533, + "learning_rate": 5e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7328571081161499, + "num_tokens": 339432490.0, + "step": 13115 + }, + { + "epoch": 1.440368987480782, + "grad_norm": 1.8499544858932495, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7310084700584412, + "num_tokens": 339460439.0, + "step": 13116 + }, + { + "epoch": 1.4404788051833957, + "grad_norm": 1.7027397155761719, + "learning_rate": 5e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7388489842414856, + "num_tokens": 339490261.0, + "step": 13117 + }, + { + "epoch": 1.4405886228860092, + "grad_norm": 1.7332278490066528, + "learning_rate": 5e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7195236682891846, + "num_tokens": 339518661.0, + "step": 13118 + }, + { + "epoch": 1.440698440588623, + "grad_norm": 1.9045318365097046, + "learning_rate": 5e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7342281937599182, + "num_tokens": 339545711.0, + "step": 13119 + }, + { + "epoch": 1.4408082582912365, + "grad_norm": 1.8042054176330566, + "learning_rate": 5e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7377197742462158, + "num_tokens": 339572201.0, + "step": 13120 + }, + { + "epoch": 1.4409180759938502, + "grad_norm": 1.9111664295196533, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7333513498306274, + "num_tokens": 339597002.0, + "step": 13121 + }, + { + "epoch": 1.441027893696464, + "grad_norm": 1.824180006980896, + "learning_rate": 5e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7402962446212769, + "num_tokens": 339623634.0, + "step": 13122 + }, + { + "epoch": 1.4411377113990775, + "grad_norm": 1.7179392576217651, + "learning_rate": 5e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7402117252349854, + "num_tokens": 339651279.0, + "step": 13123 + }, + { + "epoch": 1.441247529101691, + "grad_norm": 1.7298933267593384, + "learning_rate": 5e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7314774990081787, + "num_tokens": 339678470.0, + "step": 13124 + }, + { + "epoch": 1.4413573468043048, + "grad_norm": 1.7889366149902344, + "learning_rate": 5e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7449044585227966, + "num_tokens": 339704396.0, + "step": 13125 + }, + { + "epoch": 1.4414671645069186, + "grad_norm": 1.4757838249206543, + "learning_rate": 5e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7401362657546997, + "num_tokens": 339737292.0, + "step": 13126 + }, + { + "epoch": 1.4415769822095321, + "grad_norm": 1.7210755348205566, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7454736232757568, + "num_tokens": 339767998.0, + "step": 13127 + }, + { + "epoch": 1.4416867999121459, + "grad_norm": 2.0000030994415283, + "learning_rate": 5e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7453341484069824, + "num_tokens": 339791438.0, + "step": 13128 + }, + { + "epoch": 1.4417966176147594, + "grad_norm": 1.847537875175476, + "learning_rate": 5e-06, + "loss": 0.802, + "mean_token_accuracy": 0.744795560836792, + "num_tokens": 339816915.0, + "step": 13129 + }, + { + "epoch": 1.4419064353173732, + "grad_norm": 1.9921897649765015, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7256613969802856, + "num_tokens": 339840360.0, + "step": 13130 + }, + { + "epoch": 1.442016253019987, + "grad_norm": 2.111743688583374, + "learning_rate": 5e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7254009246826172, + "num_tokens": 339861878.0, + "step": 13131 + }, + { + "epoch": 1.4421260707226005, + "grad_norm": 2.0996294021606445, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7651760578155518, + "num_tokens": 339881688.0, + "step": 13132 + }, + { + "epoch": 1.4422358884252142, + "grad_norm": 2.030768632888794, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7237712144851685, + "num_tokens": 339904340.0, + "step": 13133 + }, + { + "epoch": 1.4423457061278278, + "grad_norm": 1.7631313800811768, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7403611540794373, + "num_tokens": 339933756.0, + "step": 13134 + }, + { + "epoch": 1.4424555238304415, + "grad_norm": 1.804368257522583, + "learning_rate": 5e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7492095828056335, + "num_tokens": 339959854.0, + "step": 13135 + }, + { + "epoch": 1.442565341533055, + "grad_norm": 1.9848650693893433, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7225210666656494, + "num_tokens": 339984843.0, + "step": 13136 + }, + { + "epoch": 1.4426751592356688, + "grad_norm": 1.9081902503967285, + "learning_rate": 5e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7445347309112549, + "num_tokens": 340009109.0, + "step": 13137 + }, + { + "epoch": 1.4427849769382823, + "grad_norm": 1.9433847665786743, + "learning_rate": 5e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7300278544425964, + "num_tokens": 340032799.0, + "step": 13138 + }, + { + "epoch": 1.442894794640896, + "grad_norm": 1.6616551876068115, + "learning_rate": 5e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7362567186355591, + "num_tokens": 340061660.0, + "step": 13139 + }, + { + "epoch": 1.4430046123435099, + "grad_norm": 1.6295619010925293, + "learning_rate": 5e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7117942571640015, + "num_tokens": 340093429.0, + "step": 13140 + }, + { + "epoch": 1.4431144300461234, + "grad_norm": 1.7629222869873047, + "learning_rate": 5e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7452201247215271, + "num_tokens": 340119081.0, + "step": 13141 + }, + { + "epoch": 1.4432242477487371, + "grad_norm": 1.685468077659607, + "learning_rate": 5e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7166366577148438, + "num_tokens": 340148954.0, + "step": 13142 + }, + { + "epoch": 1.4433340654513507, + "grad_norm": 2.0388362407684326, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7512648105621338, + "num_tokens": 340170047.0, + "step": 13143 + }, + { + "epoch": 1.4434438831539644, + "grad_norm": 1.750504732131958, + "learning_rate": 5e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7500307559967041, + "num_tokens": 340197664.0, + "step": 13144 + }, + { + "epoch": 1.4435537008565782, + "grad_norm": 2.0737807750701904, + "learning_rate": 5e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7338773012161255, + "num_tokens": 340223694.0, + "step": 13145 + }, + { + "epoch": 1.4436635185591917, + "grad_norm": 1.9062957763671875, + "learning_rate": 5e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7434664964675903, + "num_tokens": 340251732.0, + "step": 13146 + }, + { + "epoch": 1.4437733362618055, + "grad_norm": 1.7798267602920532, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7617194652557373, + "num_tokens": 340278249.0, + "step": 13147 + }, + { + "epoch": 1.443883153964419, + "grad_norm": 1.876638650894165, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.758689284324646, + "num_tokens": 340302680.0, + "step": 13148 + }, + { + "epoch": 1.4439929716670328, + "grad_norm": 1.802885890007019, + "learning_rate": 5e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7164641618728638, + "num_tokens": 340333318.0, + "step": 13149 + }, + { + "epoch": 1.4441027893696463, + "grad_norm": 1.868064045906067, + "learning_rate": 5e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7433224320411682, + "num_tokens": 340359699.0, + "step": 13150 + }, + { + "epoch": 1.44421260707226, + "grad_norm": 2.036280870437622, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7324771881103516, + "num_tokens": 340382846.0, + "step": 13151 + }, + { + "epoch": 1.4443224247748736, + "grad_norm": 2.1574697494506836, + "learning_rate": 5e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.737785816192627, + "num_tokens": 340404044.0, + "step": 13152 + }, + { + "epoch": 1.4444322424774874, + "grad_norm": 2.024970769882202, + "learning_rate": 5e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7390057444572449, + "num_tokens": 340427536.0, + "step": 13153 + }, + { + "epoch": 1.4445420601801011, + "grad_norm": 2.0087902545928955, + "learning_rate": 5e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7565312385559082, + "num_tokens": 340447114.0, + "step": 13154 + }, + { + "epoch": 1.4446518778827147, + "grad_norm": 2.072087287902832, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7212195992469788, + "num_tokens": 340469662.0, + "step": 13155 + }, + { + "epoch": 1.4447616955853284, + "grad_norm": 1.8627338409423828, + "learning_rate": 5e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7307097315788269, + "num_tokens": 340493394.0, + "step": 13156 + }, + { + "epoch": 1.444871513287942, + "grad_norm": 1.9469023942947388, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7503840923309326, + "num_tokens": 340514571.0, + "step": 13157 + }, + { + "epoch": 1.4449813309905557, + "grad_norm": 2.2529263496398926, + "learning_rate": 5e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7078487873077393, + "num_tokens": 340535607.0, + "step": 13158 + }, + { + "epoch": 1.4450911486931695, + "grad_norm": 1.8567689657211304, + "learning_rate": 5e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7352671027183533, + "num_tokens": 340561022.0, + "step": 13159 + }, + { + "epoch": 1.445200966395783, + "grad_norm": 1.9600468873977661, + "learning_rate": 5e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7450064420700073, + "num_tokens": 340582818.0, + "step": 13160 + }, + { + "epoch": 1.4453107840983965, + "grad_norm": 1.7330975532531738, + "learning_rate": 5e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7357486486434937, + "num_tokens": 340611741.0, + "step": 13161 + }, + { + "epoch": 1.4454206018010103, + "grad_norm": 1.8434151411056519, + "learning_rate": 5e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7421801686286926, + "num_tokens": 340637736.0, + "step": 13162 + }, + { + "epoch": 1.445530419503624, + "grad_norm": 1.6669436693191528, + "learning_rate": 5e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7183987498283386, + "num_tokens": 340672366.0, + "step": 13163 + }, + { + "epoch": 1.4456402372062376, + "grad_norm": 1.6203948259353638, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7152942419052124, + "num_tokens": 340704949.0, + "step": 13164 + }, + { + "epoch": 1.4457500549088513, + "grad_norm": 1.7892714738845825, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.715912938117981, + "num_tokens": 340733206.0, + "step": 13165 + }, + { + "epoch": 1.4458598726114649, + "grad_norm": 1.9303810596466064, + "learning_rate": 5e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7457239627838135, + "num_tokens": 340754644.0, + "step": 13166 + }, + { + "epoch": 1.4459696903140786, + "grad_norm": 1.8877079486846924, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7265224456787109, + "num_tokens": 340779743.0, + "step": 13167 + }, + { + "epoch": 1.4460795080166924, + "grad_norm": 1.8449811935424805, + "learning_rate": 5e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7262427806854248, + "num_tokens": 340805901.0, + "step": 13168 + }, + { + "epoch": 1.446189325719306, + "grad_norm": 1.8479321002960205, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7327169179916382, + "num_tokens": 340830974.0, + "step": 13169 + }, + { + "epoch": 1.4462991434219197, + "grad_norm": 1.8953654766082764, + "learning_rate": 5e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.723362147808075, + "num_tokens": 340856874.0, + "step": 13170 + }, + { + "epoch": 1.4464089611245332, + "grad_norm": 2.0529301166534424, + "learning_rate": 5e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7284936904907227, + "num_tokens": 340879195.0, + "step": 13171 + }, + { + "epoch": 1.446518778827147, + "grad_norm": 1.710613489151001, + "learning_rate": 5e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7304567098617554, + "num_tokens": 340908163.0, + "step": 13172 + }, + { + "epoch": 1.4466285965297607, + "grad_norm": 2.0213325023651123, + "learning_rate": 5e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7475718259811401, + "num_tokens": 340929258.0, + "step": 13173 + }, + { + "epoch": 1.4467384142323743, + "grad_norm": 1.691880464553833, + "learning_rate": 5e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7388218641281128, + "num_tokens": 340956663.0, + "step": 13174 + }, + { + "epoch": 1.4468482319349878, + "grad_norm": 1.811019778251648, + "learning_rate": 5e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7126032114028931, + "num_tokens": 340983947.0, + "step": 13175 + }, + { + "epoch": 1.4469580496376016, + "grad_norm": 2.1447718143463135, + "learning_rate": 5e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.716315507888794, + "num_tokens": 341005723.0, + "step": 13176 + }, + { + "epoch": 1.4470678673402153, + "grad_norm": 1.7279784679412842, + "learning_rate": 5e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7263463735580444, + "num_tokens": 341035743.0, + "step": 13177 + }, + { + "epoch": 1.4471776850428288, + "grad_norm": 1.7146484851837158, + "learning_rate": 5e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7368947863578796, + "num_tokens": 341063971.0, + "step": 13178 + }, + { + "epoch": 1.4472875027454426, + "grad_norm": 1.9750546216964722, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7335742115974426, + "num_tokens": 341086693.0, + "step": 13179 + }, + { + "epoch": 1.4473973204480561, + "grad_norm": 1.9178276062011719, + "learning_rate": 5e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7320887446403503, + "num_tokens": 341111023.0, + "step": 13180 + }, + { + "epoch": 1.44750713815067, + "grad_norm": 2.0987155437469482, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7503746747970581, + "num_tokens": 341130915.0, + "step": 13181 + }, + { + "epoch": 1.4476169558532836, + "grad_norm": 1.8472161293029785, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7367736101150513, + "num_tokens": 341155929.0, + "step": 13182 + }, + { + "epoch": 1.4477267735558972, + "grad_norm": 1.8310288190841675, + "learning_rate": 5e-06, + "loss": 0.873, + "mean_token_accuracy": 0.729227602481842, + "num_tokens": 341184712.0, + "step": 13183 + }, + { + "epoch": 1.447836591258511, + "grad_norm": 1.8630211353302002, + "learning_rate": 5e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7133247256278992, + "num_tokens": 341212694.0, + "step": 13184 + }, + { + "epoch": 1.4479464089611245, + "grad_norm": 1.901481032371521, + "learning_rate": 5e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.719538688659668, + "num_tokens": 341239873.0, + "step": 13185 + }, + { + "epoch": 1.4480562266637382, + "grad_norm": 1.9590175151824951, + "learning_rate": 5e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7341568470001221, + "num_tokens": 341264699.0, + "step": 13186 + }, + { + "epoch": 1.448166044366352, + "grad_norm": 1.6315573453903198, + "learning_rate": 5e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7240155339241028, + "num_tokens": 341295796.0, + "step": 13187 + }, + { + "epoch": 1.4482758620689655, + "grad_norm": 1.9911092519760132, + "learning_rate": 5e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7384081482887268, + "num_tokens": 341317580.0, + "step": 13188 + }, + { + "epoch": 1.448385679771579, + "grad_norm": 1.9075239896774292, + "learning_rate": 5e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7234611511230469, + "num_tokens": 341345279.0, + "step": 13189 + }, + { + "epoch": 1.4484954974741928, + "grad_norm": 1.8623605966567993, + "learning_rate": 5e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7294638156890869, + "num_tokens": 341370808.0, + "step": 13190 + }, + { + "epoch": 1.4486053151768066, + "grad_norm": 1.8712198734283447, + "learning_rate": 5e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7063313722610474, + "num_tokens": 341395940.0, + "step": 13191 + }, + { + "epoch": 1.44871513287942, + "grad_norm": 2.0771188735961914, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7354674339294434, + "num_tokens": 341417147.0, + "step": 13192 + }, + { + "epoch": 1.4488249505820339, + "grad_norm": 1.9870448112487793, + "learning_rate": 5e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7232706546783447, + "num_tokens": 341440408.0, + "step": 13193 + }, + { + "epoch": 1.4489347682846474, + "grad_norm": 1.832638144493103, + "learning_rate": 5e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7170469760894775, + "num_tokens": 341471321.0, + "step": 13194 + }, + { + "epoch": 1.4490445859872612, + "grad_norm": 1.8714760541915894, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.722883939743042, + "num_tokens": 341500443.0, + "step": 13195 + }, + { + "epoch": 1.449154403689875, + "grad_norm": 1.6776341199874878, + "learning_rate": 5e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7126737236976624, + "num_tokens": 341534255.0, + "step": 13196 + }, + { + "epoch": 1.4492642213924884, + "grad_norm": 1.8197425603866577, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.751457691192627, + "num_tokens": 341558298.0, + "step": 13197 + }, + { + "epoch": 1.4493740390951022, + "grad_norm": 1.789231538772583, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7195261716842651, + "num_tokens": 341585410.0, + "step": 13198 + }, + { + "epoch": 1.4494838567977157, + "grad_norm": 1.967626929283142, + "learning_rate": 5e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7233490943908691, + "num_tokens": 341609435.0, + "step": 13199 + }, + { + "epoch": 1.4495936745003295, + "grad_norm": 1.934513807296753, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7213195562362671, + "num_tokens": 341635429.0, + "step": 13200 + }, + { + "epoch": 1.449703492202943, + "grad_norm": 1.6181484460830688, + "learning_rate": 5e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7248404026031494, + "num_tokens": 341669536.0, + "step": 13201 + }, + { + "epoch": 1.4498133099055568, + "grad_norm": 1.8437938690185547, + "learning_rate": 5e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7400914430618286, + "num_tokens": 341693115.0, + "step": 13202 + }, + { + "epoch": 1.4499231276081703, + "grad_norm": 1.9210935831069946, + "learning_rate": 5e-06, + "loss": 0.7905, + "mean_token_accuracy": 0.7460814118385315, + "num_tokens": 341715937.0, + "step": 13203 + }, + { + "epoch": 1.450032945310784, + "grad_norm": 1.9710251092910767, + "learning_rate": 5e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7084230184555054, + "num_tokens": 341742070.0, + "step": 13204 + }, + { + "epoch": 1.4501427630133978, + "grad_norm": 1.7633392810821533, + "learning_rate": 5e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7363979816436768, + "num_tokens": 341769442.0, + "step": 13205 + }, + { + "epoch": 1.4502525807160114, + "grad_norm": 1.9226082563400269, + "learning_rate": 5e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7326220870018005, + "num_tokens": 341794885.0, + "step": 13206 + }, + { + "epoch": 1.4503623984186251, + "grad_norm": 1.9895873069763184, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7407088279724121, + "num_tokens": 341818052.0, + "step": 13207 + }, + { + "epoch": 1.4504722161212387, + "grad_norm": 2.037318468093872, + "learning_rate": 5e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7437325716018677, + "num_tokens": 341840450.0, + "step": 13208 + }, + { + "epoch": 1.4505820338238524, + "grad_norm": 1.7493144273757935, + "learning_rate": 5e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7281060218811035, + "num_tokens": 341869973.0, + "step": 13209 + }, + { + "epoch": 1.4506918515264662, + "grad_norm": 2.0880966186523438, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7321010828018188, + "num_tokens": 341889940.0, + "step": 13210 + }, + { + "epoch": 1.4508016692290797, + "grad_norm": 1.9229446649551392, + "learning_rate": 5e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7333694696426392, + "num_tokens": 341913488.0, + "step": 13211 + }, + { + "epoch": 1.4509114869316933, + "grad_norm": 1.8430523872375488, + "learning_rate": 5e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7471706867218018, + "num_tokens": 341937972.0, + "step": 13212 + }, + { + "epoch": 1.451021304634307, + "grad_norm": 1.8202179670333862, + "learning_rate": 5e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7068610787391663, + "num_tokens": 341964679.0, + "step": 13213 + }, + { + "epoch": 1.4511311223369208, + "grad_norm": 1.7311547994613647, + "learning_rate": 5e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.741046130657196, + "num_tokens": 341992721.0, + "step": 13214 + }, + { + "epoch": 1.4512409400395343, + "grad_norm": 1.8042502403259277, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7421399354934692, + "num_tokens": 342017689.0, + "step": 13215 + }, + { + "epoch": 1.451350757742148, + "grad_norm": 1.6201543807983398, + "learning_rate": 5e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7405051589012146, + "num_tokens": 342048472.0, + "step": 13216 + }, + { + "epoch": 1.4514605754447616, + "grad_norm": 1.925517201423645, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7357480525970459, + "num_tokens": 342073298.0, + "step": 13217 + }, + { + "epoch": 1.4515703931473753, + "grad_norm": 1.865521788597107, + "learning_rate": 5e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7163065075874329, + "num_tokens": 342101210.0, + "step": 13218 + }, + { + "epoch": 1.451680210849989, + "grad_norm": 1.8732547760009766, + "learning_rate": 5e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.7709124088287354, + "num_tokens": 342123774.0, + "step": 13219 + }, + { + "epoch": 1.4517900285526026, + "grad_norm": 1.9868212938308716, + "learning_rate": 5e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7354953289031982, + "num_tokens": 342149263.0, + "step": 13220 + }, + { + "epoch": 1.4518998462552164, + "grad_norm": 1.7120271921157837, + "learning_rate": 5e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7279754877090454, + "num_tokens": 342177469.0, + "step": 13221 + }, + { + "epoch": 1.45200966395783, + "grad_norm": 1.8485338687896729, + "learning_rate": 5e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7301138639450073, + "num_tokens": 342205502.0, + "step": 13222 + }, + { + "epoch": 1.4521194816604437, + "grad_norm": 1.8643357753753662, + "learning_rate": 5e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7393790483474731, + "num_tokens": 342231929.0, + "step": 13223 + }, + { + "epoch": 1.4522292993630574, + "grad_norm": 1.7171173095703125, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7321094274520874, + "num_tokens": 342263965.0, + "step": 13224 + }, + { + "epoch": 1.452339117065671, + "grad_norm": 1.8325417041778564, + "learning_rate": 5e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.747229278087616, + "num_tokens": 342288003.0, + "step": 13225 + }, + { + "epoch": 1.4524489347682845, + "grad_norm": 1.8387937545776367, + "learning_rate": 5e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.6963438987731934, + "num_tokens": 342315490.0, + "step": 13226 + }, + { + "epoch": 1.4525587524708983, + "grad_norm": 1.8091884851455688, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7193670868873596, + "num_tokens": 342342016.0, + "step": 13227 + }, + { + "epoch": 1.452668570173512, + "grad_norm": 1.8510996103286743, + "learning_rate": 5e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7233024835586548, + "num_tokens": 342368515.0, + "step": 13228 + }, + { + "epoch": 1.4527783878761256, + "grad_norm": 1.8090815544128418, + "learning_rate": 5e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.756179928779602, + "num_tokens": 342392758.0, + "step": 13229 + }, + { + "epoch": 1.4528882055787393, + "grad_norm": 1.9440189599990845, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7240549325942993, + "num_tokens": 342416190.0, + "step": 13230 + }, + { + "epoch": 1.4529980232813529, + "grad_norm": 1.8906654119491577, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7338476777076721, + "num_tokens": 342440452.0, + "step": 13231 + }, + { + "epoch": 1.4531078409839666, + "grad_norm": 1.7048758268356323, + "learning_rate": 5e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7213069200515747, + "num_tokens": 342468343.0, + "step": 13232 + }, + { + "epoch": 1.4532176586865804, + "grad_norm": 1.817090630531311, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.764485239982605, + "num_tokens": 342492280.0, + "step": 13233 + }, + { + "epoch": 1.453327476389194, + "grad_norm": 1.9412287473678589, + "learning_rate": 5e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7445642352104187, + "num_tokens": 342516075.0, + "step": 13234 + }, + { + "epoch": 1.4534372940918077, + "grad_norm": 1.8442647457122803, + "learning_rate": 5e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7261586785316467, + "num_tokens": 342546307.0, + "step": 13235 + }, + { + "epoch": 1.4535471117944212, + "grad_norm": 2.082200288772583, + "learning_rate": 5e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7200003862380981, + "num_tokens": 342569387.0, + "step": 13236 + }, + { + "epoch": 1.453656929497035, + "grad_norm": 1.5835899114608765, + "learning_rate": 5e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7294596433639526, + "num_tokens": 342601337.0, + "step": 13237 + }, + { + "epoch": 1.4537667471996487, + "grad_norm": 1.682715892791748, + "learning_rate": 5e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7392100095748901, + "num_tokens": 342629437.0, + "step": 13238 + }, + { + "epoch": 1.4538765649022622, + "grad_norm": 1.8143432140350342, + "learning_rate": 5e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7147806286811829, + "num_tokens": 342653939.0, + "step": 13239 + }, + { + "epoch": 1.4539863826048758, + "grad_norm": 1.6663596630096436, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.724331259727478, + "num_tokens": 342684220.0, + "step": 13240 + }, + { + "epoch": 1.4540962003074895, + "grad_norm": 1.81267511844635, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7370946407318115, + "num_tokens": 342709322.0, + "step": 13241 + }, + { + "epoch": 1.4542060180101033, + "grad_norm": 1.825081467628479, + "learning_rate": 5e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.743973433971405, + "num_tokens": 342733143.0, + "step": 13242 + }, + { + "epoch": 1.4543158357127168, + "grad_norm": 1.942495584487915, + "learning_rate": 5e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7351651191711426, + "num_tokens": 342757357.0, + "step": 13243 + }, + { + "epoch": 1.4544256534153306, + "grad_norm": 1.9461475610733032, + "learning_rate": 5e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.6977498531341553, + "num_tokens": 342783241.0, + "step": 13244 + }, + { + "epoch": 1.4545354711179441, + "grad_norm": 1.988737940788269, + "learning_rate": 5e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7492586374282837, + "num_tokens": 342807024.0, + "step": 13245 + }, + { + "epoch": 1.4546452888205579, + "grad_norm": 1.9380757808685303, + "learning_rate": 5e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.743309497833252, + "num_tokens": 342829673.0, + "step": 13246 + }, + { + "epoch": 1.4547551065231716, + "grad_norm": 1.9291034936904907, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7373490333557129, + "num_tokens": 342852602.0, + "step": 13247 + }, + { + "epoch": 1.4548649242257852, + "grad_norm": 1.8098267316818237, + "learning_rate": 5e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.696499228477478, + "num_tokens": 342881324.0, + "step": 13248 + }, + { + "epoch": 1.454974741928399, + "grad_norm": 1.7894045114517212, + "learning_rate": 5e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7388444542884827, + "num_tokens": 342907132.0, + "step": 13249 + }, + { + "epoch": 1.4550845596310125, + "grad_norm": 1.7850970029830933, + "learning_rate": 5e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7332306504249573, + "num_tokens": 342933839.0, + "step": 13250 + }, + { + "epoch": 1.4551943773336262, + "grad_norm": 2.2267415523529053, + "learning_rate": 5e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7300931215286255, + "num_tokens": 342952609.0, + "step": 13251 + }, + { + "epoch": 1.45530419503624, + "grad_norm": 1.7337368726730347, + "learning_rate": 5e-06, + "loss": 0.9, + "mean_token_accuracy": 0.715977132320404, + "num_tokens": 342979628.0, + "step": 13252 + }, + { + "epoch": 1.4554140127388535, + "grad_norm": 1.9089184999465942, + "learning_rate": 5e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7450722455978394, + "num_tokens": 343004197.0, + "step": 13253 + }, + { + "epoch": 1.455523830441467, + "grad_norm": 1.7518231868743896, + "learning_rate": 5e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7259178161621094, + "num_tokens": 343033238.0, + "step": 13254 + }, + { + "epoch": 1.4556336481440808, + "grad_norm": 2.0430383682250977, + "learning_rate": 5e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7491013407707214, + "num_tokens": 343052665.0, + "step": 13255 + }, + { + "epoch": 1.4557434658466946, + "grad_norm": 1.853554368019104, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7356510162353516, + "num_tokens": 343075216.0, + "step": 13256 + }, + { + "epoch": 1.455853283549308, + "grad_norm": 1.6212741136550903, + "learning_rate": 5e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7192099690437317, + "num_tokens": 343107355.0, + "step": 13257 + }, + { + "epoch": 1.4559631012519219, + "grad_norm": 1.8927035331726074, + "learning_rate": 5e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7212181091308594, + "num_tokens": 343136200.0, + "step": 13258 + }, + { + "epoch": 1.4560729189545354, + "grad_norm": 1.9634231328964233, + "learning_rate": 5e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7420632839202881, + "num_tokens": 343158409.0, + "step": 13259 + }, + { + "epoch": 1.4561827366571491, + "grad_norm": 1.9440609216690063, + "learning_rate": 5e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7513886094093323, + "num_tokens": 343182541.0, + "step": 13260 + }, + { + "epoch": 1.456292554359763, + "grad_norm": 1.9997479915618896, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7284987568855286, + "num_tokens": 343205400.0, + "step": 13261 + }, + { + "epoch": 1.4564023720623764, + "grad_norm": 1.7722114324569702, + "learning_rate": 5e-06, + "loss": 0.849, + "mean_token_accuracy": 0.729805588722229, + "num_tokens": 343231746.0, + "step": 13262 + }, + { + "epoch": 1.4565121897649902, + "grad_norm": 1.8696950674057007, + "learning_rate": 5e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7443992495536804, + "num_tokens": 343256080.0, + "step": 13263 + }, + { + "epoch": 1.4566220074676037, + "grad_norm": 1.7642827033996582, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7327914237976074, + "num_tokens": 343283883.0, + "step": 13264 + }, + { + "epoch": 1.4567318251702175, + "grad_norm": 1.8849769830703735, + "learning_rate": 5e-06, + "loss": 0.843, + "mean_token_accuracy": 0.735489010810852, + "num_tokens": 343309120.0, + "step": 13265 + }, + { + "epoch": 1.456841642872831, + "grad_norm": 1.8010355234146118, + "learning_rate": 5e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7569979429244995, + "num_tokens": 343334102.0, + "step": 13266 + }, + { + "epoch": 1.4569514605754448, + "grad_norm": 2.1987576484680176, + "learning_rate": 5e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.744875431060791, + "num_tokens": 343354205.0, + "step": 13267 + }, + { + "epoch": 1.4570612782780583, + "grad_norm": 1.8061143159866333, + "learning_rate": 5e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7278822064399719, + "num_tokens": 343381159.0, + "step": 13268 + }, + { + "epoch": 1.457171095980672, + "grad_norm": 1.8907029628753662, + "learning_rate": 5e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.720355749130249, + "num_tokens": 343406832.0, + "step": 13269 + }, + { + "epoch": 1.4572809136832858, + "grad_norm": 1.8737655878067017, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7292964458465576, + "num_tokens": 343430576.0, + "step": 13270 + }, + { + "epoch": 1.4573907313858994, + "grad_norm": 1.715368628501892, + "learning_rate": 5e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7027785778045654, + "num_tokens": 343457655.0, + "step": 13271 + }, + { + "epoch": 1.4575005490885131, + "grad_norm": 1.6905862092971802, + "learning_rate": 5e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7245871424674988, + "num_tokens": 343484902.0, + "step": 13272 + }, + { + "epoch": 1.4576103667911267, + "grad_norm": 1.5545631647109985, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7358502149581909, + "num_tokens": 343516869.0, + "step": 13273 + }, + { + "epoch": 1.4577201844937404, + "grad_norm": 2.1372392177581787, + "learning_rate": 5e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7273939847946167, + "num_tokens": 343538770.0, + "step": 13274 + }, + { + "epoch": 1.4578300021963542, + "grad_norm": 1.8900407552719116, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7436914443969727, + "num_tokens": 343562284.0, + "step": 13275 + }, + { + "epoch": 1.4579398198989677, + "grad_norm": 1.7720088958740234, + "learning_rate": 5e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7235146760940552, + "num_tokens": 343589387.0, + "step": 13276 + }, + { + "epoch": 1.4580496376015812, + "grad_norm": 1.984147548675537, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7421761751174927, + "num_tokens": 343611486.0, + "step": 13277 + }, + { + "epoch": 1.458159455304195, + "grad_norm": 1.8239446878433228, + "learning_rate": 5e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7127120494842529, + "num_tokens": 343638867.0, + "step": 13278 + }, + { + "epoch": 1.4582692730068088, + "grad_norm": 1.754502296447754, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7309987545013428, + "num_tokens": 343667443.0, + "step": 13279 + }, + { + "epoch": 1.4583790907094223, + "grad_norm": 1.8609285354614258, + "learning_rate": 5e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7317923903465271, + "num_tokens": 343692642.0, + "step": 13280 + }, + { + "epoch": 1.458488908412036, + "grad_norm": 1.7958447933197021, + "learning_rate": 5e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7112325429916382, + "num_tokens": 343719556.0, + "step": 13281 + }, + { + "epoch": 1.4585987261146496, + "grad_norm": 1.8614444732666016, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7246086001396179, + "num_tokens": 343744988.0, + "step": 13282 + }, + { + "epoch": 1.4587085438172633, + "grad_norm": 1.8435755968093872, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7264687418937683, + "num_tokens": 343771677.0, + "step": 13283 + }, + { + "epoch": 1.458818361519877, + "grad_norm": 1.7552785873413086, + "learning_rate": 5e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.719366729259491, + "num_tokens": 343799094.0, + "step": 13284 + }, + { + "epoch": 1.4589281792224906, + "grad_norm": 2.5246994495391846, + "learning_rate": 5e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7584031820297241, + "num_tokens": 343816042.0, + "step": 13285 + }, + { + "epoch": 1.4590379969251044, + "grad_norm": 1.6855156421661377, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7315912246704102, + "num_tokens": 343846022.0, + "step": 13286 + }, + { + "epoch": 1.459147814627718, + "grad_norm": 1.7229119539260864, + "learning_rate": 5e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7508541345596313, + "num_tokens": 343872776.0, + "step": 13287 + }, + { + "epoch": 1.4592576323303317, + "grad_norm": 2.0682220458984375, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7394915223121643, + "num_tokens": 343894728.0, + "step": 13288 + }, + { + "epoch": 1.4593674500329454, + "grad_norm": 2.0083022117614746, + "learning_rate": 5e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7449615597724915, + "num_tokens": 343917005.0, + "step": 13289 + }, + { + "epoch": 1.459477267735559, + "grad_norm": 2.0198898315429688, + "learning_rate": 5e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7130239009857178, + "num_tokens": 343941739.0, + "step": 13290 + }, + { + "epoch": 1.4595870854381725, + "grad_norm": 1.8485074043273926, + "learning_rate": 5e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7220842838287354, + "num_tokens": 343970054.0, + "step": 13291 + }, + { + "epoch": 1.4596969031407863, + "grad_norm": 1.6990857124328613, + "learning_rate": 5e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7640112638473511, + "num_tokens": 343996185.0, + "step": 13292 + }, + { + "epoch": 1.4598067208434, + "grad_norm": 1.850439429283142, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7455683946609497, + "num_tokens": 344021826.0, + "step": 13293 + }, + { + "epoch": 1.4599165385460136, + "grad_norm": 1.900708556175232, + "learning_rate": 5e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7377767562866211, + "num_tokens": 344046684.0, + "step": 13294 + }, + { + "epoch": 1.4600263562486273, + "grad_norm": 2.170876979827881, + "learning_rate": 5e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7351178526878357, + "num_tokens": 344065828.0, + "step": 13295 + }, + { + "epoch": 1.4601361739512408, + "grad_norm": 2.207723617553711, + "learning_rate": 5e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7335963249206543, + "num_tokens": 344087612.0, + "step": 13296 + }, + { + "epoch": 1.4602459916538546, + "grad_norm": 1.6109254360198975, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7207421064376831, + "num_tokens": 344123679.0, + "step": 13297 + }, + { + "epoch": 1.4603558093564684, + "grad_norm": 1.8424690961837769, + "learning_rate": 5e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7426473498344421, + "num_tokens": 344148792.0, + "step": 13298 + }, + { + "epoch": 1.460465627059082, + "grad_norm": 1.8171770572662354, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7359808683395386, + "num_tokens": 344174396.0, + "step": 13299 + }, + { + "epoch": 1.4605754447616957, + "grad_norm": 1.6330314874649048, + "learning_rate": 5e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7455140948295593, + "num_tokens": 344203449.0, + "step": 13300 + }, + { + "epoch": 1.4606852624643092, + "grad_norm": 1.6858114004135132, + "learning_rate": 5e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7328171730041504, + "num_tokens": 344231156.0, + "step": 13301 + }, + { + "epoch": 1.460795080166923, + "grad_norm": 1.9420956373214722, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7302727103233337, + "num_tokens": 344253823.0, + "step": 13302 + }, + { + "epoch": 1.4609048978695367, + "grad_norm": 1.7590811252593994, + "learning_rate": 5e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7182673811912537, + "num_tokens": 344284373.0, + "step": 13303 + }, + { + "epoch": 1.4610147155721502, + "grad_norm": 2.015725612640381, + "learning_rate": 5e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7359432578086853, + "num_tokens": 344309812.0, + "step": 13304 + }, + { + "epoch": 1.4611245332747638, + "grad_norm": 1.9046893119812012, + "learning_rate": 5e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7226464748382568, + "num_tokens": 344337711.0, + "step": 13305 + }, + { + "epoch": 1.4612343509773775, + "grad_norm": 1.8715667724609375, + "learning_rate": 5e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.712386965751648, + "num_tokens": 344364934.0, + "step": 13306 + }, + { + "epoch": 1.4613441686799913, + "grad_norm": 1.7368242740631104, + "learning_rate": 5e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7229032516479492, + "num_tokens": 344394489.0, + "step": 13307 + }, + { + "epoch": 1.4614539863826048, + "grad_norm": 1.9157804250717163, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7145838737487793, + "num_tokens": 344421805.0, + "step": 13308 + }, + { + "epoch": 1.4615638040852186, + "grad_norm": 2.179203987121582, + "learning_rate": 5e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7069091796875, + "num_tokens": 344445768.0, + "step": 13309 + }, + { + "epoch": 1.4616736217878321, + "grad_norm": 1.819105863571167, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7578023672103882, + "num_tokens": 344468321.0, + "step": 13310 + }, + { + "epoch": 1.4617834394904459, + "grad_norm": 2.143232583999634, + "learning_rate": 5e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7401614189147949, + "num_tokens": 344489173.0, + "step": 13311 + }, + { + "epoch": 1.4618932571930596, + "grad_norm": 1.8320561647415161, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7305687665939331, + "num_tokens": 344516337.0, + "step": 13312 + }, + { + "epoch": 1.4620030748956732, + "grad_norm": 1.7653225660324097, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7291775941848755, + "num_tokens": 344544588.0, + "step": 13313 + }, + { + "epoch": 1.462112892598287, + "grad_norm": 1.8542553186416626, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7356019020080566, + "num_tokens": 344570950.0, + "step": 13314 + }, + { + "epoch": 1.4622227103009005, + "grad_norm": 2.175560474395752, + "learning_rate": 5e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7101832032203674, + "num_tokens": 344591185.0, + "step": 13315 + }, + { + "epoch": 1.4623325280035142, + "grad_norm": 1.9151912927627563, + "learning_rate": 5e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7412747740745544, + "num_tokens": 344618353.0, + "step": 13316 + }, + { + "epoch": 1.462442345706128, + "grad_norm": 1.9392033815383911, + "learning_rate": 5e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.71958327293396, + "num_tokens": 344643813.0, + "step": 13317 + }, + { + "epoch": 1.4625521634087415, + "grad_norm": 1.623360514640808, + "learning_rate": 5e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7541130781173706, + "num_tokens": 344675380.0, + "step": 13318 + }, + { + "epoch": 1.462661981111355, + "grad_norm": 1.9905976057052612, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7552061080932617, + "num_tokens": 344699284.0, + "step": 13319 + }, + { + "epoch": 1.4627717988139688, + "grad_norm": 1.724555492401123, + "learning_rate": 5e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7040094137191772, + "num_tokens": 344730968.0, + "step": 13320 + }, + { + "epoch": 1.4628816165165826, + "grad_norm": 2.0021753311157227, + "learning_rate": 5e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7302063703536987, + "num_tokens": 344753616.0, + "step": 13321 + }, + { + "epoch": 1.462991434219196, + "grad_norm": 1.6886109113693237, + "learning_rate": 5e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7247399687767029, + "num_tokens": 344782653.0, + "step": 13322 + }, + { + "epoch": 1.4631012519218098, + "grad_norm": 1.9536676406860352, + "learning_rate": 5e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7129586935043335, + "num_tokens": 344807778.0, + "step": 13323 + }, + { + "epoch": 1.4632110696244234, + "grad_norm": 1.835889458656311, + "learning_rate": 5e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7094587087631226, + "num_tokens": 344836409.0, + "step": 13324 + }, + { + "epoch": 1.4633208873270371, + "grad_norm": 1.958733081817627, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7282087802886963, + "num_tokens": 344860374.0, + "step": 13325 + }, + { + "epoch": 1.463430705029651, + "grad_norm": 1.917194128036499, + "learning_rate": 5e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7418628931045532, + "num_tokens": 344883644.0, + "step": 13326 + }, + { + "epoch": 1.4635405227322644, + "grad_norm": 1.921127200126648, + "learning_rate": 5e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7106542587280273, + "num_tokens": 344908617.0, + "step": 13327 + }, + { + "epoch": 1.4636503404348782, + "grad_norm": 1.7604084014892578, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.731804609298706, + "num_tokens": 344935155.0, + "step": 13328 + }, + { + "epoch": 1.4637601581374917, + "grad_norm": 1.7106600999832153, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.737452507019043, + "num_tokens": 344964504.0, + "step": 13329 + }, + { + "epoch": 1.4638699758401055, + "grad_norm": 1.8524218797683716, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7320506572723389, + "num_tokens": 344990294.0, + "step": 13330 + }, + { + "epoch": 1.463979793542719, + "grad_norm": 2.0239381790161133, + "learning_rate": 5e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7363485097885132, + "num_tokens": 345011701.0, + "step": 13331 + }, + { + "epoch": 1.4640896112453328, + "grad_norm": 1.8169279098510742, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.763414740562439, + "num_tokens": 345035366.0, + "step": 13332 + }, + { + "epoch": 1.4641994289479463, + "grad_norm": 1.965537667274475, + "learning_rate": 5e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7174437046051025, + "num_tokens": 345057963.0, + "step": 13333 + }, + { + "epoch": 1.46430924665056, + "grad_norm": 1.9216142892837524, + "learning_rate": 5e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7388166189193726, + "num_tokens": 345082081.0, + "step": 13334 + }, + { + "epoch": 1.4644190643531738, + "grad_norm": 1.6735700368881226, + "learning_rate": 5e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.755246639251709, + "num_tokens": 345110497.0, + "step": 13335 + }, + { + "epoch": 1.4645288820557874, + "grad_norm": 2.2579164505004883, + "learning_rate": 5e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.7453179359436035, + "num_tokens": 345129020.0, + "step": 13336 + }, + { + "epoch": 1.464638699758401, + "grad_norm": 1.9619779586791992, + "learning_rate": 5e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7293046712875366, + "num_tokens": 345153794.0, + "step": 13337 + }, + { + "epoch": 1.4647485174610146, + "grad_norm": 1.746224045753479, + "learning_rate": 5e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7306409478187561, + "num_tokens": 345180471.0, + "step": 13338 + }, + { + "epoch": 1.4648583351636284, + "grad_norm": 1.8104283809661865, + "learning_rate": 5e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7558075785636902, + "num_tokens": 345205110.0, + "step": 13339 + }, + { + "epoch": 1.4649681528662422, + "grad_norm": 1.7950352430343628, + "learning_rate": 5e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.7432864904403687, + "num_tokens": 345231785.0, + "step": 13340 + }, + { + "epoch": 1.4650779705688557, + "grad_norm": 2.027390480041504, + "learning_rate": 5e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7134056091308594, + "num_tokens": 345254224.0, + "step": 13341 + }, + { + "epoch": 1.4651877882714692, + "grad_norm": 1.952286720275879, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7434784770011902, + "num_tokens": 345275858.0, + "step": 13342 + }, + { + "epoch": 1.465297605974083, + "grad_norm": 1.7773200273513794, + "learning_rate": 5e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7046188116073608, + "num_tokens": 345305646.0, + "step": 13343 + }, + { + "epoch": 1.4654074236766967, + "grad_norm": 1.9197041988372803, + "learning_rate": 5e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7223138809204102, + "num_tokens": 345332029.0, + "step": 13344 + }, + { + "epoch": 1.4655172413793103, + "grad_norm": 1.9790257215499878, + "learning_rate": 5e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.721434473991394, + "num_tokens": 345354431.0, + "step": 13345 + }, + { + "epoch": 1.465627059081924, + "grad_norm": 1.6066317558288574, + "learning_rate": 5e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7515733242034912, + "num_tokens": 345386002.0, + "step": 13346 + }, + { + "epoch": 1.4657368767845376, + "grad_norm": 1.995704174041748, + "learning_rate": 5e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7513324618339539, + "num_tokens": 345409670.0, + "step": 13347 + }, + { + "epoch": 1.4658466944871513, + "grad_norm": 1.8498666286468506, + "learning_rate": 5e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7239426970481873, + "num_tokens": 345433393.0, + "step": 13348 + }, + { + "epoch": 1.465956512189765, + "grad_norm": 1.8781503438949585, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7329092621803284, + "num_tokens": 345457612.0, + "step": 13349 + }, + { + "epoch": 1.4660663298923786, + "grad_norm": 1.6222184896469116, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7199623584747314, + "num_tokens": 345487959.0, + "step": 13350 + }, + { + "epoch": 1.4661761475949924, + "grad_norm": 1.8885407447814941, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7207043170928955, + "num_tokens": 345514871.0, + "step": 13351 + }, + { + "epoch": 1.466285965297606, + "grad_norm": 1.7422304153442383, + "learning_rate": 5e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7291327714920044, + "num_tokens": 345544115.0, + "step": 13352 + }, + { + "epoch": 1.4663957830002197, + "grad_norm": 1.8153252601623535, + "learning_rate": 5e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7557158470153809, + "num_tokens": 345569309.0, + "step": 13353 + }, + { + "epoch": 1.4665056007028334, + "grad_norm": 1.9325435161590576, + "learning_rate": 5e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7318203449249268, + "num_tokens": 345595216.0, + "step": 13354 + }, + { + "epoch": 1.466615418405447, + "grad_norm": 1.7688987255096436, + "learning_rate": 5e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7133667469024658, + "num_tokens": 345624865.0, + "step": 13355 + }, + { + "epoch": 1.4667252361080605, + "grad_norm": 1.633927583694458, + "learning_rate": 5e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7223232984542847, + "num_tokens": 345656222.0, + "step": 13356 + }, + { + "epoch": 1.4668350538106742, + "grad_norm": 1.8476828336715698, + "learning_rate": 5e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7431576251983643, + "num_tokens": 345681351.0, + "step": 13357 + }, + { + "epoch": 1.466944871513288, + "grad_norm": 1.9416486024856567, + "learning_rate": 5e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.746695876121521, + "num_tokens": 345704911.0, + "step": 13358 + }, + { + "epoch": 1.4670546892159015, + "grad_norm": 1.9076368808746338, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7231639623641968, + "num_tokens": 345730310.0, + "step": 13359 + }, + { + "epoch": 1.4671645069185153, + "grad_norm": 2.109412670135498, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7308802604675293, + "num_tokens": 345751194.0, + "step": 13360 + }, + { + "epoch": 1.4672743246211288, + "grad_norm": 1.9382145404815674, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7434394955635071, + "num_tokens": 345773637.0, + "step": 13361 + }, + { + "epoch": 1.4673841423237426, + "grad_norm": 1.8486624956130981, + "learning_rate": 5e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7197278738021851, + "num_tokens": 345800974.0, + "step": 13362 + }, + { + "epoch": 1.4674939600263563, + "grad_norm": 1.9898760318756104, + "learning_rate": 5e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7302203178405762, + "num_tokens": 345823152.0, + "step": 13363 + }, + { + "epoch": 1.4676037777289699, + "grad_norm": 1.9340780973434448, + "learning_rate": 5e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7343981862068176, + "num_tokens": 345848869.0, + "step": 13364 + }, + { + "epoch": 1.4677135954315836, + "grad_norm": 1.881851077079773, + "learning_rate": 5e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7411141395568848, + "num_tokens": 345874051.0, + "step": 13365 + }, + { + "epoch": 1.4678234131341972, + "grad_norm": 2.0475664138793945, + "learning_rate": 5e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.7747435569763184, + "num_tokens": 345893400.0, + "step": 13366 + }, + { + "epoch": 1.467933230836811, + "grad_norm": 1.946048378944397, + "learning_rate": 5e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7446886301040649, + "num_tokens": 345917585.0, + "step": 13367 + }, + { + "epoch": 1.4680430485394247, + "grad_norm": 1.9699445962905884, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7330185174942017, + "num_tokens": 345940571.0, + "step": 13368 + }, + { + "epoch": 1.4681528662420382, + "grad_norm": 1.864681363105774, + "learning_rate": 5e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7297601699829102, + "num_tokens": 345965483.0, + "step": 13369 + }, + { + "epoch": 1.4682626839446518, + "grad_norm": 1.6943891048431396, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7265694737434387, + "num_tokens": 345994649.0, + "step": 13370 + }, + { + "epoch": 1.4683725016472655, + "grad_norm": 2.011248826980591, + "learning_rate": 5e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7296504974365234, + "num_tokens": 346018175.0, + "step": 13371 + }, + { + "epoch": 1.4684823193498793, + "grad_norm": 2.0483970642089844, + "learning_rate": 5e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7424218058586121, + "num_tokens": 346040368.0, + "step": 13372 + }, + { + "epoch": 1.4685921370524928, + "grad_norm": 2.0234317779541016, + "learning_rate": 5e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7450324892997742, + "num_tokens": 346061483.0, + "step": 13373 + }, + { + "epoch": 1.4687019547551066, + "grad_norm": 1.6841206550598145, + "learning_rate": 5e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.730978786945343, + "num_tokens": 346093102.0, + "step": 13374 + }, + { + "epoch": 1.46881177245772, + "grad_norm": 1.7833926677703857, + "learning_rate": 5e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7211910486221313, + "num_tokens": 346122487.0, + "step": 13375 + }, + { + "epoch": 1.4689215901603339, + "grad_norm": 1.8401923179626465, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7163013219833374, + "num_tokens": 346150125.0, + "step": 13376 + }, + { + "epoch": 1.4690314078629476, + "grad_norm": 1.923214316368103, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7131137847900391, + "num_tokens": 346175104.0, + "step": 13377 + }, + { + "epoch": 1.4691412255655611, + "grad_norm": 1.7255730628967285, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7243567705154419, + "num_tokens": 346204230.0, + "step": 13378 + }, + { + "epoch": 1.469251043268175, + "grad_norm": 1.621762990951538, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.724847674369812, + "num_tokens": 346234936.0, + "step": 13379 + }, + { + "epoch": 1.4693608609707884, + "grad_norm": 1.940220832824707, + "learning_rate": 5e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7064522504806519, + "num_tokens": 346259417.0, + "step": 13380 + }, + { + "epoch": 1.4694706786734022, + "grad_norm": 1.5718497037887573, + "learning_rate": 5e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7082366943359375, + "num_tokens": 346294892.0, + "step": 13381 + }, + { + "epoch": 1.4695804963760157, + "grad_norm": 1.8286609649658203, + "learning_rate": 5e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7745950818061829, + "num_tokens": 346316999.0, + "step": 13382 + }, + { + "epoch": 1.4696903140786295, + "grad_norm": 1.7444018125534058, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7313141226768494, + "num_tokens": 346347272.0, + "step": 13383 + }, + { + "epoch": 1.469800131781243, + "grad_norm": 1.7170932292938232, + "learning_rate": 5e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7407997846603394, + "num_tokens": 346376045.0, + "step": 13384 + }, + { + "epoch": 1.4699099494838568, + "grad_norm": 2.127861738204956, + "learning_rate": 5e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7307133078575134, + "num_tokens": 346396174.0, + "step": 13385 + }, + { + "epoch": 1.4700197671864705, + "grad_norm": 1.87269926071167, + "learning_rate": 5e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7172213792800903, + "num_tokens": 346423544.0, + "step": 13386 + }, + { + "epoch": 1.470129584889084, + "grad_norm": 1.9425569772720337, + "learning_rate": 5e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7305843830108643, + "num_tokens": 346446934.0, + "step": 13387 + }, + { + "epoch": 1.4702394025916978, + "grad_norm": 1.892208456993103, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.711097240447998, + "num_tokens": 346475252.0, + "step": 13388 + }, + { + "epoch": 1.4703492202943114, + "grad_norm": 2.094428777694702, + "learning_rate": 5e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7003308534622192, + "num_tokens": 346500445.0, + "step": 13389 + }, + { + "epoch": 1.4704590379969251, + "grad_norm": 1.8304874897003174, + "learning_rate": 5e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7132246494293213, + "num_tokens": 346527241.0, + "step": 13390 + }, + { + "epoch": 1.4705688556995389, + "grad_norm": 1.7178058624267578, + "learning_rate": 5e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7282707095146179, + "num_tokens": 346558750.0, + "step": 13391 + }, + { + "epoch": 1.4706786734021524, + "grad_norm": 2.0658891201019287, + "learning_rate": 5e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7364580631256104, + "num_tokens": 346583150.0, + "step": 13392 + }, + { + "epoch": 1.4707884911047662, + "grad_norm": 1.7171225547790527, + "learning_rate": 5e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7203992605209351, + "num_tokens": 346615730.0, + "step": 13393 + }, + { + "epoch": 1.4708983088073797, + "grad_norm": 1.7182118892669678, + "learning_rate": 5e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6962441205978394, + "num_tokens": 346646515.0, + "step": 13394 + }, + { + "epoch": 1.4710081265099935, + "grad_norm": 1.8162460327148438, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7373033761978149, + "num_tokens": 346672068.0, + "step": 13395 + }, + { + "epoch": 1.471117944212607, + "grad_norm": 2.0144546031951904, + "learning_rate": 5e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7591531276702881, + "num_tokens": 346691547.0, + "step": 13396 + }, + { + "epoch": 1.4712277619152208, + "grad_norm": 1.9809722900390625, + "learning_rate": 5e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7436623573303223, + "num_tokens": 346713816.0, + "step": 13397 + }, + { + "epoch": 1.4713375796178343, + "grad_norm": 1.9064645767211914, + "learning_rate": 5e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7605734467506409, + "num_tokens": 346738084.0, + "step": 13398 + }, + { + "epoch": 1.471447397320448, + "grad_norm": 1.7492295503616333, + "learning_rate": 5e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7381722927093506, + "num_tokens": 346765920.0, + "step": 13399 + }, + { + "epoch": 1.4715572150230618, + "grad_norm": 1.8978302478790283, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7205214500427246, + "num_tokens": 346791883.0, + "step": 13400 + }, + { + "epoch": 1.4716670327256753, + "grad_norm": 1.7820147275924683, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7451418042182922, + "num_tokens": 346818859.0, + "step": 13401 + }, + { + "epoch": 1.471776850428289, + "grad_norm": 1.7730251550674438, + "learning_rate": 5e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7224152684211731, + "num_tokens": 346850638.0, + "step": 13402 + }, + { + "epoch": 1.4718866681309026, + "grad_norm": 1.6284085512161255, + "learning_rate": 5e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7175101637840271, + "num_tokens": 346883185.0, + "step": 13403 + }, + { + "epoch": 1.4719964858335164, + "grad_norm": 1.8715778589248657, + "learning_rate": 5e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7557758092880249, + "num_tokens": 346905556.0, + "step": 13404 + }, + { + "epoch": 1.4721063035361301, + "grad_norm": 1.759690284729004, + "learning_rate": 5e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7120989561080933, + "num_tokens": 346933223.0, + "step": 13405 + }, + { + "epoch": 1.4722161212387437, + "grad_norm": 1.7937835454940796, + "learning_rate": 5e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7007102966308594, + "num_tokens": 346965046.0, + "step": 13406 + }, + { + "epoch": 1.4723259389413572, + "grad_norm": 1.7015626430511475, + "learning_rate": 5e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7109786868095398, + "num_tokens": 346995592.0, + "step": 13407 + }, + { + "epoch": 1.472435756643971, + "grad_norm": 1.8912285566329956, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7323719263076782, + "num_tokens": 347019448.0, + "step": 13408 + }, + { + "epoch": 1.4725455743465847, + "grad_norm": 1.784487247467041, + "learning_rate": 5e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7353408336639404, + "num_tokens": 347045110.0, + "step": 13409 + }, + { + "epoch": 1.4726553920491983, + "grad_norm": 1.9262526035308838, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7608724236488342, + "num_tokens": 347065398.0, + "step": 13410 + }, + { + "epoch": 1.472765209751812, + "grad_norm": 1.8723267316818237, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7352864742279053, + "num_tokens": 347089182.0, + "step": 13411 + }, + { + "epoch": 1.4728750274544256, + "grad_norm": 1.607141137123108, + "learning_rate": 5e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.724891185760498, + "num_tokens": 347123763.0, + "step": 13412 + }, + { + "epoch": 1.4729848451570393, + "grad_norm": 1.779779314994812, + "learning_rate": 5e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7345303297042847, + "num_tokens": 347149429.0, + "step": 13413 + }, + { + "epoch": 1.473094662859653, + "grad_norm": 1.766492486000061, + "learning_rate": 5e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7285594940185547, + "num_tokens": 347177405.0, + "step": 13414 + }, + { + "epoch": 1.4732044805622666, + "grad_norm": 2.0649681091308594, + "learning_rate": 5e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7368330359458923, + "num_tokens": 347199135.0, + "step": 13415 + }, + { + "epoch": 1.4733142982648804, + "grad_norm": 1.7273519039154053, + "learning_rate": 5e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7132827043533325, + "num_tokens": 347228243.0, + "step": 13416 + }, + { + "epoch": 1.473424115967494, + "grad_norm": 1.6387802362442017, + "learning_rate": 5e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7242282032966614, + "num_tokens": 347258690.0, + "step": 13417 + }, + { + "epoch": 1.4735339336701077, + "grad_norm": 1.8849823474884033, + "learning_rate": 5e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7514150142669678, + "num_tokens": 347281533.0, + "step": 13418 + }, + { + "epoch": 1.4736437513727214, + "grad_norm": 1.9596670866012573, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7288839817047119, + "num_tokens": 347305062.0, + "step": 13419 + }, + { + "epoch": 1.473753569075335, + "grad_norm": 1.9563099145889282, + "learning_rate": 5e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7237453460693359, + "num_tokens": 347330735.0, + "step": 13420 + }, + { + "epoch": 1.4738633867779485, + "grad_norm": 1.9203929901123047, + "learning_rate": 5e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7474048137664795, + "num_tokens": 347354455.0, + "step": 13421 + }, + { + "epoch": 1.4739732044805622, + "grad_norm": 1.710010051727295, + "learning_rate": 5e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7246016263961792, + "num_tokens": 347384074.0, + "step": 13422 + }, + { + "epoch": 1.474083022183176, + "grad_norm": 1.8978641033172607, + "learning_rate": 5e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7083152532577515, + "num_tokens": 347411909.0, + "step": 13423 + }, + { + "epoch": 1.4741928398857895, + "grad_norm": 1.679741621017456, + "learning_rate": 5e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7220573425292969, + "num_tokens": 347443169.0, + "step": 13424 + }, + { + "epoch": 1.4743026575884033, + "grad_norm": 1.867626428604126, + "learning_rate": 5e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7381871938705444, + "num_tokens": 347467046.0, + "step": 13425 + }, + { + "epoch": 1.4744124752910168, + "grad_norm": 1.7227741479873657, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7411422729492188, + "num_tokens": 347496032.0, + "step": 13426 + }, + { + "epoch": 1.4745222929936306, + "grad_norm": 1.7645796537399292, + "learning_rate": 5e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7188299298286438, + "num_tokens": 347522366.0, + "step": 13427 + }, + { + "epoch": 1.4746321106962443, + "grad_norm": 1.8964958190917969, + "learning_rate": 5e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7277119755744934, + "num_tokens": 347547960.0, + "step": 13428 + }, + { + "epoch": 1.4747419283988579, + "grad_norm": 1.6809659004211426, + "learning_rate": 5e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7115709185600281, + "num_tokens": 347578399.0, + "step": 13429 + }, + { + "epoch": 1.4748517461014716, + "grad_norm": 2.065274953842163, + "learning_rate": 5e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7507126331329346, + "num_tokens": 347600257.0, + "step": 13430 + }, + { + "epoch": 1.4749615638040852, + "grad_norm": 1.6240530014038086, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7234708070755005, + "num_tokens": 347631185.0, + "step": 13431 + }, + { + "epoch": 1.475071381506699, + "grad_norm": 1.9541127681732178, + "learning_rate": 5e-06, + "loss": 0.797, + "mean_token_accuracy": 0.744196355342865, + "num_tokens": 347652638.0, + "step": 13432 + }, + { + "epoch": 1.4751811992093127, + "grad_norm": 1.920824408531189, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7393808960914612, + "num_tokens": 347677374.0, + "step": 13433 + }, + { + "epoch": 1.4752910169119262, + "grad_norm": 1.6636736392974854, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7429853677749634, + "num_tokens": 347705992.0, + "step": 13434 + }, + { + "epoch": 1.4754008346145397, + "grad_norm": 1.7771155834197998, + "learning_rate": 5e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7323141098022461, + "num_tokens": 347733400.0, + "step": 13435 + }, + { + "epoch": 1.4755106523171535, + "grad_norm": 1.8817594051361084, + "learning_rate": 5e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7333015203475952, + "num_tokens": 347758029.0, + "step": 13436 + }, + { + "epoch": 1.4756204700197673, + "grad_norm": 1.552876591682434, + "learning_rate": 5e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7274558544158936, + "num_tokens": 347793214.0, + "step": 13437 + }, + { + "epoch": 1.4757302877223808, + "grad_norm": 1.780753254890442, + "learning_rate": 5e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7117593884468079, + "num_tokens": 347821017.0, + "step": 13438 + }, + { + "epoch": 1.4758401054249946, + "grad_norm": 1.9434492588043213, + "learning_rate": 5e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7527127265930176, + "num_tokens": 347843590.0, + "step": 13439 + }, + { + "epoch": 1.475949923127608, + "grad_norm": 1.8001333475112915, + "learning_rate": 5e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7418103218078613, + "num_tokens": 347868386.0, + "step": 13440 + }, + { + "epoch": 1.4760597408302218, + "grad_norm": 1.8918150663375854, + "learning_rate": 5e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7334847450256348, + "num_tokens": 347892341.0, + "step": 13441 + }, + { + "epoch": 1.4761695585328356, + "grad_norm": 2.4933602809906006, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7562375068664551, + "num_tokens": 347908568.0, + "step": 13442 + }, + { + "epoch": 1.4762793762354491, + "grad_norm": 1.968644380569458, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7452045679092407, + "num_tokens": 347932194.0, + "step": 13443 + }, + { + "epoch": 1.476389193938063, + "grad_norm": 1.8148478269577026, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7250927686691284, + "num_tokens": 347958601.0, + "step": 13444 + }, + { + "epoch": 1.4764990116406764, + "grad_norm": 1.8921366930007935, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7623330354690552, + "num_tokens": 347981708.0, + "step": 13445 + }, + { + "epoch": 1.4766088293432902, + "grad_norm": 1.8992934226989746, + "learning_rate": 5e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.7514954805374146, + "num_tokens": 348006072.0, + "step": 13446 + }, + { + "epoch": 1.4767186470459037, + "grad_norm": 1.9171156883239746, + "learning_rate": 5e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7546601295471191, + "num_tokens": 348031355.0, + "step": 13447 + }, + { + "epoch": 1.4768284647485175, + "grad_norm": 1.848053216934204, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.700742244720459, + "num_tokens": 348057418.0, + "step": 13448 + }, + { + "epoch": 1.476938282451131, + "grad_norm": 1.6583762168884277, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7247560620307922, + "num_tokens": 348086454.0, + "step": 13449 + }, + { + "epoch": 1.4770481001537448, + "grad_norm": 1.7418302297592163, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7364037036895752, + "num_tokens": 348115326.0, + "step": 13450 + }, + { + "epoch": 1.4771579178563585, + "grad_norm": 2.172049045562744, + "learning_rate": 5e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7436929941177368, + "num_tokens": 348134511.0, + "step": 13451 + }, + { + "epoch": 1.477267735558972, + "grad_norm": 1.8648022413253784, + "learning_rate": 5e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.7430336475372314, + "num_tokens": 348159186.0, + "step": 13452 + }, + { + "epoch": 1.4773775532615858, + "grad_norm": 1.8289250135421753, + "learning_rate": 5e-06, + "loss": 0.814, + "mean_token_accuracy": 0.7386308908462524, + "num_tokens": 348185133.0, + "step": 13453 + }, + { + "epoch": 1.4774873709641994, + "grad_norm": 2.0782532691955566, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7304132580757141, + "num_tokens": 348206686.0, + "step": 13454 + }, + { + "epoch": 1.477597188666813, + "grad_norm": 1.6274691820144653, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7682995796203613, + "num_tokens": 348235711.0, + "step": 13455 + }, + { + "epoch": 1.4777070063694269, + "grad_norm": 1.6691116094589233, + "learning_rate": 5e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6991636753082275, + "num_tokens": 348268525.0, + "step": 13456 + }, + { + "epoch": 1.4778168240720404, + "grad_norm": 1.8259650468826294, + "learning_rate": 5e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.74046790599823, + "num_tokens": 348295059.0, + "step": 13457 + }, + { + "epoch": 1.477926641774654, + "grad_norm": 2.081447124481201, + "learning_rate": 5e-06, + "loss": 0.7954, + "mean_token_accuracy": 0.7386846542358398, + "num_tokens": 348314863.0, + "step": 13458 + }, + { + "epoch": 1.4780364594772677, + "grad_norm": 1.8458900451660156, + "learning_rate": 5e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7498690485954285, + "num_tokens": 348338659.0, + "step": 13459 + }, + { + "epoch": 1.4781462771798815, + "grad_norm": 1.8045622110366821, + "learning_rate": 5e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7206162214279175, + "num_tokens": 348366206.0, + "step": 13460 + }, + { + "epoch": 1.478256094882495, + "grad_norm": 1.9001033306121826, + "learning_rate": 5e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7247257232666016, + "num_tokens": 348388174.0, + "step": 13461 + }, + { + "epoch": 1.4783659125851087, + "grad_norm": 1.9474008083343506, + "learning_rate": 5e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7263226509094238, + "num_tokens": 348414112.0, + "step": 13462 + }, + { + "epoch": 1.4784757302877223, + "grad_norm": 2.236739158630371, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7264412045478821, + "num_tokens": 348434233.0, + "step": 13463 + }, + { + "epoch": 1.478585547990336, + "grad_norm": 1.6185412406921387, + "learning_rate": 5e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7352675199508667, + "num_tokens": 348465453.0, + "step": 13464 + }, + { + "epoch": 1.4786953656929498, + "grad_norm": 1.89402174949646, + "learning_rate": 5e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7515988945960999, + "num_tokens": 348490485.0, + "step": 13465 + }, + { + "epoch": 1.4788051833955633, + "grad_norm": 1.6812500953674316, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7343180179595947, + "num_tokens": 348520172.0, + "step": 13466 + }, + { + "epoch": 1.478915001098177, + "grad_norm": 1.842981219291687, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7214483618736267, + "num_tokens": 348545900.0, + "step": 13467 + }, + { + "epoch": 1.4790248188007906, + "grad_norm": 1.7542264461517334, + "learning_rate": 5e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7258099317550659, + "num_tokens": 348573296.0, + "step": 13468 + }, + { + "epoch": 1.4791346365034044, + "grad_norm": 1.804116129875183, + "learning_rate": 5e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.715788722038269, + "num_tokens": 348603499.0, + "step": 13469 + }, + { + "epoch": 1.4792444542060181, + "grad_norm": 2.022146463394165, + "learning_rate": 5e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.724082887172699, + "num_tokens": 348627378.0, + "step": 13470 + }, + { + "epoch": 1.4793542719086317, + "grad_norm": 2.1515896320343018, + "learning_rate": 5e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7474993467330933, + "num_tokens": 348648790.0, + "step": 13471 + }, + { + "epoch": 1.4794640896112452, + "grad_norm": 1.922885537147522, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.719551682472229, + "num_tokens": 348674734.0, + "step": 13472 + }, + { + "epoch": 1.479573907313859, + "grad_norm": 1.9503240585327148, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7363809943199158, + "num_tokens": 348699747.0, + "step": 13473 + }, + { + "epoch": 1.4796837250164727, + "grad_norm": 2.0988380908966064, + "learning_rate": 5e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7483747601509094, + "num_tokens": 348720129.0, + "step": 13474 + }, + { + "epoch": 1.4797935427190863, + "grad_norm": 1.787440299987793, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7483295798301697, + "num_tokens": 348748185.0, + "step": 13475 + }, + { + "epoch": 1.4799033604217, + "grad_norm": 1.9608891010284424, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7214800119400024, + "num_tokens": 348771659.0, + "step": 13476 + }, + { + "epoch": 1.4800131781243135, + "grad_norm": 1.6367030143737793, + "learning_rate": 5e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7119867205619812, + "num_tokens": 348806165.0, + "step": 13477 + }, + { + "epoch": 1.4801229958269273, + "grad_norm": 1.8276656866073608, + "learning_rate": 5e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7301486730575562, + "num_tokens": 348833867.0, + "step": 13478 + }, + { + "epoch": 1.480232813529541, + "grad_norm": 1.649685263633728, + "learning_rate": 5e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7293355464935303, + "num_tokens": 348862579.0, + "step": 13479 + }, + { + "epoch": 1.4803426312321546, + "grad_norm": 1.7444891929626465, + "learning_rate": 5e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7236645817756653, + "num_tokens": 348890606.0, + "step": 13480 + }, + { + "epoch": 1.4804524489347684, + "grad_norm": 1.5972586870193481, + "learning_rate": 5e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7182232141494751, + "num_tokens": 348923189.0, + "step": 13481 + }, + { + "epoch": 1.4805622666373819, + "grad_norm": 1.9925999641418457, + "learning_rate": 5e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7429500818252563, + "num_tokens": 348943968.0, + "step": 13482 + }, + { + "epoch": 1.4806720843399956, + "grad_norm": 1.7754998207092285, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7183954119682312, + "num_tokens": 348973062.0, + "step": 13483 + }, + { + "epoch": 1.4807819020426094, + "grad_norm": 1.9981964826583862, + "learning_rate": 5e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7354182004928589, + "num_tokens": 348994873.0, + "step": 13484 + }, + { + "epoch": 1.480891719745223, + "grad_norm": 1.9733327627182007, + "learning_rate": 5e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.7409979701042175, + "num_tokens": 349019260.0, + "step": 13485 + }, + { + "epoch": 1.4810015374478365, + "grad_norm": 1.8527061939239502, + "learning_rate": 5e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7272268533706665, + "num_tokens": 349046512.0, + "step": 13486 + }, + { + "epoch": 1.4811113551504502, + "grad_norm": 1.6158428192138672, + "learning_rate": 5e-06, + "loss": 0.801, + "mean_token_accuracy": 0.746936023235321, + "num_tokens": 349078212.0, + "step": 13487 + }, + { + "epoch": 1.481221172853064, + "grad_norm": 1.6827001571655273, + "learning_rate": 5e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7293506860733032, + "num_tokens": 349105986.0, + "step": 13488 + }, + { + "epoch": 1.4813309905556775, + "grad_norm": 2.183091878890991, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7467082738876343, + "num_tokens": 349125777.0, + "step": 13489 + }, + { + "epoch": 1.4814408082582913, + "grad_norm": 1.9525084495544434, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7226344347000122, + "num_tokens": 349148774.0, + "step": 13490 + }, + { + "epoch": 1.4815506259609048, + "grad_norm": 1.8276270627975464, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7633532285690308, + "num_tokens": 349173768.0, + "step": 13491 + }, + { + "epoch": 1.4816604436635186, + "grad_norm": 1.7329468727111816, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7343260645866394, + "num_tokens": 349201914.0, + "step": 13492 + }, + { + "epoch": 1.4817702613661323, + "grad_norm": 1.8357353210449219, + "learning_rate": 5e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7332552075386047, + "num_tokens": 349229030.0, + "step": 13493 + }, + { + "epoch": 1.4818800790687459, + "grad_norm": 2.0103468894958496, + "learning_rate": 5e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7151514887809753, + "num_tokens": 349253316.0, + "step": 13494 + }, + { + "epoch": 1.4819898967713596, + "grad_norm": 1.9526994228363037, + "learning_rate": 5e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7505863308906555, + "num_tokens": 349276166.0, + "step": 13495 + }, + { + "epoch": 1.4820997144739732, + "grad_norm": 1.7440539598464966, + "learning_rate": 5e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.74174565076828, + "num_tokens": 349304085.0, + "step": 13496 + }, + { + "epoch": 1.482209532176587, + "grad_norm": 1.8737373352050781, + "learning_rate": 5e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7552400231361389, + "num_tokens": 349326998.0, + "step": 13497 + }, + { + "epoch": 1.4823193498792007, + "grad_norm": 1.567623257637024, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7202110886573792, + "num_tokens": 349359311.0, + "step": 13498 + }, + { + "epoch": 1.4824291675818142, + "grad_norm": 1.890345573425293, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7222456932067871, + "num_tokens": 349384966.0, + "step": 13499 + }, + { + "epoch": 1.4825389852844277, + "grad_norm": 1.75148344039917, + "learning_rate": 5e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7475194334983826, + "num_tokens": 349411274.0, + "step": 13500 + }, + { + "epoch": 1.4826488029870415, + "grad_norm": 1.8902356624603271, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7585135102272034, + "num_tokens": 349434661.0, + "step": 13501 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 1.8895950317382812, + "learning_rate": 5e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7341400980949402, + "num_tokens": 349459313.0, + "step": 13502 + }, + { + "epoch": 1.4828684383922688, + "grad_norm": 1.6548341512680054, + "learning_rate": 5e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.72215735912323, + "num_tokens": 349491875.0, + "step": 13503 + }, + { + "epoch": 1.4829782560948825, + "grad_norm": 1.8174446821212769, + "learning_rate": 5e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7132965326309204, + "num_tokens": 349520085.0, + "step": 13504 + }, + { + "epoch": 1.483088073797496, + "grad_norm": 1.7492549419403076, + "learning_rate": 5e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7287867069244385, + "num_tokens": 349547631.0, + "step": 13505 + }, + { + "epoch": 1.4831978915001098, + "grad_norm": 1.7201576232910156, + "learning_rate": 5e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7389388084411621, + "num_tokens": 349577453.0, + "step": 13506 + }, + { + "epoch": 1.4833077092027236, + "grad_norm": 2.1157662868499756, + "learning_rate": 5e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7251116037368774, + "num_tokens": 349600759.0, + "step": 13507 + }, + { + "epoch": 1.4834175269053371, + "grad_norm": 1.7670444250106812, + "learning_rate": 5e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7224552631378174, + "num_tokens": 349630897.0, + "step": 13508 + }, + { + "epoch": 1.4835273446079509, + "grad_norm": 1.9581480026245117, + "learning_rate": 5e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.74112868309021, + "num_tokens": 349653671.0, + "step": 13509 + }, + { + "epoch": 1.4836371623105644, + "grad_norm": 1.934509038925171, + "learning_rate": 5e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7387337684631348, + "num_tokens": 349676791.0, + "step": 13510 + }, + { + "epoch": 1.4837469800131782, + "grad_norm": 1.8640753030776978, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7287787199020386, + "num_tokens": 349700987.0, + "step": 13511 + }, + { + "epoch": 1.4838567977157917, + "grad_norm": 1.9644521474838257, + "learning_rate": 5e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.7392318248748779, + "num_tokens": 349723185.0, + "step": 13512 + }, + { + "epoch": 1.4839666154184055, + "grad_norm": 1.738992691040039, + "learning_rate": 5e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7100337147712708, + "num_tokens": 349756147.0, + "step": 13513 + }, + { + "epoch": 1.484076433121019, + "grad_norm": 1.9978095293045044, + "learning_rate": 5e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7497556209564209, + "num_tokens": 349778298.0, + "step": 13514 + }, + { + "epoch": 1.4841862508236328, + "grad_norm": 1.569993495941162, + "learning_rate": 5e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7289191484451294, + "num_tokens": 349812463.0, + "step": 13515 + }, + { + "epoch": 1.4842960685262465, + "grad_norm": 2.139327049255371, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7382243871688843, + "num_tokens": 349833781.0, + "step": 13516 + }, + { + "epoch": 1.48440588622886, + "grad_norm": 1.775702714920044, + "learning_rate": 5e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7287693023681641, + "num_tokens": 349860359.0, + "step": 13517 + }, + { + "epoch": 1.4845157039314738, + "grad_norm": 2.1049230098724365, + "learning_rate": 5e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7446136474609375, + "num_tokens": 349881797.0, + "step": 13518 + }, + { + "epoch": 1.4846255216340873, + "grad_norm": 1.6958359479904175, + "learning_rate": 5e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7591375112533569, + "num_tokens": 349907620.0, + "step": 13519 + }, + { + "epoch": 1.484735339336701, + "grad_norm": 1.7653130292892456, + "learning_rate": 5e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7232653498649597, + "num_tokens": 349935049.0, + "step": 13520 + }, + { + "epoch": 1.4848451570393149, + "grad_norm": 1.908062219619751, + "learning_rate": 5e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7580925226211548, + "num_tokens": 349957469.0, + "step": 13521 + }, + { + "epoch": 1.4849549747419284, + "grad_norm": 1.8503085374832153, + "learning_rate": 5e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.7448415756225586, + "num_tokens": 349982259.0, + "step": 13522 + }, + { + "epoch": 1.485064792444542, + "grad_norm": 1.8995375633239746, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7186042666435242, + "num_tokens": 350008366.0, + "step": 13523 + }, + { + "epoch": 1.4851746101471557, + "grad_norm": 1.8466542959213257, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7479387521743774, + "num_tokens": 350033363.0, + "step": 13524 + }, + { + "epoch": 1.4852844278497694, + "grad_norm": 2.1047725677490234, + "learning_rate": 5e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7517458200454712, + "num_tokens": 350053743.0, + "step": 13525 + }, + { + "epoch": 1.485394245552383, + "grad_norm": 1.836022973060608, + "learning_rate": 5e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7191289663314819, + "num_tokens": 350082157.0, + "step": 13526 + }, + { + "epoch": 1.4855040632549967, + "grad_norm": 1.9048465490341187, + "learning_rate": 5e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.735275149345398, + "num_tokens": 350108537.0, + "step": 13527 + }, + { + "epoch": 1.4856138809576103, + "grad_norm": 1.9818446636199951, + "learning_rate": 5e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7269190549850464, + "num_tokens": 350132798.0, + "step": 13528 + }, + { + "epoch": 1.485723698660224, + "grad_norm": 2.0901336669921875, + "learning_rate": 5e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7374439835548401, + "num_tokens": 350154682.0, + "step": 13529 + }, + { + "epoch": 1.4858335163628378, + "grad_norm": 1.7332812547683716, + "learning_rate": 5e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.70248943567276, + "num_tokens": 350184318.0, + "step": 13530 + }, + { + "epoch": 1.4859433340654513, + "grad_norm": 1.752286672592163, + "learning_rate": 5e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7148041725158691, + "num_tokens": 350211640.0, + "step": 13531 + }, + { + "epoch": 1.486053151768065, + "grad_norm": 1.8230606317520142, + "learning_rate": 5e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7243893146514893, + "num_tokens": 350238119.0, + "step": 13532 + }, + { + "epoch": 1.4861629694706786, + "grad_norm": 1.9256715774536133, + "learning_rate": 5e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7678883671760559, + "num_tokens": 350259170.0, + "step": 13533 + }, + { + "epoch": 1.4862727871732924, + "grad_norm": 2.047269582748413, + "learning_rate": 5e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7291380167007446, + "num_tokens": 350280796.0, + "step": 13534 + }, + { + "epoch": 1.4863826048759061, + "grad_norm": 1.9839502573013306, + "learning_rate": 5e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7168433666229248, + "num_tokens": 350304255.0, + "step": 13535 + }, + { + "epoch": 1.4864924225785197, + "grad_norm": 2.1266913414001465, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7387494444847107, + "num_tokens": 350326080.0, + "step": 13536 + }, + { + "epoch": 1.4866022402811332, + "grad_norm": 1.8170274496078491, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7368636727333069, + "num_tokens": 350352430.0, + "step": 13537 + }, + { + "epoch": 1.486712057983747, + "grad_norm": 1.9397836923599243, + "learning_rate": 5e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7229951024055481, + "num_tokens": 350376784.0, + "step": 13538 + }, + { + "epoch": 1.4868218756863607, + "grad_norm": 2.075185775756836, + "learning_rate": 5e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7479903101921082, + "num_tokens": 350397202.0, + "step": 13539 + }, + { + "epoch": 1.4869316933889742, + "grad_norm": 1.814820408821106, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7338126301765442, + "num_tokens": 350425172.0, + "step": 13540 + }, + { + "epoch": 1.487041511091588, + "grad_norm": 1.7617487907409668, + "learning_rate": 5e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.740233302116394, + "num_tokens": 350452835.0, + "step": 13541 + }, + { + "epoch": 1.4871513287942015, + "grad_norm": 1.7886414527893066, + "learning_rate": 5e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7224857807159424, + "num_tokens": 350484091.0, + "step": 13542 + }, + { + "epoch": 1.4872611464968153, + "grad_norm": 1.9954133033752441, + "learning_rate": 5e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7437942624092102, + "num_tokens": 350507786.0, + "step": 13543 + }, + { + "epoch": 1.487370964199429, + "grad_norm": 1.8914259672164917, + "learning_rate": 5e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7493579983711243, + "num_tokens": 350533543.0, + "step": 13544 + }, + { + "epoch": 1.4874807819020426, + "grad_norm": 1.603801965713501, + "learning_rate": 5e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7076988220214844, + "num_tokens": 350567664.0, + "step": 13545 + }, + { + "epoch": 1.4875905996046563, + "grad_norm": 1.7938084602355957, + "learning_rate": 5e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7166249752044678, + "num_tokens": 350594606.0, + "step": 13546 + }, + { + "epoch": 1.4877004173072699, + "grad_norm": 2.074476480484009, + "learning_rate": 5e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7512425184249878, + "num_tokens": 350614754.0, + "step": 13547 + }, + { + "epoch": 1.4878102350098836, + "grad_norm": 1.9821892976760864, + "learning_rate": 5e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7319816946983337, + "num_tokens": 350638625.0, + "step": 13548 + }, + { + "epoch": 1.4879200527124974, + "grad_norm": 1.9937844276428223, + "learning_rate": 5e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7479837536811829, + "num_tokens": 350661592.0, + "step": 13549 + }, + { + "epoch": 1.488029870415111, + "grad_norm": 1.7461260557174683, + "learning_rate": 5e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7409890294075012, + "num_tokens": 350690760.0, + "step": 13550 + }, + { + "epoch": 1.4881396881177245, + "grad_norm": 1.9218233823776245, + "learning_rate": 5e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7265831232070923, + "num_tokens": 350714716.0, + "step": 13551 + }, + { + "epoch": 1.4882495058203382, + "grad_norm": 1.7241183519363403, + "learning_rate": 5e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7538705468177795, + "num_tokens": 350739607.0, + "step": 13552 + }, + { + "epoch": 1.488359323522952, + "grad_norm": 1.6393004655838013, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.744178056716919, + "num_tokens": 350769534.0, + "step": 13553 + }, + { + "epoch": 1.4884691412255655, + "grad_norm": 2.0331132411956787, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7482892870903015, + "num_tokens": 350790803.0, + "step": 13554 + }, + { + "epoch": 1.4885789589281793, + "grad_norm": 2.0066535472869873, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7340965270996094, + "num_tokens": 350814984.0, + "step": 13555 + }, + { + "epoch": 1.4886887766307928, + "grad_norm": 1.7635374069213867, + "learning_rate": 5e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7175465226173401, + "num_tokens": 350844302.0, + "step": 13556 + }, + { + "epoch": 1.4887985943334066, + "grad_norm": 1.9841762781143188, + "learning_rate": 5e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.733536958694458, + "num_tokens": 350867035.0, + "step": 13557 + }, + { + "epoch": 1.4889084120360203, + "grad_norm": 2.124368906021118, + "learning_rate": 5e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7241128087043762, + "num_tokens": 350890553.0, + "step": 13558 + }, + { + "epoch": 1.4890182297386338, + "grad_norm": 1.7535266876220703, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7513924241065979, + "num_tokens": 350914589.0, + "step": 13559 + }, + { + "epoch": 1.4891280474412476, + "grad_norm": 1.6207866668701172, + "learning_rate": 5e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7515149116516113, + "num_tokens": 350943650.0, + "step": 13560 + }, + { + "epoch": 1.4892378651438611, + "grad_norm": 1.7503589391708374, + "learning_rate": 5e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7136303186416626, + "num_tokens": 350972379.0, + "step": 13561 + }, + { + "epoch": 1.489347682846475, + "grad_norm": 1.8026797771453857, + "learning_rate": 5e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7410878539085388, + "num_tokens": 350996663.0, + "step": 13562 + }, + { + "epoch": 1.4894575005490884, + "grad_norm": 1.7013821601867676, + "learning_rate": 5e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7571457624435425, + "num_tokens": 351025081.0, + "step": 13563 + }, + { + "epoch": 1.4895673182517022, + "grad_norm": 2.2628283500671387, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7465648651123047, + "num_tokens": 351044777.0, + "step": 13564 + }, + { + "epoch": 1.4896771359543157, + "grad_norm": 1.685929536819458, + "learning_rate": 5e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7355889081954956, + "num_tokens": 351075120.0, + "step": 13565 + }, + { + "epoch": 1.4897869536569295, + "grad_norm": 1.819784164428711, + "learning_rate": 5e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7432361245155334, + "num_tokens": 351101360.0, + "step": 13566 + }, + { + "epoch": 1.4898967713595432, + "grad_norm": 1.8447551727294922, + "learning_rate": 5e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7551670670509338, + "num_tokens": 351125468.0, + "step": 13567 + }, + { + "epoch": 1.4900065890621568, + "grad_norm": 1.9252331256866455, + "learning_rate": 5e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7337950468063354, + "num_tokens": 351150049.0, + "step": 13568 + }, + { + "epoch": 1.4901164067647705, + "grad_norm": 1.8620871305465698, + "learning_rate": 5e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7322207689285278, + "num_tokens": 351176261.0, + "step": 13569 + }, + { + "epoch": 1.490226224467384, + "grad_norm": 1.8094465732574463, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7313821911811829, + "num_tokens": 351202654.0, + "step": 13570 + }, + { + "epoch": 1.4903360421699978, + "grad_norm": 1.8176881074905396, + "learning_rate": 5e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7136421799659729, + "num_tokens": 351231215.0, + "step": 13571 + }, + { + "epoch": 1.4904458598726116, + "grad_norm": 1.9869340658187866, + "learning_rate": 5e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7364308834075928, + "num_tokens": 351255744.0, + "step": 13572 + }, + { + "epoch": 1.4905556775752251, + "grad_norm": 2.278557538986206, + "learning_rate": 5e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.743145227432251, + "num_tokens": 351274767.0, + "step": 13573 + }, + { + "epoch": 1.4906654952778389, + "grad_norm": 1.7632780075073242, + "learning_rate": 5e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7124487161636353, + "num_tokens": 351303590.0, + "step": 13574 + }, + { + "epoch": 1.4907753129804524, + "grad_norm": 1.999580979347229, + "learning_rate": 5e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7378242015838623, + "num_tokens": 351326931.0, + "step": 13575 + }, + { + "epoch": 1.4908851306830662, + "grad_norm": 2.075078010559082, + "learning_rate": 5e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7485495805740356, + "num_tokens": 351350979.0, + "step": 13576 + }, + { + "epoch": 1.4909949483856797, + "grad_norm": 1.8677340745925903, + "learning_rate": 5e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7127237319946289, + "num_tokens": 351379502.0, + "step": 13577 + }, + { + "epoch": 1.4911047660882935, + "grad_norm": 1.993858814239502, + "learning_rate": 5e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.725861668586731, + "num_tokens": 351405311.0, + "step": 13578 + }, + { + "epoch": 1.491214583790907, + "grad_norm": 1.9841258525848389, + "learning_rate": 5e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7025442123413086, + "num_tokens": 351429867.0, + "step": 13579 + }, + { + "epoch": 1.4913244014935207, + "grad_norm": 1.8614983558654785, + "learning_rate": 5e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7223856449127197, + "num_tokens": 351455398.0, + "step": 13580 + }, + { + "epoch": 1.4914342191961345, + "grad_norm": 1.695840835571289, + "learning_rate": 5e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7304966449737549, + "num_tokens": 351485516.0, + "step": 13581 + }, + { + "epoch": 1.491544036898748, + "grad_norm": 1.8380895853042603, + "learning_rate": 5e-06, + "loss": 0.832, + "mean_token_accuracy": 0.739453911781311, + "num_tokens": 351509861.0, + "step": 13582 + }, + { + "epoch": 1.4916538546013618, + "grad_norm": 1.7268568277359009, + "learning_rate": 5e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7191147804260254, + "num_tokens": 351537620.0, + "step": 13583 + }, + { + "epoch": 1.4917636723039753, + "grad_norm": 1.6203296184539795, + "learning_rate": 5e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7408956289291382, + "num_tokens": 351565734.0, + "step": 13584 + }, + { + "epoch": 1.491873490006589, + "grad_norm": 1.9330313205718994, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7592370510101318, + "num_tokens": 351587948.0, + "step": 13585 + }, + { + "epoch": 1.4919833077092028, + "grad_norm": 1.8505250215530396, + "learning_rate": 5e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7229477167129517, + "num_tokens": 351615933.0, + "step": 13586 + }, + { + "epoch": 1.4920931254118164, + "grad_norm": 1.8951036930084229, + "learning_rate": 5e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.76056969165802, + "num_tokens": 351641092.0, + "step": 13587 + }, + { + "epoch": 1.49220294311443, + "grad_norm": 1.952591896057129, + "learning_rate": 5e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7571207284927368, + "num_tokens": 351662376.0, + "step": 13588 + }, + { + "epoch": 1.4923127608170437, + "grad_norm": 2.1362996101379395, + "learning_rate": 5e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.748609721660614, + "num_tokens": 351683711.0, + "step": 13589 + }, + { + "epoch": 1.4924225785196574, + "grad_norm": 1.7495328187942505, + "learning_rate": 5e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7220360636711121, + "num_tokens": 351713640.0, + "step": 13590 + }, + { + "epoch": 1.492532396222271, + "grad_norm": 1.707100510597229, + "learning_rate": 5e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7389161586761475, + "num_tokens": 351741333.0, + "step": 13591 + }, + { + "epoch": 1.4926422139248847, + "grad_norm": 1.6538439989089966, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7276089787483215, + "num_tokens": 351774402.0, + "step": 13592 + }, + { + "epoch": 1.4927520316274983, + "grad_norm": 1.6971577405929565, + "learning_rate": 5e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7362117767333984, + "num_tokens": 351803284.0, + "step": 13593 + }, + { + "epoch": 1.492861849330112, + "grad_norm": 1.7800923585891724, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7224975228309631, + "num_tokens": 351830356.0, + "step": 13594 + }, + { + "epoch": 1.4929716670327258, + "grad_norm": 1.8974007368087769, + "learning_rate": 5e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7349098324775696, + "num_tokens": 351854561.0, + "step": 13595 + }, + { + "epoch": 1.4930814847353393, + "grad_norm": 1.5983116626739502, + "learning_rate": 5e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7093584537506104, + "num_tokens": 351884934.0, + "step": 13596 + }, + { + "epoch": 1.493191302437953, + "grad_norm": 1.7439756393432617, + "learning_rate": 5e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7341816425323486, + "num_tokens": 351914123.0, + "step": 13597 + }, + { + "epoch": 1.4933011201405666, + "grad_norm": 1.7276136875152588, + "learning_rate": 5e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.71019446849823, + "num_tokens": 351942121.0, + "step": 13598 + }, + { + "epoch": 1.4934109378431804, + "grad_norm": 1.7392287254333496, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7251797318458557, + "num_tokens": 351972153.0, + "step": 13599 + }, + { + "epoch": 1.493520755545794, + "grad_norm": 1.746696949005127, + "learning_rate": 5e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7336685657501221, + "num_tokens": 351997512.0, + "step": 13600 + }, + { + "epoch": 1.4936305732484076, + "grad_norm": 1.7012910842895508, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7387872338294983, + "num_tokens": 352026915.0, + "step": 13601 + }, + { + "epoch": 1.4937403909510212, + "grad_norm": 1.9381953477859497, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7452383041381836, + "num_tokens": 352048795.0, + "step": 13602 + }, + { + "epoch": 1.493850208653635, + "grad_norm": 1.9780397415161133, + "learning_rate": 5e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7417421340942383, + "num_tokens": 352070428.0, + "step": 13603 + }, + { + "epoch": 1.4939600263562487, + "grad_norm": 2.055126667022705, + "learning_rate": 5e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7151796817779541, + "num_tokens": 352097948.0, + "step": 13604 + }, + { + "epoch": 1.4940698440588622, + "grad_norm": 2.267338991165161, + "learning_rate": 5e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7137086987495422, + "num_tokens": 352121594.0, + "step": 13605 + }, + { + "epoch": 1.494179661761476, + "grad_norm": 1.737418532371521, + "learning_rate": 5e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.724895715713501, + "num_tokens": 352149460.0, + "step": 13606 + }, + { + "epoch": 1.4942894794640895, + "grad_norm": 2.0604443550109863, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7278703451156616, + "num_tokens": 352172917.0, + "step": 13607 + }, + { + "epoch": 1.4943992971667033, + "grad_norm": 1.723276138305664, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7274031639099121, + "num_tokens": 352202379.0, + "step": 13608 + }, + { + "epoch": 1.494509114869317, + "grad_norm": 2.0429720878601074, + "learning_rate": 5e-06, + "loss": 0.6949, + "mean_token_accuracy": 0.7697910666465759, + "num_tokens": 352222612.0, + "step": 13609 + }, + { + "epoch": 1.4946189325719306, + "grad_norm": 1.7369375228881836, + "learning_rate": 5e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7245616316795349, + "num_tokens": 352253099.0, + "step": 13610 + }, + { + "epoch": 1.4947287502745443, + "grad_norm": 1.990033507347107, + "learning_rate": 5e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7252140045166016, + "num_tokens": 352276571.0, + "step": 13611 + }, + { + "epoch": 1.4948385679771579, + "grad_norm": 1.8192150592803955, + "learning_rate": 5e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7065783739089966, + "num_tokens": 352304392.0, + "step": 13612 + }, + { + "epoch": 1.4949483856797716, + "grad_norm": 1.841338038444519, + "learning_rate": 5e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7397735118865967, + "num_tokens": 352331425.0, + "step": 13613 + }, + { + "epoch": 1.4950582033823854, + "grad_norm": 1.8727627992630005, + "learning_rate": 5e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7456330060958862, + "num_tokens": 352357022.0, + "step": 13614 + }, + { + "epoch": 1.495168021084999, + "grad_norm": 1.773958683013916, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7230880260467529, + "num_tokens": 352385765.0, + "step": 13615 + }, + { + "epoch": 1.4952778387876124, + "grad_norm": 1.9737330675125122, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.760526180267334, + "num_tokens": 352408616.0, + "step": 13616 + }, + { + "epoch": 1.4953876564902262, + "grad_norm": 1.8348380327224731, + "learning_rate": 5e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7136722803115845, + "num_tokens": 352437556.0, + "step": 13617 + }, + { + "epoch": 1.49549747419284, + "grad_norm": 1.8924254179000854, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7213398814201355, + "num_tokens": 352463378.0, + "step": 13618 + }, + { + "epoch": 1.4956072918954535, + "grad_norm": 1.8484200239181519, + "learning_rate": 5e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7336013317108154, + "num_tokens": 352492380.0, + "step": 13619 + }, + { + "epoch": 1.4957171095980673, + "grad_norm": 1.9985429048538208, + "learning_rate": 5e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7271527647972107, + "num_tokens": 352515589.0, + "step": 13620 + }, + { + "epoch": 1.4958269273006808, + "grad_norm": 1.6342631578445435, + "learning_rate": 5e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7193955183029175, + "num_tokens": 352547331.0, + "step": 13621 + }, + { + "epoch": 1.4959367450032945, + "grad_norm": 2.0026192665100098, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7547999620437622, + "num_tokens": 352568049.0, + "step": 13622 + }, + { + "epoch": 1.4960465627059083, + "grad_norm": 1.8692737817764282, + "learning_rate": 5e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7208153009414673, + "num_tokens": 352596319.0, + "step": 13623 + }, + { + "epoch": 1.4961563804085218, + "grad_norm": 1.7059706449508667, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.735079824924469, + "num_tokens": 352624028.0, + "step": 13624 + }, + { + "epoch": 1.4962661981111356, + "grad_norm": 1.7057137489318848, + "learning_rate": 5e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7471599578857422, + "num_tokens": 352649877.0, + "step": 13625 + }, + { + "epoch": 1.4963760158137491, + "grad_norm": 1.7049543857574463, + "learning_rate": 5e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7559348940849304, + "num_tokens": 352675961.0, + "step": 13626 + }, + { + "epoch": 1.4964858335163629, + "grad_norm": 1.7138755321502686, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7325851917266846, + "num_tokens": 352703630.0, + "step": 13627 + }, + { + "epoch": 1.4965956512189764, + "grad_norm": 1.9094518423080444, + "learning_rate": 5e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7377296686172485, + "num_tokens": 352726230.0, + "step": 13628 + }, + { + "epoch": 1.4967054689215902, + "grad_norm": 1.7836519479751587, + "learning_rate": 5e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7505937218666077, + "num_tokens": 352750908.0, + "step": 13629 + }, + { + "epoch": 1.4968152866242037, + "grad_norm": 1.8355381488800049, + "learning_rate": 5e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7400298118591309, + "num_tokens": 352776556.0, + "step": 13630 + }, + { + "epoch": 1.4969251043268175, + "grad_norm": 1.5453778505325317, + "learning_rate": 5e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7440532445907593, + "num_tokens": 352812198.0, + "step": 13631 + }, + { + "epoch": 1.4970349220294312, + "grad_norm": 1.9188398122787476, + "learning_rate": 5e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.717432975769043, + "num_tokens": 352837228.0, + "step": 13632 + }, + { + "epoch": 1.4971447397320448, + "grad_norm": 1.9341273307800293, + "learning_rate": 5e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.742618203163147, + "num_tokens": 352863193.0, + "step": 13633 + }, + { + "epoch": 1.4972545574346585, + "grad_norm": 2.0448575019836426, + "learning_rate": 5e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7268005609512329, + "num_tokens": 352886712.0, + "step": 13634 + }, + { + "epoch": 1.497364375137272, + "grad_norm": 1.664690613746643, + "learning_rate": 5e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7092637419700623, + "num_tokens": 352920647.0, + "step": 13635 + }, + { + "epoch": 1.4974741928398858, + "grad_norm": 1.711999535560608, + "learning_rate": 5e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7210465669631958, + "num_tokens": 352947974.0, + "step": 13636 + }, + { + "epoch": 1.4975840105424996, + "grad_norm": 1.8965766429901123, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7580146789550781, + "num_tokens": 352970436.0, + "step": 13637 + }, + { + "epoch": 1.497693828245113, + "grad_norm": 1.8790794610977173, + "learning_rate": 5e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7282779216766357, + "num_tokens": 352994642.0, + "step": 13638 + }, + { + "epoch": 1.4978036459477266, + "grad_norm": 2.0103776454925537, + "learning_rate": 5e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7414409518241882, + "num_tokens": 353015205.0, + "step": 13639 + }, + { + "epoch": 1.4979134636503404, + "grad_norm": 1.8104795217514038, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7350670099258423, + "num_tokens": 353041651.0, + "step": 13640 + }, + { + "epoch": 1.4980232813529542, + "grad_norm": 1.8691935539245605, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7373833656311035, + "num_tokens": 353066399.0, + "step": 13641 + }, + { + "epoch": 1.4981330990555677, + "grad_norm": 1.8841105699539185, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.747523844242096, + "num_tokens": 353088930.0, + "step": 13642 + }, + { + "epoch": 1.4982429167581814, + "grad_norm": 1.8526535034179688, + "learning_rate": 5e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7434333562850952, + "num_tokens": 353114153.0, + "step": 13643 + }, + { + "epoch": 1.498352734460795, + "grad_norm": 1.7697515487670898, + "learning_rate": 5e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7399706840515137, + "num_tokens": 353138934.0, + "step": 13644 + }, + { + "epoch": 1.4984625521634087, + "grad_norm": 1.6238505840301514, + "learning_rate": 5e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7098242044448853, + "num_tokens": 353170713.0, + "step": 13645 + }, + { + "epoch": 1.4985723698660225, + "grad_norm": 1.7789971828460693, + "learning_rate": 5e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7243587374687195, + "num_tokens": 353197950.0, + "step": 13646 + }, + { + "epoch": 1.498682187568636, + "grad_norm": 1.65764319896698, + "learning_rate": 5e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7412970066070557, + "num_tokens": 353228111.0, + "step": 13647 + }, + { + "epoch": 1.4987920052712498, + "grad_norm": 2.0616672039031982, + "learning_rate": 5e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7159227132797241, + "num_tokens": 353250689.0, + "step": 13648 + }, + { + "epoch": 1.4989018229738633, + "grad_norm": 1.6365282535552979, + "learning_rate": 5e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7518686056137085, + "num_tokens": 353281226.0, + "step": 13649 + }, + { + "epoch": 1.499011640676477, + "grad_norm": 1.7162134647369385, + "learning_rate": 5e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7411071062088013, + "num_tokens": 353308984.0, + "step": 13650 + }, + { + "epoch": 1.4991214583790908, + "grad_norm": 1.7002825736999512, + "learning_rate": 5e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7428373098373413, + "num_tokens": 353337392.0, + "step": 13651 + }, + { + "epoch": 1.4992312760817044, + "grad_norm": 1.8880614042282104, + "learning_rate": 5e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7146148681640625, + "num_tokens": 353363533.0, + "step": 13652 + }, + { + "epoch": 1.499341093784318, + "grad_norm": 2.0434465408325195, + "learning_rate": 5e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7458351850509644, + "num_tokens": 353383300.0, + "step": 13653 + }, + { + "epoch": 1.4994509114869317, + "grad_norm": 1.8169074058532715, + "learning_rate": 5e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7312928438186646, + "num_tokens": 353409500.0, + "step": 13654 + }, + { + "epoch": 1.4995607291895454, + "grad_norm": 1.9819310903549194, + "learning_rate": 5e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7368431091308594, + "num_tokens": 353434551.0, + "step": 13655 + }, + { + "epoch": 1.499670546892159, + "grad_norm": 2.1963987350463867, + "learning_rate": 5e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7252050638198853, + "num_tokens": 353455104.0, + "step": 13656 + }, + { + "epoch": 1.4997803645947727, + "grad_norm": 1.7008283138275146, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7561737298965454, + "num_tokens": 353480801.0, + "step": 13657 + }, + { + "epoch": 1.4998901822973862, + "grad_norm": 1.6529444456100464, + "learning_rate": 5e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7204767465591431, + "num_tokens": 353512095.0, + "step": 13658 + }, + { + "epoch": 1.5, + "grad_norm": 1.867713451385498, + "learning_rate": 5e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7568391561508179, + "num_tokens": 353536756.0, + "step": 13659 + }, + { + "epoch": 1.5001098177026138, + "grad_norm": 2.0950419902801514, + "learning_rate": 5e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7538943886756897, + "num_tokens": 353556715.0, + "step": 13660 + }, + { + "epoch": 1.5002196354052273, + "grad_norm": 1.7350469827651978, + "learning_rate": 5e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.740423321723938, + "num_tokens": 353585513.0, + "step": 13661 + }, + { + "epoch": 1.5003294531078408, + "grad_norm": 1.8855071067810059, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7467729449272156, + "num_tokens": 353608105.0, + "step": 13662 + }, + { + "epoch": 1.5004392708104546, + "grad_norm": 2.1159276962280273, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7164682149887085, + "num_tokens": 353629138.0, + "step": 13663 + }, + { + "epoch": 1.5005490885130683, + "grad_norm": 1.5862157344818115, + "learning_rate": 5e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7146745920181274, + "num_tokens": 353661940.0, + "step": 13664 + }, + { + "epoch": 1.500658906215682, + "grad_norm": 1.9834994077682495, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7445619106292725, + "num_tokens": 353687396.0, + "step": 13665 + }, + { + "epoch": 1.5007687239182956, + "grad_norm": 2.168408155441284, + "learning_rate": 5e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7271620035171509, + "num_tokens": 353708360.0, + "step": 13666 + }, + { + "epoch": 1.5008785416209092, + "grad_norm": 1.859546422958374, + "learning_rate": 5e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7347257137298584, + "num_tokens": 353734616.0, + "step": 13667 + }, + { + "epoch": 1.500988359323523, + "grad_norm": 1.6763304471969604, + "learning_rate": 5e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7264876365661621, + "num_tokens": 353763097.0, + "step": 13668 + }, + { + "epoch": 1.5010981770261367, + "grad_norm": 1.8095332384109497, + "learning_rate": 5e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7338253855705261, + "num_tokens": 353790191.0, + "step": 13669 + }, + { + "epoch": 1.5012079947287504, + "grad_norm": 2.0463080406188965, + "learning_rate": 5e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7477244138717651, + "num_tokens": 353812027.0, + "step": 13670 + }, + { + "epoch": 1.501317812431364, + "grad_norm": 2.2302212715148926, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7246385812759399, + "num_tokens": 353831153.0, + "step": 13671 + }, + { + "epoch": 1.5014276301339775, + "grad_norm": 1.6317330598831177, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7302575707435608, + "num_tokens": 353863139.0, + "step": 13672 + }, + { + "epoch": 1.5015374478365913, + "grad_norm": 1.850620985031128, + "learning_rate": 5e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6988593935966492, + "num_tokens": 353893140.0, + "step": 13673 + }, + { + "epoch": 1.501647265539205, + "grad_norm": 1.8201667070388794, + "learning_rate": 5e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7497581839561462, + "num_tokens": 353920175.0, + "step": 13674 + }, + { + "epoch": 1.5017570832418186, + "grad_norm": 1.6295101642608643, + "learning_rate": 5e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.716565728187561, + "num_tokens": 353951848.0, + "step": 13675 + }, + { + "epoch": 1.501866900944432, + "grad_norm": 1.8465938568115234, + "learning_rate": 5e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7484535574913025, + "num_tokens": 353975295.0, + "step": 13676 + }, + { + "epoch": 1.5019767186470458, + "grad_norm": 1.9509267807006836, + "learning_rate": 5e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7056957483291626, + "num_tokens": 354002144.0, + "step": 13677 + }, + { + "epoch": 1.5020865363496596, + "grad_norm": 1.8073513507843018, + "learning_rate": 5e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7172513008117676, + "num_tokens": 354030140.0, + "step": 13678 + }, + { + "epoch": 1.5021963540522734, + "grad_norm": 1.7933706045150757, + "learning_rate": 5e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7156945466995239, + "num_tokens": 354056336.0, + "step": 13679 + }, + { + "epoch": 1.502306171754887, + "grad_norm": 1.8668237924575806, + "learning_rate": 5e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7212153673171997, + "num_tokens": 354082293.0, + "step": 13680 + }, + { + "epoch": 1.5024159894575004, + "grad_norm": 1.9984112977981567, + "learning_rate": 5e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7491589784622192, + "num_tokens": 354104594.0, + "step": 13681 + }, + { + "epoch": 1.5025258071601142, + "grad_norm": 1.771523118019104, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7153900861740112, + "num_tokens": 354133083.0, + "step": 13682 + }, + { + "epoch": 1.502635624862728, + "grad_norm": 1.9262962341308594, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7495160698890686, + "num_tokens": 354156223.0, + "step": 13683 + }, + { + "epoch": 1.5027454425653415, + "grad_norm": 1.946321964263916, + "learning_rate": 5e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7399718761444092, + "num_tokens": 354178332.0, + "step": 13684 + }, + { + "epoch": 1.5028552602679552, + "grad_norm": 1.9968502521514893, + "learning_rate": 5e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7426731586456299, + "num_tokens": 354200645.0, + "step": 13685 + }, + { + "epoch": 1.5029650779705688, + "grad_norm": 2.1551058292388916, + "learning_rate": 5e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7499920725822449, + "num_tokens": 354219750.0, + "step": 13686 + }, + { + "epoch": 1.5030748956731825, + "grad_norm": 1.9233124256134033, + "learning_rate": 5e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7710983753204346, + "num_tokens": 354241156.0, + "step": 13687 + }, + { + "epoch": 1.5031847133757963, + "grad_norm": 2.0569825172424316, + "learning_rate": 5e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7378838062286377, + "num_tokens": 354262266.0, + "step": 13688 + }, + { + "epoch": 1.5032945310784098, + "grad_norm": 1.7539210319519043, + "learning_rate": 5e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7232480049133301, + "num_tokens": 354289056.0, + "step": 13689 + }, + { + "epoch": 1.5034043487810234, + "grad_norm": 1.7811371088027954, + "learning_rate": 5e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7492846250534058, + "num_tokens": 354315729.0, + "step": 13690 + }, + { + "epoch": 1.5035141664836371, + "grad_norm": 1.667810082435608, + "learning_rate": 5e-06, + "loss": 0.833, + "mean_token_accuracy": 0.73250412940979, + "num_tokens": 354344797.0, + "step": 13691 + }, + { + "epoch": 1.5036239841862509, + "grad_norm": 1.9824366569519043, + "learning_rate": 5e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7547869682312012, + "num_tokens": 354365441.0, + "step": 13692 + }, + { + "epoch": 1.5037338018888646, + "grad_norm": 1.9393227100372314, + "learning_rate": 5e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7337982654571533, + "num_tokens": 354390265.0, + "step": 13693 + }, + { + "epoch": 1.5038436195914782, + "grad_norm": 2.0387446880340576, + "learning_rate": 5e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7589972019195557, + "num_tokens": 354410627.0, + "step": 13694 + }, + { + "epoch": 1.5039534372940917, + "grad_norm": 1.737562656402588, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7369486093521118, + "num_tokens": 354437756.0, + "step": 13695 + }, + { + "epoch": 1.5040632549967055, + "grad_norm": 1.8563259840011597, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7300401926040649, + "num_tokens": 354465187.0, + "step": 13696 + }, + { + "epoch": 1.5041730726993192, + "grad_norm": 1.7663477659225464, + "learning_rate": 5e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.718265175819397, + "num_tokens": 354493873.0, + "step": 13697 + }, + { + "epoch": 1.5042828904019327, + "grad_norm": 1.6860138177871704, + "learning_rate": 5e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7284096479415894, + "num_tokens": 354524627.0, + "step": 13698 + }, + { + "epoch": 1.5043927081045465, + "grad_norm": 1.8238571882247925, + "learning_rate": 5e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.726645290851593, + "num_tokens": 354553193.0, + "step": 13699 + }, + { + "epoch": 1.50450252580716, + "grad_norm": 1.9975953102111816, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7294807434082031, + "num_tokens": 354576556.0, + "step": 13700 + }, + { + "epoch": 1.5046123435097738, + "grad_norm": 1.9305046796798706, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7428570985794067, + "num_tokens": 354599914.0, + "step": 13701 + }, + { + "epoch": 1.5047221612123876, + "grad_norm": 2.0818161964416504, + "learning_rate": 5e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7483986020088196, + "num_tokens": 354619919.0, + "step": 13702 + }, + { + "epoch": 1.504831978915001, + "grad_norm": 1.7284520864486694, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7331878542900085, + "num_tokens": 354648925.0, + "step": 13703 + }, + { + "epoch": 1.5049417966176146, + "grad_norm": 1.8141683340072632, + "learning_rate": 5e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7254405617713928, + "num_tokens": 354673674.0, + "step": 13704 + }, + { + "epoch": 1.5050516143202284, + "grad_norm": 1.7062076330184937, + "learning_rate": 5e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7492364645004272, + "num_tokens": 354700122.0, + "step": 13705 + }, + { + "epoch": 1.5051614320228421, + "grad_norm": 1.9441297054290771, + "learning_rate": 5e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7448217868804932, + "num_tokens": 354723158.0, + "step": 13706 + }, + { + "epoch": 1.505271249725456, + "grad_norm": 1.5979167222976685, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7162420153617859, + "num_tokens": 354757638.0, + "step": 13707 + }, + { + "epoch": 1.5053810674280694, + "grad_norm": 1.7363775968551636, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7248689532279968, + "num_tokens": 354787161.0, + "step": 13708 + }, + { + "epoch": 1.505490885130683, + "grad_norm": 2.0685722827911377, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7341442108154297, + "num_tokens": 354808639.0, + "step": 13709 + }, + { + "epoch": 1.5056007028332967, + "grad_norm": 1.8810782432556152, + "learning_rate": 5e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7274734377861023, + "num_tokens": 354833321.0, + "step": 13710 + }, + { + "epoch": 1.5057105205359105, + "grad_norm": 2.1869137287139893, + "learning_rate": 5e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7518877387046814, + "num_tokens": 354852786.0, + "step": 13711 + }, + { + "epoch": 1.505820338238524, + "grad_norm": 2.1059606075286865, + "learning_rate": 5e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7402074337005615, + "num_tokens": 354874052.0, + "step": 13712 + }, + { + "epoch": 1.5059301559411375, + "grad_norm": 1.577335000038147, + "learning_rate": 5e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7138594388961792, + "num_tokens": 354905905.0, + "step": 13713 + }, + { + "epoch": 1.5060399736437513, + "grad_norm": 1.7479654550552368, + "learning_rate": 5e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7614104747772217, + "num_tokens": 354931372.0, + "step": 13714 + }, + { + "epoch": 1.506149791346365, + "grad_norm": 1.6049624681472778, + "learning_rate": 5e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.715821385383606, + "num_tokens": 354963065.0, + "step": 13715 + }, + { + "epoch": 1.5062596090489788, + "grad_norm": 1.7704774141311646, + "learning_rate": 5e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7277291417121887, + "num_tokens": 354987910.0, + "step": 13716 + }, + { + "epoch": 1.5063694267515924, + "grad_norm": 1.6045911312103271, + "learning_rate": 5e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7475317120552063, + "num_tokens": 355016465.0, + "step": 13717 + }, + { + "epoch": 1.506479244454206, + "grad_norm": 1.8917094469070435, + "learning_rate": 5e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7450283765792847, + "num_tokens": 355040000.0, + "step": 13718 + }, + { + "epoch": 1.5065890621568196, + "grad_norm": 1.698706030845642, + "learning_rate": 5e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7407999038696289, + "num_tokens": 355068698.0, + "step": 13719 + }, + { + "epoch": 1.5066988798594334, + "grad_norm": 2.1499786376953125, + "learning_rate": 5e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7350401878356934, + "num_tokens": 355089496.0, + "step": 13720 + }, + { + "epoch": 1.5068086975620472, + "grad_norm": 1.9742637872695923, + "learning_rate": 5e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7383467555046082, + "num_tokens": 355111265.0, + "step": 13721 + }, + { + "epoch": 1.5069185152646607, + "grad_norm": 2.296314001083374, + "learning_rate": 5e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7261685132980347, + "num_tokens": 355130857.0, + "step": 13722 + }, + { + "epoch": 1.5070283329672742, + "grad_norm": 1.986437439918518, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7477990984916687, + "num_tokens": 355152047.0, + "step": 13723 + }, + { + "epoch": 1.507138150669888, + "grad_norm": 1.9242244958877563, + "learning_rate": 5e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7266979217529297, + "num_tokens": 355176842.0, + "step": 13724 + }, + { + "epoch": 1.5072479683725017, + "grad_norm": 1.7541526556015015, + "learning_rate": 5e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7285386919975281, + "num_tokens": 355206273.0, + "step": 13725 + }, + { + "epoch": 1.5073577860751153, + "grad_norm": 1.9226003885269165, + "learning_rate": 5e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7300366163253784, + "num_tokens": 355231931.0, + "step": 13726 + }, + { + "epoch": 1.5074676037777288, + "grad_norm": 1.9010815620422363, + "learning_rate": 5e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7164726257324219, + "num_tokens": 355258587.0, + "step": 13727 + }, + { + "epoch": 1.5075774214803426, + "grad_norm": 1.6622174978256226, + "learning_rate": 5e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7422189712524414, + "num_tokens": 355288813.0, + "step": 13728 + }, + { + "epoch": 1.5076872391829563, + "grad_norm": 1.9389607906341553, + "learning_rate": 5e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7425430417060852, + "num_tokens": 355311073.0, + "step": 13729 + }, + { + "epoch": 1.50779705688557, + "grad_norm": 1.701820731163025, + "learning_rate": 5e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7332419157028198, + "num_tokens": 355338604.0, + "step": 13730 + }, + { + "epoch": 1.5079068745881836, + "grad_norm": 1.8588618040084839, + "learning_rate": 5e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7355575561523438, + "num_tokens": 355363987.0, + "step": 13731 + }, + { + "epoch": 1.5080166922907972, + "grad_norm": 1.9305267333984375, + "learning_rate": 5e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7365811467170715, + "num_tokens": 355389065.0, + "step": 13732 + }, + { + "epoch": 1.508126509993411, + "grad_norm": 1.6400429010391235, + "learning_rate": 5e-06, + "loss": 0.926, + "mean_token_accuracy": 0.724844217300415, + "num_tokens": 355419382.0, + "step": 13733 + }, + { + "epoch": 1.5082363276960247, + "grad_norm": 1.8036940097808838, + "learning_rate": 5e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7256505489349365, + "num_tokens": 355446816.0, + "step": 13734 + }, + { + "epoch": 1.5083461453986382, + "grad_norm": 2.18249773979187, + "learning_rate": 5e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7434357404708862, + "num_tokens": 355464244.0, + "step": 13735 + }, + { + "epoch": 1.508455963101252, + "grad_norm": 2.0074729919433594, + "learning_rate": 5e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7396617531776428, + "num_tokens": 355487195.0, + "step": 13736 + }, + { + "epoch": 1.5085657808038655, + "grad_norm": 1.9203466176986694, + "learning_rate": 5e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7232411503791809, + "num_tokens": 355511173.0, + "step": 13737 + }, + { + "epoch": 1.5086755985064793, + "grad_norm": 1.880143404006958, + "learning_rate": 5e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7267200946807861, + "num_tokens": 355538634.0, + "step": 13738 + }, + { + "epoch": 1.508785416209093, + "grad_norm": 1.8259252309799194, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7233017086982727, + "num_tokens": 355565199.0, + "step": 13739 + }, + { + "epoch": 1.5088952339117065, + "grad_norm": 1.6455161571502686, + "learning_rate": 5e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7419115304946899, + "num_tokens": 355594830.0, + "step": 13740 + }, + { + "epoch": 1.50900505161432, + "grad_norm": 1.6297601461410522, + "learning_rate": 5e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7446247339248657, + "num_tokens": 355624396.0, + "step": 13741 + }, + { + "epoch": 1.5091148693169338, + "grad_norm": 1.932157278060913, + "learning_rate": 5e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7267918586730957, + "num_tokens": 355646661.0, + "step": 13742 + }, + { + "epoch": 1.5092246870195476, + "grad_norm": 1.9857794046401978, + "learning_rate": 5e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7387533187866211, + "num_tokens": 355668930.0, + "step": 13743 + }, + { + "epoch": 1.5093345047221614, + "grad_norm": 1.8655344247817993, + "learning_rate": 5e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7194792628288269, + "num_tokens": 355694378.0, + "step": 13744 + }, + { + "epoch": 1.5094443224247749, + "grad_norm": 1.66990327835083, + "learning_rate": 5e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7223994731903076, + "num_tokens": 355725724.0, + "step": 13745 + }, + { + "epoch": 1.5095541401273884, + "grad_norm": 1.8246420621871948, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7347732782363892, + "num_tokens": 355752242.0, + "step": 13746 + }, + { + "epoch": 1.5096639578300022, + "grad_norm": 1.5753823518753052, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7402521967887878, + "num_tokens": 355785085.0, + "step": 13747 + }, + { + "epoch": 1.509773775532616, + "grad_norm": 1.9460978507995605, + "learning_rate": 5e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7304596900939941, + "num_tokens": 355809712.0, + "step": 13748 + }, + { + "epoch": 1.5098835932352295, + "grad_norm": 1.8272920846939087, + "learning_rate": 5e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.730289101600647, + "num_tokens": 355836144.0, + "step": 13749 + }, + { + "epoch": 1.5099934109378432, + "grad_norm": 1.795316457748413, + "learning_rate": 5e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7236175537109375, + "num_tokens": 355863874.0, + "step": 13750 + }, + { + "epoch": 1.5101032286404568, + "grad_norm": 1.9358878135681152, + "learning_rate": 5e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7381993532180786, + "num_tokens": 355886136.0, + "step": 13751 + }, + { + "epoch": 1.5102130463430705, + "grad_norm": 1.979958176612854, + "learning_rate": 5e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7319355607032776, + "num_tokens": 355908926.0, + "step": 13752 + }, + { + "epoch": 1.5103228640456843, + "grad_norm": 1.8850395679473877, + "learning_rate": 5e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7488220930099487, + "num_tokens": 355931933.0, + "step": 13753 + }, + { + "epoch": 1.5104326817482978, + "grad_norm": 1.7393560409545898, + "learning_rate": 5e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7132633328437805, + "num_tokens": 355960472.0, + "step": 13754 + }, + { + "epoch": 1.5105424994509113, + "grad_norm": 1.9597644805908203, + "learning_rate": 5e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7396997213363647, + "num_tokens": 355982211.0, + "step": 13755 + }, + { + "epoch": 1.510652317153525, + "grad_norm": 1.8840587139129639, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7163503170013428, + "num_tokens": 356007758.0, + "step": 13756 + }, + { + "epoch": 1.5107621348561389, + "grad_norm": 1.922214150428772, + "learning_rate": 5e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7541672587394714, + "num_tokens": 356030448.0, + "step": 13757 + }, + { + "epoch": 1.5108719525587526, + "grad_norm": 1.9650747776031494, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7274536490440369, + "num_tokens": 356053315.0, + "step": 13758 + }, + { + "epoch": 1.5109817702613662, + "grad_norm": 1.8678381443023682, + "learning_rate": 5e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7281994819641113, + "num_tokens": 356077605.0, + "step": 13759 + }, + { + "epoch": 1.5110915879639797, + "grad_norm": 1.8613135814666748, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.726803719997406, + "num_tokens": 356102312.0, + "step": 13760 + }, + { + "epoch": 1.5112014056665934, + "grad_norm": 1.9855679273605347, + "learning_rate": 5e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7421257495880127, + "num_tokens": 356122597.0, + "step": 13761 + }, + { + "epoch": 1.5113112233692072, + "grad_norm": 2.1899218559265137, + "learning_rate": 5e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7448329329490662, + "num_tokens": 356140699.0, + "step": 13762 + }, + { + "epoch": 1.5114210410718207, + "grad_norm": 1.7513656616210938, + "learning_rate": 5e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7390727996826172, + "num_tokens": 356168223.0, + "step": 13763 + }, + { + "epoch": 1.5115308587744345, + "grad_norm": 2.1165082454681396, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7685584425926208, + "num_tokens": 356186986.0, + "step": 13764 + }, + { + "epoch": 1.511640676477048, + "grad_norm": 1.8260776996612549, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7203084826469421, + "num_tokens": 356213870.0, + "step": 13765 + }, + { + "epoch": 1.5117504941796618, + "grad_norm": 1.919153094291687, + "learning_rate": 5e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7155221104621887, + "num_tokens": 356238731.0, + "step": 13766 + }, + { + "epoch": 1.5118603118822755, + "grad_norm": 1.5870286226272583, + "learning_rate": 5e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7064511179924011, + "num_tokens": 356274934.0, + "step": 13767 + }, + { + "epoch": 1.511970129584889, + "grad_norm": 1.8019931316375732, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7146647572517395, + "num_tokens": 356303088.0, + "step": 13768 + }, + { + "epoch": 1.5120799472875026, + "grad_norm": 2.031212568283081, + "learning_rate": 5e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7213828563690186, + "num_tokens": 356329248.0, + "step": 13769 + }, + { + "epoch": 1.5121897649901164, + "grad_norm": 1.8812216520309448, + "learning_rate": 5e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7140119671821594, + "num_tokens": 356354501.0, + "step": 13770 + }, + { + "epoch": 1.5122995826927301, + "grad_norm": 1.8935787677764893, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7382919788360596, + "num_tokens": 356376778.0, + "step": 13771 + }, + { + "epoch": 1.5124094003953439, + "grad_norm": 1.7498724460601807, + "learning_rate": 5e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7306290864944458, + "num_tokens": 356404215.0, + "step": 13772 + }, + { + "epoch": 1.5125192180979574, + "grad_norm": 1.8945003747940063, + "learning_rate": 5e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7068237066268921, + "num_tokens": 356429950.0, + "step": 13773 + }, + { + "epoch": 1.512629035800571, + "grad_norm": 1.7248258590698242, + "learning_rate": 5e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7277328372001648, + "num_tokens": 356457404.0, + "step": 13774 + }, + { + "epoch": 1.5127388535031847, + "grad_norm": 1.8267042636871338, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7340428233146667, + "num_tokens": 356484307.0, + "step": 13775 + }, + { + "epoch": 1.5128486712057985, + "grad_norm": 1.7584612369537354, + "learning_rate": 5e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7305916547775269, + "num_tokens": 356511726.0, + "step": 13776 + }, + { + "epoch": 1.512958488908412, + "grad_norm": 1.909438133239746, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7352402806282043, + "num_tokens": 356536435.0, + "step": 13777 + }, + { + "epoch": 1.5130683066110255, + "grad_norm": 1.7255710363388062, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7189463376998901, + "num_tokens": 356563397.0, + "step": 13778 + }, + { + "epoch": 1.5131781243136393, + "grad_norm": 1.7339869737625122, + "learning_rate": 5e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7399530410766602, + "num_tokens": 356591179.0, + "step": 13779 + }, + { + "epoch": 1.513287942016253, + "grad_norm": 1.594088077545166, + "learning_rate": 5e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7463542222976685, + "num_tokens": 356622674.0, + "step": 13780 + }, + { + "epoch": 1.5133977597188668, + "grad_norm": 2.139800548553467, + "learning_rate": 5e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7102667093276978, + "num_tokens": 356645988.0, + "step": 13781 + }, + { + "epoch": 1.5135075774214803, + "grad_norm": 1.9576268196105957, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7401068210601807, + "num_tokens": 356672150.0, + "step": 13782 + }, + { + "epoch": 1.5136173951240939, + "grad_norm": 1.8800026178359985, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7416592240333557, + "num_tokens": 356697043.0, + "step": 13783 + }, + { + "epoch": 1.5137272128267076, + "grad_norm": 1.8418645858764648, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7205597758293152, + "num_tokens": 356721351.0, + "step": 13784 + }, + { + "epoch": 1.5138370305293214, + "grad_norm": 1.8976967334747314, + "learning_rate": 5e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7321233153343201, + "num_tokens": 356746772.0, + "step": 13785 + }, + { + "epoch": 1.5139468482319351, + "grad_norm": 1.7374005317687988, + "learning_rate": 5e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7512229681015015, + "num_tokens": 356773642.0, + "step": 13786 + }, + { + "epoch": 1.5140566659345487, + "grad_norm": 1.7758969068527222, + "learning_rate": 5e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.71421879529953, + "num_tokens": 356802703.0, + "step": 13787 + }, + { + "epoch": 1.5141664836371622, + "grad_norm": 1.7072352170944214, + "learning_rate": 5e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.744247317314148, + "num_tokens": 356828984.0, + "step": 13788 + }, + { + "epoch": 1.514276301339776, + "grad_norm": 1.6980810165405273, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7466915845870972, + "num_tokens": 356855532.0, + "step": 13789 + }, + { + "epoch": 1.5143861190423897, + "grad_norm": 1.8793537616729736, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.76023930311203, + "num_tokens": 356878074.0, + "step": 13790 + }, + { + "epoch": 1.5144959367450033, + "grad_norm": 1.674116849899292, + "learning_rate": 5e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.713219404220581, + "num_tokens": 356909749.0, + "step": 13791 + }, + { + "epoch": 1.5146057544476168, + "grad_norm": 1.929122805595398, + "learning_rate": 5e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7366908192634583, + "num_tokens": 356934157.0, + "step": 13792 + }, + { + "epoch": 1.5147155721502306, + "grad_norm": 1.8063536882400513, + "learning_rate": 5e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7199215888977051, + "num_tokens": 356962386.0, + "step": 13793 + }, + { + "epoch": 1.5148253898528443, + "grad_norm": 1.7720849514007568, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7304404973983765, + "num_tokens": 356990211.0, + "step": 13794 + }, + { + "epoch": 1.514935207555458, + "grad_norm": 2.007108449935913, + "learning_rate": 5e-06, + "loss": 0.7962, + "mean_token_accuracy": 0.7405927777290344, + "num_tokens": 357012147.0, + "step": 13795 + }, + { + "epoch": 1.5150450252580716, + "grad_norm": 1.662757396697998, + "learning_rate": 5e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7393890619277954, + "num_tokens": 357042535.0, + "step": 13796 + }, + { + "epoch": 1.5151548429606851, + "grad_norm": 1.8679107427597046, + "learning_rate": 5e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7194373607635498, + "num_tokens": 357068464.0, + "step": 13797 + }, + { + "epoch": 1.515264660663299, + "grad_norm": 1.8202159404754639, + "learning_rate": 5e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7312396168708801, + "num_tokens": 357094818.0, + "step": 13798 + }, + { + "epoch": 1.5153744783659127, + "grad_norm": 1.8836750984191895, + "learning_rate": 5e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7425059080123901, + "num_tokens": 357118826.0, + "step": 13799 + }, + { + "epoch": 1.5154842960685262, + "grad_norm": 1.8634696006774902, + "learning_rate": 5e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7202643156051636, + "num_tokens": 357144530.0, + "step": 13800 + }, + { + "epoch": 1.51559411377114, + "grad_norm": 1.7888377904891968, + "learning_rate": 5e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.744606614112854, + "num_tokens": 357168276.0, + "step": 13801 + }, + { + "epoch": 1.5157039314737535, + "grad_norm": 2.0190064907073975, + "learning_rate": 5e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7395941615104675, + "num_tokens": 357190360.0, + "step": 13802 + }, + { + "epoch": 1.5158137491763672, + "grad_norm": 1.7447426319122314, + "learning_rate": 5e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7413233518600464, + "num_tokens": 357218692.0, + "step": 13803 + }, + { + "epoch": 1.515923566878981, + "grad_norm": 2.012054920196533, + "learning_rate": 5e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7416986227035522, + "num_tokens": 357241750.0, + "step": 13804 + }, + { + "epoch": 1.5160333845815945, + "grad_norm": 1.7410403490066528, + "learning_rate": 5e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7497296929359436, + "num_tokens": 357268948.0, + "step": 13805 + }, + { + "epoch": 1.516143202284208, + "grad_norm": 1.9083831310272217, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7215707302093506, + "num_tokens": 357292968.0, + "step": 13806 + }, + { + "epoch": 1.5162530199868218, + "grad_norm": 1.7120497226715088, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7216248512268066, + "num_tokens": 357322011.0, + "step": 13807 + }, + { + "epoch": 1.5163628376894356, + "grad_norm": 1.8771791458129883, + "learning_rate": 5e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7652290463447571, + "num_tokens": 357345993.0, + "step": 13808 + }, + { + "epoch": 1.5164726553920493, + "grad_norm": 1.827523946762085, + "learning_rate": 5e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7144342064857483, + "num_tokens": 357373985.0, + "step": 13809 + }, + { + "epoch": 1.5165824730946629, + "grad_norm": 1.9407254457473755, + "learning_rate": 5e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7035272717475891, + "num_tokens": 357401408.0, + "step": 13810 + }, + { + "epoch": 1.5166922907972764, + "grad_norm": 1.663430094718933, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7193596363067627, + "num_tokens": 357436935.0, + "step": 13811 + }, + { + "epoch": 1.5168021084998902, + "grad_norm": 1.6886011362075806, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7164785861968994, + "num_tokens": 357466968.0, + "step": 13812 + }, + { + "epoch": 1.516911926202504, + "grad_norm": 2.1321914196014404, + "learning_rate": 5e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7519060969352722, + "num_tokens": 357485482.0, + "step": 13813 + }, + { + "epoch": 1.5170217439051175, + "grad_norm": 1.8610800504684448, + "learning_rate": 5e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7455363273620605, + "num_tokens": 357510844.0, + "step": 13814 + }, + { + "epoch": 1.5171315616077312, + "grad_norm": 1.6893649101257324, + "learning_rate": 5e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7198994159698486, + "num_tokens": 357541775.0, + "step": 13815 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 1.8158860206604004, + "learning_rate": 5e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7240090370178223, + "num_tokens": 357568660.0, + "step": 13816 + }, + { + "epoch": 1.5173511970129585, + "grad_norm": 1.765264868736267, + "learning_rate": 5e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6918920278549194, + "num_tokens": 357599675.0, + "step": 13817 + }, + { + "epoch": 1.5174610147155723, + "grad_norm": 1.8373867273330688, + "learning_rate": 5e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7241623401641846, + "num_tokens": 357624824.0, + "step": 13818 + }, + { + "epoch": 1.5175708324181858, + "grad_norm": 1.6890026330947876, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7215242385864258, + "num_tokens": 357655828.0, + "step": 13819 + }, + { + "epoch": 1.5176806501207993, + "grad_norm": 1.778171181678772, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7335294485092163, + "num_tokens": 357681130.0, + "step": 13820 + }, + { + "epoch": 1.517790467823413, + "grad_norm": 1.633742332458496, + "learning_rate": 5e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7083187103271484, + "num_tokens": 357711824.0, + "step": 13821 + }, + { + "epoch": 1.5179002855260268, + "grad_norm": 1.7455991506576538, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7388018369674683, + "num_tokens": 357737450.0, + "step": 13822 + }, + { + "epoch": 1.5180101032286406, + "grad_norm": 2.187511682510376, + "learning_rate": 5e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7422767877578735, + "num_tokens": 357758458.0, + "step": 13823 + }, + { + "epoch": 1.5181199209312541, + "grad_norm": 1.8131910562515259, + "learning_rate": 5e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7492305636405945, + "num_tokens": 357783392.0, + "step": 13824 + }, + { + "epoch": 1.5182297386338677, + "grad_norm": 1.8200809955596924, + "learning_rate": 5e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.707496166229248, + "num_tokens": 357813220.0, + "step": 13825 + }, + { + "epoch": 1.5183395563364814, + "grad_norm": 1.9390580654144287, + "learning_rate": 5e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7523126602172852, + "num_tokens": 357837190.0, + "step": 13826 + }, + { + "epoch": 1.5184493740390952, + "grad_norm": 1.8551263809204102, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7262833118438721, + "num_tokens": 357861795.0, + "step": 13827 + }, + { + "epoch": 1.5185591917417087, + "grad_norm": 1.749739408493042, + "learning_rate": 5e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7128855586051941, + "num_tokens": 357889244.0, + "step": 13828 + }, + { + "epoch": 1.5186690094443223, + "grad_norm": 2.232034683227539, + "learning_rate": 5e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7358316779136658, + "num_tokens": 357909713.0, + "step": 13829 + }, + { + "epoch": 1.518778827146936, + "grad_norm": 1.8508433103561401, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7411477565765381, + "num_tokens": 357936478.0, + "step": 13830 + }, + { + "epoch": 1.5188886448495498, + "grad_norm": 1.9773168563842773, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7325530648231506, + "num_tokens": 357960202.0, + "step": 13831 + }, + { + "epoch": 1.5189984625521635, + "grad_norm": 1.7583409547805786, + "learning_rate": 5e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7183308601379395, + "num_tokens": 357988405.0, + "step": 13832 + }, + { + "epoch": 1.519108280254777, + "grad_norm": 1.6881355047225952, + "learning_rate": 5e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7181575894355774, + "num_tokens": 358019550.0, + "step": 13833 + }, + { + "epoch": 1.5192180979573906, + "grad_norm": 1.8316447734832764, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7300647497177124, + "num_tokens": 358047791.0, + "step": 13834 + }, + { + "epoch": 1.5193279156600044, + "grad_norm": 2.0748605728149414, + "learning_rate": 5e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7100355625152588, + "num_tokens": 358075566.0, + "step": 13835 + }, + { + "epoch": 1.5194377333626181, + "grad_norm": 1.916306972503662, + "learning_rate": 5e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7452988028526306, + "num_tokens": 358099535.0, + "step": 13836 + }, + { + "epoch": 1.5195475510652319, + "grad_norm": 1.7469043731689453, + "learning_rate": 5e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7390832901000977, + "num_tokens": 358125862.0, + "step": 13837 + }, + { + "epoch": 1.5196573687678454, + "grad_norm": 1.8518810272216797, + "learning_rate": 5e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7190995216369629, + "num_tokens": 358150776.0, + "step": 13838 + }, + { + "epoch": 1.519767186470459, + "grad_norm": 1.6498079299926758, + "learning_rate": 5e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7266663312911987, + "num_tokens": 358181424.0, + "step": 13839 + }, + { + "epoch": 1.5198770041730727, + "grad_norm": 1.968773603439331, + "learning_rate": 5e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7231303453445435, + "num_tokens": 358204852.0, + "step": 13840 + }, + { + "epoch": 1.5199868218756865, + "grad_norm": 1.7336719036102295, + "learning_rate": 5e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7318677306175232, + "num_tokens": 358232149.0, + "step": 13841 + }, + { + "epoch": 1.5200966395783, + "grad_norm": 1.8295701742172241, + "learning_rate": 5e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7274250984191895, + "num_tokens": 358258279.0, + "step": 13842 + }, + { + "epoch": 1.5202064572809135, + "grad_norm": 1.9242618083953857, + "learning_rate": 5e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7489890456199646, + "num_tokens": 358281607.0, + "step": 13843 + }, + { + "epoch": 1.5203162749835273, + "grad_norm": 1.7238399982452393, + "learning_rate": 5e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7175465822219849, + "num_tokens": 358310032.0, + "step": 13844 + }, + { + "epoch": 1.520426092686141, + "grad_norm": 2.3341305255889893, + "learning_rate": 5e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7322688102722168, + "num_tokens": 358327912.0, + "step": 13845 + }, + { + "epoch": 1.5205359103887548, + "grad_norm": 1.9452500343322754, + "learning_rate": 5e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7465506792068481, + "num_tokens": 358352400.0, + "step": 13846 + }, + { + "epoch": 1.5206457280913683, + "grad_norm": 1.8889994621276855, + "learning_rate": 5e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7362908124923706, + "num_tokens": 358376105.0, + "step": 13847 + }, + { + "epoch": 1.5207555457939819, + "grad_norm": 1.7999224662780762, + "learning_rate": 5e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7488742470741272, + "num_tokens": 358402017.0, + "step": 13848 + }, + { + "epoch": 1.5208653634965956, + "grad_norm": 1.8133388757705688, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7309114336967468, + "num_tokens": 358428876.0, + "step": 13849 + }, + { + "epoch": 1.5209751811992094, + "grad_norm": 2.030878782272339, + "learning_rate": 5e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7035911679267883, + "num_tokens": 358454354.0, + "step": 13850 + }, + { + "epoch": 1.5210849989018231, + "grad_norm": 1.6637518405914307, + "learning_rate": 5e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7246345281600952, + "num_tokens": 358483960.0, + "step": 13851 + }, + { + "epoch": 1.5211948166044367, + "grad_norm": 2.0720441341400146, + "learning_rate": 5e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.7408216595649719, + "num_tokens": 358506407.0, + "step": 13852 + }, + { + "epoch": 1.5213046343070502, + "grad_norm": 1.8654954433441162, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7466713190078735, + "num_tokens": 358530810.0, + "step": 13853 + }, + { + "epoch": 1.521414452009664, + "grad_norm": 1.9402799606323242, + "learning_rate": 5e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7128804326057434, + "num_tokens": 358555020.0, + "step": 13854 + }, + { + "epoch": 1.5215242697122777, + "grad_norm": 1.8896737098693848, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7342329025268555, + "num_tokens": 358579738.0, + "step": 13855 + }, + { + "epoch": 1.5216340874148913, + "grad_norm": 1.7782782316207886, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7590363025665283, + "num_tokens": 358606030.0, + "step": 13856 + }, + { + "epoch": 1.5217439051175048, + "grad_norm": 1.7271289825439453, + "learning_rate": 5e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.722380518913269, + "num_tokens": 358636440.0, + "step": 13857 + }, + { + "epoch": 1.5218537228201185, + "grad_norm": 1.9110026359558105, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7536758184432983, + "num_tokens": 358658919.0, + "step": 13858 + }, + { + "epoch": 1.5219635405227323, + "grad_norm": 1.9792189598083496, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7370151281356812, + "num_tokens": 358682228.0, + "step": 13859 + }, + { + "epoch": 1.522073358225346, + "grad_norm": 1.6674058437347412, + "learning_rate": 5e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7200205326080322, + "num_tokens": 358715607.0, + "step": 13860 + }, + { + "epoch": 1.5221831759279596, + "grad_norm": 1.6405203342437744, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7304263710975647, + "num_tokens": 358749533.0, + "step": 13861 + }, + { + "epoch": 1.5222929936305731, + "grad_norm": 1.8355228900909424, + "learning_rate": 5e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7474404573440552, + "num_tokens": 358774535.0, + "step": 13862 + }, + { + "epoch": 1.522402811333187, + "grad_norm": 1.8903933763504028, + "learning_rate": 5e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7299932837486267, + "num_tokens": 358800805.0, + "step": 13863 + }, + { + "epoch": 1.5225126290358006, + "grad_norm": 1.730605125427246, + "learning_rate": 5e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7426572442054749, + "num_tokens": 358827789.0, + "step": 13864 + }, + { + "epoch": 1.5226224467384142, + "grad_norm": 1.829164743423462, + "learning_rate": 5e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7423925399780273, + "num_tokens": 358853951.0, + "step": 13865 + }, + { + "epoch": 1.522732264441028, + "grad_norm": 1.7926054000854492, + "learning_rate": 5e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7262086868286133, + "num_tokens": 358880287.0, + "step": 13866 + }, + { + "epoch": 1.5228420821436415, + "grad_norm": 1.6555976867675781, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.734856128692627, + "num_tokens": 358910663.0, + "step": 13867 + }, + { + "epoch": 1.5229518998462552, + "grad_norm": 2.0298759937286377, + "learning_rate": 5e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7417764663696289, + "num_tokens": 358931320.0, + "step": 13868 + }, + { + "epoch": 1.523061717548869, + "grad_norm": 1.7711763381958008, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7408139705657959, + "num_tokens": 358957195.0, + "step": 13869 + }, + { + "epoch": 1.5231715352514825, + "grad_norm": 1.9417723417282104, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7375562191009521, + "num_tokens": 358983733.0, + "step": 13870 + }, + { + "epoch": 1.523281352954096, + "grad_norm": 1.910994291305542, + "learning_rate": 5e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7416402101516724, + "num_tokens": 359007644.0, + "step": 13871 + }, + { + "epoch": 1.5233911706567098, + "grad_norm": 1.8639894723892212, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.724940299987793, + "num_tokens": 359033440.0, + "step": 13872 + }, + { + "epoch": 1.5235009883593236, + "grad_norm": 1.7182484865188599, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.735410213470459, + "num_tokens": 359062696.0, + "step": 13873 + }, + { + "epoch": 1.5236108060619373, + "grad_norm": 1.6293878555297852, + "learning_rate": 5e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7268648147583008, + "num_tokens": 359093362.0, + "step": 13874 + }, + { + "epoch": 1.5237206237645509, + "grad_norm": 1.9823822975158691, + "learning_rate": 5e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7150380611419678, + "num_tokens": 359120629.0, + "step": 13875 + }, + { + "epoch": 1.5238304414671644, + "grad_norm": 1.886249303817749, + "learning_rate": 5e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7131614685058594, + "num_tokens": 359145160.0, + "step": 13876 + }, + { + "epoch": 1.5239402591697782, + "grad_norm": 1.8660656213760376, + "learning_rate": 5e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7334834337234497, + "num_tokens": 359168875.0, + "step": 13877 + }, + { + "epoch": 1.524050076872392, + "grad_norm": 1.8462392091751099, + "learning_rate": 5e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7572144269943237, + "num_tokens": 359193016.0, + "step": 13878 + }, + { + "epoch": 1.5241598945750054, + "grad_norm": 1.7014636993408203, + "learning_rate": 5e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7254390716552734, + "num_tokens": 359220976.0, + "step": 13879 + }, + { + "epoch": 1.5242697122776192, + "grad_norm": 2.0610291957855225, + "learning_rate": 5e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7265125513076782, + "num_tokens": 359242818.0, + "step": 13880 + }, + { + "epoch": 1.5243795299802327, + "grad_norm": 1.9533741474151611, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7425839304924011, + "num_tokens": 359268496.0, + "step": 13881 + }, + { + "epoch": 1.5244893476828465, + "grad_norm": 1.965442419052124, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7251760363578796, + "num_tokens": 359291429.0, + "step": 13882 + }, + { + "epoch": 1.5245991653854603, + "grad_norm": 1.8680083751678467, + "learning_rate": 5e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.740750253200531, + "num_tokens": 359315312.0, + "step": 13883 + }, + { + "epoch": 1.5247089830880738, + "grad_norm": 1.7965103387832642, + "learning_rate": 5e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7234756946563721, + "num_tokens": 359342703.0, + "step": 13884 + }, + { + "epoch": 1.5248188007906873, + "grad_norm": 2.0356593132019043, + "learning_rate": 5e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7219641208648682, + "num_tokens": 359365809.0, + "step": 13885 + }, + { + "epoch": 1.524928618493301, + "grad_norm": 1.9539247751235962, + "learning_rate": 5e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7362070083618164, + "num_tokens": 359390984.0, + "step": 13886 + }, + { + "epoch": 1.5250384361959148, + "grad_norm": 1.4955569505691528, + "learning_rate": 5e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.712875247001648, + "num_tokens": 359426343.0, + "step": 13887 + }, + { + "epoch": 1.5251482538985286, + "grad_norm": 2.0055291652679443, + "learning_rate": 5e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7181569933891296, + "num_tokens": 359448583.0, + "step": 13888 + }, + { + "epoch": 1.5252580716011421, + "grad_norm": 1.6009811162948608, + "learning_rate": 5e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.6979358196258545, + "num_tokens": 359483561.0, + "step": 13889 + }, + { + "epoch": 1.5253678893037557, + "grad_norm": 1.8986233472824097, + "learning_rate": 5e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.746206521987915, + "num_tokens": 359507445.0, + "step": 13890 + }, + { + "epoch": 1.5254777070063694, + "grad_norm": 1.8029208183288574, + "learning_rate": 5e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7132532000541687, + "num_tokens": 359538506.0, + "step": 13891 + }, + { + "epoch": 1.5255875247089832, + "grad_norm": 2.102919101715088, + "learning_rate": 5e-06, + "loss": 0.757, + "mean_token_accuracy": 0.755273699760437, + "num_tokens": 359558417.0, + "step": 13892 + }, + { + "epoch": 1.5256973424115967, + "grad_norm": 1.9593658447265625, + "learning_rate": 5e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7358533143997192, + "num_tokens": 359580460.0, + "step": 13893 + }, + { + "epoch": 1.5258071601142102, + "grad_norm": 1.9785590171813965, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7221386432647705, + "num_tokens": 359603247.0, + "step": 13894 + }, + { + "epoch": 1.525916977816824, + "grad_norm": 1.8711364269256592, + "learning_rate": 5e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.734785258769989, + "num_tokens": 359628718.0, + "step": 13895 + }, + { + "epoch": 1.5260267955194378, + "grad_norm": 1.8455941677093506, + "learning_rate": 5e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.7518266439437866, + "num_tokens": 359651109.0, + "step": 13896 + }, + { + "epoch": 1.5261366132220515, + "grad_norm": 1.576635718345642, + "learning_rate": 5e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7254283428192139, + "num_tokens": 359681961.0, + "step": 13897 + }, + { + "epoch": 1.526246430924665, + "grad_norm": 1.88563072681427, + "learning_rate": 5e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.755459189414978, + "num_tokens": 359708217.0, + "step": 13898 + }, + { + "epoch": 1.5263562486272786, + "grad_norm": 2.001217842102051, + "learning_rate": 5e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7398704290390015, + "num_tokens": 359728876.0, + "step": 13899 + }, + { + "epoch": 1.5264660663298923, + "grad_norm": 1.759854793548584, + "learning_rate": 5e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7410862445831299, + "num_tokens": 359755926.0, + "step": 13900 + }, + { + "epoch": 1.526575884032506, + "grad_norm": 1.9391217231750488, + "learning_rate": 5e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7579574584960938, + "num_tokens": 359779145.0, + "step": 13901 + }, + { + "epoch": 1.5266857017351199, + "grad_norm": 1.8002848625183105, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7566015720367432, + "num_tokens": 359803037.0, + "step": 13902 + }, + { + "epoch": 1.5267955194377334, + "grad_norm": 1.7636069059371948, + "learning_rate": 5e-06, + "loss": 0.959, + "mean_token_accuracy": 0.6989030838012695, + "num_tokens": 359831666.0, + "step": 13903 + }, + { + "epoch": 1.526905337140347, + "grad_norm": 2.1334848403930664, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7474521994590759, + "num_tokens": 359851870.0, + "step": 13904 + }, + { + "epoch": 1.5270151548429607, + "grad_norm": 1.7487540245056152, + "learning_rate": 5e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7205577492713928, + "num_tokens": 359883549.0, + "step": 13905 + }, + { + "epoch": 1.5271249725455744, + "grad_norm": 1.8701210021972656, + "learning_rate": 5e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7134144306182861, + "num_tokens": 359912225.0, + "step": 13906 + }, + { + "epoch": 1.527234790248188, + "grad_norm": 1.777541995048523, + "learning_rate": 5e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.7483261823654175, + "num_tokens": 359939385.0, + "step": 13907 + }, + { + "epoch": 1.5273446079508015, + "grad_norm": 1.8063762187957764, + "learning_rate": 5e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7300238609313965, + "num_tokens": 359965279.0, + "step": 13908 + }, + { + "epoch": 1.5274544256534153, + "grad_norm": 1.8175843954086304, + "learning_rate": 5e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7246426939964294, + "num_tokens": 359992279.0, + "step": 13909 + }, + { + "epoch": 1.527564243356029, + "grad_norm": 2.0523781776428223, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7424585819244385, + "num_tokens": 360014657.0, + "step": 13910 + }, + { + "epoch": 1.5276740610586428, + "grad_norm": 1.9892243146896362, + "learning_rate": 5e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7430048584938049, + "num_tokens": 360037981.0, + "step": 13911 + }, + { + "epoch": 1.5277838787612563, + "grad_norm": 2.0131564140319824, + "learning_rate": 5e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.748140811920166, + "num_tokens": 360059312.0, + "step": 13912 + }, + { + "epoch": 1.5278936964638699, + "grad_norm": 1.7527098655700684, + "learning_rate": 5e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7195233702659607, + "num_tokens": 360093139.0, + "step": 13913 + }, + { + "epoch": 1.5280035141664836, + "grad_norm": 2.0075931549072266, + "learning_rate": 5e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7005434632301331, + "num_tokens": 360119118.0, + "step": 13914 + }, + { + "epoch": 1.5281133318690974, + "grad_norm": 1.918650507926941, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7172868251800537, + "num_tokens": 360144457.0, + "step": 13915 + }, + { + "epoch": 1.5282231495717111, + "grad_norm": 1.8573992252349854, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7231149673461914, + "num_tokens": 360171404.0, + "step": 13916 + }, + { + "epoch": 1.5283329672743247, + "grad_norm": 1.739995002746582, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7325981259346008, + "num_tokens": 360200260.0, + "step": 13917 + }, + { + "epoch": 1.5284427849769382, + "grad_norm": 1.908846139907837, + "learning_rate": 5e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.729354977607727, + "num_tokens": 360224713.0, + "step": 13918 + }, + { + "epoch": 1.528552602679552, + "grad_norm": 1.7150615453720093, + "learning_rate": 5e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.747282087802887, + "num_tokens": 360250994.0, + "step": 13919 + }, + { + "epoch": 1.5286624203821657, + "grad_norm": 1.8356841802597046, + "learning_rate": 5e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7154256701469421, + "num_tokens": 360275592.0, + "step": 13920 + }, + { + "epoch": 1.5287722380847792, + "grad_norm": 1.8515695333480835, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.714512050151825, + "num_tokens": 360301228.0, + "step": 13921 + }, + { + "epoch": 1.5288820557873928, + "grad_norm": 1.8197293281555176, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.726300060749054, + "num_tokens": 360325739.0, + "step": 13922 + }, + { + "epoch": 1.5289918734900065, + "grad_norm": 1.8181796073913574, + "learning_rate": 5e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7385679483413696, + "num_tokens": 360351374.0, + "step": 13923 + }, + { + "epoch": 1.5291016911926203, + "grad_norm": 1.823613166809082, + "learning_rate": 5e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7398425340652466, + "num_tokens": 360378060.0, + "step": 13924 + }, + { + "epoch": 1.529211508895234, + "grad_norm": 1.7996968030929565, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7316399216651917, + "num_tokens": 360403630.0, + "step": 13925 + }, + { + "epoch": 1.5293213265978476, + "grad_norm": 1.7306444644927979, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7392856478691101, + "num_tokens": 360430407.0, + "step": 13926 + }, + { + "epoch": 1.5294311443004611, + "grad_norm": 1.7753746509552002, + "learning_rate": 5e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7308647632598877, + "num_tokens": 360461168.0, + "step": 13927 + }, + { + "epoch": 1.5295409620030749, + "grad_norm": 1.9092847108840942, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7178918719291687, + "num_tokens": 360488783.0, + "step": 13928 + }, + { + "epoch": 1.5296507797056886, + "grad_norm": 1.900087594985962, + "learning_rate": 5e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7171467542648315, + "num_tokens": 360514553.0, + "step": 13929 + }, + { + "epoch": 1.5297605974083022, + "grad_norm": 1.6933293342590332, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7353417277336121, + "num_tokens": 360544013.0, + "step": 13930 + }, + { + "epoch": 1.529870415110916, + "grad_norm": 1.7286337614059448, + "learning_rate": 5e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7383533716201782, + "num_tokens": 360573075.0, + "step": 13931 + }, + { + "epoch": 1.5299802328135295, + "grad_norm": 1.7207541465759277, + "learning_rate": 5e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7049496173858643, + "num_tokens": 360603568.0, + "step": 13932 + }, + { + "epoch": 1.5300900505161432, + "grad_norm": 1.841833233833313, + "learning_rate": 5e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7420142889022827, + "num_tokens": 360630479.0, + "step": 13933 + }, + { + "epoch": 1.530199868218757, + "grad_norm": 1.6263580322265625, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7355101108551025, + "num_tokens": 360665264.0, + "step": 13934 + }, + { + "epoch": 1.5303096859213705, + "grad_norm": 1.8049900531768799, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7156075239181519, + "num_tokens": 360691798.0, + "step": 13935 + }, + { + "epoch": 1.530419503623984, + "grad_norm": 1.6935545206069946, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7285869121551514, + "num_tokens": 360721178.0, + "step": 13936 + }, + { + "epoch": 1.5305293213265978, + "grad_norm": 1.7314690351486206, + "learning_rate": 5e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7063102722167969, + "num_tokens": 360751573.0, + "step": 13937 + }, + { + "epoch": 1.5306391390292116, + "grad_norm": 1.8648773431777954, + "learning_rate": 5e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.74238121509552, + "num_tokens": 360775894.0, + "step": 13938 + }, + { + "epoch": 1.5307489567318253, + "grad_norm": 1.8203984498977661, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7232240438461304, + "num_tokens": 360802339.0, + "step": 13939 + }, + { + "epoch": 1.5308587744344389, + "grad_norm": 1.8535646200180054, + "learning_rate": 5e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7519757747650146, + "num_tokens": 360826922.0, + "step": 13940 + }, + { + "epoch": 1.5309685921370524, + "grad_norm": 1.9520549774169922, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7152843475341797, + "num_tokens": 360852318.0, + "step": 13941 + }, + { + "epoch": 1.5310784098396661, + "grad_norm": 1.894504427909851, + "learning_rate": 5e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7286738157272339, + "num_tokens": 360877789.0, + "step": 13942 + }, + { + "epoch": 1.53118822754228, + "grad_norm": 1.9954172372817993, + "learning_rate": 5e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7617683410644531, + "num_tokens": 360897860.0, + "step": 13943 + }, + { + "epoch": 1.5312980452448934, + "grad_norm": 1.6803094148635864, + "learning_rate": 5e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.73479163646698, + "num_tokens": 360928264.0, + "step": 13944 + }, + { + "epoch": 1.5314078629475072, + "grad_norm": 1.8823508024215698, + "learning_rate": 5e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7256970405578613, + "num_tokens": 360953209.0, + "step": 13945 + }, + { + "epoch": 1.5315176806501207, + "grad_norm": 1.959763526916504, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7461779117584229, + "num_tokens": 360974827.0, + "step": 13946 + }, + { + "epoch": 1.5316274983527345, + "grad_norm": 1.8009376525878906, + "learning_rate": 5e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7573164105415344, + "num_tokens": 361000300.0, + "step": 13947 + }, + { + "epoch": 1.5317373160553482, + "grad_norm": 1.7213027477264404, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7357326745986938, + "num_tokens": 361027944.0, + "step": 13948 + }, + { + "epoch": 1.5318471337579618, + "grad_norm": 1.6656206846237183, + "learning_rate": 5e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7304065227508545, + "num_tokens": 361057718.0, + "step": 13949 + }, + { + "epoch": 1.5319569514605753, + "grad_norm": 1.9231712818145752, + "learning_rate": 5e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7236084938049316, + "num_tokens": 361082372.0, + "step": 13950 + }, + { + "epoch": 1.532066769163189, + "grad_norm": 1.9386895895004272, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7383699417114258, + "num_tokens": 361103914.0, + "step": 13951 + }, + { + "epoch": 1.5321765868658028, + "grad_norm": 1.9589241743087769, + "learning_rate": 5e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7309228181838989, + "num_tokens": 361129220.0, + "step": 13952 + }, + { + "epoch": 1.5322864045684166, + "grad_norm": 1.792680025100708, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7348435521125793, + "num_tokens": 361156936.0, + "step": 13953 + }, + { + "epoch": 1.5323962222710301, + "grad_norm": 1.919271469116211, + "learning_rate": 5e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7380671501159668, + "num_tokens": 361180338.0, + "step": 13954 + }, + { + "epoch": 1.5325060399736437, + "grad_norm": 1.8167409896850586, + "learning_rate": 5e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7292851805686951, + "num_tokens": 361206053.0, + "step": 13955 + }, + { + "epoch": 1.5326158576762574, + "grad_norm": 1.926029086112976, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7284083962440491, + "num_tokens": 361229618.0, + "step": 13956 + }, + { + "epoch": 1.5327256753788712, + "grad_norm": 1.769845962524414, + "learning_rate": 5e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7214243412017822, + "num_tokens": 361260630.0, + "step": 13957 + }, + { + "epoch": 1.5328354930814847, + "grad_norm": 1.7865424156188965, + "learning_rate": 5e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7212867736816406, + "num_tokens": 361287209.0, + "step": 13958 + }, + { + "epoch": 1.5329453107840982, + "grad_norm": 1.9753600358963013, + "learning_rate": 5e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.74069744348526, + "num_tokens": 361310122.0, + "step": 13959 + }, + { + "epoch": 1.533055128486712, + "grad_norm": 2.0414984226226807, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.751092255115509, + "num_tokens": 361330338.0, + "step": 13960 + }, + { + "epoch": 1.5331649461893258, + "grad_norm": 2.1518797874450684, + "learning_rate": 5e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7090268135070801, + "num_tokens": 361351483.0, + "step": 13961 + }, + { + "epoch": 1.5332747638919395, + "grad_norm": 1.7891342639923096, + "learning_rate": 5e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7358067035675049, + "num_tokens": 361378823.0, + "step": 13962 + }, + { + "epoch": 1.533384581594553, + "grad_norm": 1.862792730331421, + "learning_rate": 5e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7329859733581543, + "num_tokens": 361401625.0, + "step": 13963 + }, + { + "epoch": 1.5334943992971666, + "grad_norm": 1.8128725290298462, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7314690351486206, + "num_tokens": 361428876.0, + "step": 13964 + }, + { + "epoch": 1.5336042169997803, + "grad_norm": 1.6149898767471313, + "learning_rate": 5e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7169406414031982, + "num_tokens": 361461194.0, + "step": 13965 + }, + { + "epoch": 1.533714034702394, + "grad_norm": 1.9135453701019287, + "learning_rate": 5e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7179116010665894, + "num_tokens": 361483996.0, + "step": 13966 + }, + { + "epoch": 1.5338238524050078, + "grad_norm": 1.8800305128097534, + "learning_rate": 5e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7142054438591003, + "num_tokens": 361511321.0, + "step": 13967 + }, + { + "epoch": 1.5339336701076214, + "grad_norm": 1.7506824731826782, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7284339070320129, + "num_tokens": 361538295.0, + "step": 13968 + }, + { + "epoch": 1.534043487810235, + "grad_norm": 2.0347535610198975, + "learning_rate": 5e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7395703792572021, + "num_tokens": 361560022.0, + "step": 13969 + }, + { + "epoch": 1.5341533055128487, + "grad_norm": 1.7029675245285034, + "learning_rate": 5e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7404168844223022, + "num_tokens": 361589270.0, + "step": 13970 + }, + { + "epoch": 1.5342631232154624, + "grad_norm": 1.7407944202423096, + "learning_rate": 5e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7150682806968689, + "num_tokens": 361616233.0, + "step": 13971 + }, + { + "epoch": 1.534372940918076, + "grad_norm": 1.7058569192886353, + "learning_rate": 5e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7546942234039307, + "num_tokens": 361642874.0, + "step": 13972 + }, + { + "epoch": 1.5344827586206895, + "grad_norm": 1.827775478363037, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7421165704727173, + "num_tokens": 361666812.0, + "step": 13973 + }, + { + "epoch": 1.5345925763233033, + "grad_norm": 1.810344934463501, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7502826452255249, + "num_tokens": 361691979.0, + "step": 13974 + }, + { + "epoch": 1.534702394025917, + "grad_norm": 1.9703190326690674, + "learning_rate": 5e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7331660389900208, + "num_tokens": 361717112.0, + "step": 13975 + }, + { + "epoch": 1.5348122117285308, + "grad_norm": 1.6098536252975464, + "learning_rate": 5e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7218775749206543, + "num_tokens": 361748292.0, + "step": 13976 + }, + { + "epoch": 1.5349220294311443, + "grad_norm": 1.5944393873214722, + "learning_rate": 5e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7406858205795288, + "num_tokens": 361778894.0, + "step": 13977 + }, + { + "epoch": 1.5350318471337578, + "grad_norm": 1.8287521600723267, + "learning_rate": 5e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.730229914188385, + "num_tokens": 361805061.0, + "step": 13978 + }, + { + "epoch": 1.5351416648363716, + "grad_norm": 1.9014229774475098, + "learning_rate": 5e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7245679497718811, + "num_tokens": 361830404.0, + "step": 13979 + }, + { + "epoch": 1.5352514825389854, + "grad_norm": 1.866505742073059, + "learning_rate": 5e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7043890357017517, + "num_tokens": 361856521.0, + "step": 13980 + }, + { + "epoch": 1.535361300241599, + "grad_norm": 1.8362250328063965, + "learning_rate": 5e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7078888416290283, + "num_tokens": 361883902.0, + "step": 13981 + }, + { + "epoch": 1.5354711179442126, + "grad_norm": 1.8290205001831055, + "learning_rate": 5e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7224138379096985, + "num_tokens": 361911788.0, + "step": 13982 + }, + { + "epoch": 1.5355809356468262, + "grad_norm": 1.6312741041183472, + "learning_rate": 5e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7315528392791748, + "num_tokens": 361943459.0, + "step": 13983 + }, + { + "epoch": 1.53569075334944, + "grad_norm": 1.910220742225647, + "learning_rate": 5e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7340797185897827, + "num_tokens": 361968580.0, + "step": 13984 + }, + { + "epoch": 1.5358005710520537, + "grad_norm": 2.0224013328552246, + "learning_rate": 5e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7481625080108643, + "num_tokens": 361989127.0, + "step": 13985 + }, + { + "epoch": 1.5359103887546672, + "grad_norm": 2.0697975158691406, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7234251499176025, + "num_tokens": 362010779.0, + "step": 13986 + }, + { + "epoch": 1.5360202064572808, + "grad_norm": 1.878758192062378, + "learning_rate": 5e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7411001920700073, + "num_tokens": 362036120.0, + "step": 13987 + }, + { + "epoch": 1.5361300241598945, + "grad_norm": 1.7098357677459717, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.737798810005188, + "num_tokens": 362064879.0, + "step": 13988 + }, + { + "epoch": 1.5362398418625083, + "grad_norm": 1.819176197052002, + "learning_rate": 5e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.727479100227356, + "num_tokens": 362093673.0, + "step": 13989 + }, + { + "epoch": 1.536349659565122, + "grad_norm": 1.8960411548614502, + "learning_rate": 5e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7433541417121887, + "num_tokens": 362118256.0, + "step": 13990 + }, + { + "epoch": 1.5364594772677356, + "grad_norm": 2.035689353942871, + "learning_rate": 5e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7234278917312622, + "num_tokens": 362140265.0, + "step": 13991 + }, + { + "epoch": 1.536569294970349, + "grad_norm": 2.0384042263031006, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7349814176559448, + "num_tokens": 362163642.0, + "step": 13992 + }, + { + "epoch": 1.5366791126729629, + "grad_norm": 1.8103880882263184, + "learning_rate": 5e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7362271547317505, + "num_tokens": 362190634.0, + "step": 13993 + }, + { + "epoch": 1.5367889303755766, + "grad_norm": 1.8555923700332642, + "learning_rate": 5e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7444703578948975, + "num_tokens": 362215369.0, + "step": 13994 + }, + { + "epoch": 1.5368987480781902, + "grad_norm": 1.9903978109359741, + "learning_rate": 5e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7454003095626831, + "num_tokens": 362239189.0, + "step": 13995 + }, + { + "epoch": 1.537008565780804, + "grad_norm": 2.0031933784484863, + "learning_rate": 5e-06, + "loss": 0.913, + "mean_token_accuracy": 0.716765284538269, + "num_tokens": 362264601.0, + "step": 13996 + }, + { + "epoch": 1.5371183834834174, + "grad_norm": 1.8227753639221191, + "learning_rate": 5e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7347851991653442, + "num_tokens": 362291838.0, + "step": 13997 + }, + { + "epoch": 1.5372282011860312, + "grad_norm": 2.023599624633789, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7264143228530884, + "num_tokens": 362314143.0, + "step": 13998 + }, + { + "epoch": 1.537338018888645, + "grad_norm": 1.7318193912506104, + "learning_rate": 5e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7162085771560669, + "num_tokens": 362344845.0, + "step": 13999 + }, + { + "epoch": 1.5374478365912585, + "grad_norm": 1.7430500984191895, + "learning_rate": 5e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7418185472488403, + "num_tokens": 362373105.0, + "step": 14000 + }, + { + "epoch": 1.537557654293872, + "grad_norm": 1.725730061531067, + "learning_rate": 5e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7275893688201904, + "num_tokens": 362402987.0, + "step": 14001 + }, + { + "epoch": 1.5376674719964858, + "grad_norm": 2.014317512512207, + "learning_rate": 5e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7297322154045105, + "num_tokens": 362424486.0, + "step": 14002 + }, + { + "epoch": 1.5377772896990995, + "grad_norm": 1.8519805669784546, + "learning_rate": 5e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7101917862892151, + "num_tokens": 362449947.0, + "step": 14003 + }, + { + "epoch": 1.5378871074017133, + "grad_norm": 1.9573427438735962, + "learning_rate": 5e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7316758632659912, + "num_tokens": 362474155.0, + "step": 14004 + }, + { + "epoch": 1.5379969251043268, + "grad_norm": 1.7598453760147095, + "learning_rate": 5e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7281222939491272, + "num_tokens": 362503020.0, + "step": 14005 + }, + { + "epoch": 1.5381067428069404, + "grad_norm": 1.74798583984375, + "learning_rate": 5e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7100520133972168, + "num_tokens": 362533066.0, + "step": 14006 + }, + { + "epoch": 1.5382165605095541, + "grad_norm": 1.811583161354065, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7297024726867676, + "num_tokens": 362558909.0, + "step": 14007 + }, + { + "epoch": 1.5383263782121679, + "grad_norm": 1.8750547170639038, + "learning_rate": 5e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7186387777328491, + "num_tokens": 362583787.0, + "step": 14008 + }, + { + "epoch": 1.5384361959147814, + "grad_norm": 1.9174381494522095, + "learning_rate": 5e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7236895561218262, + "num_tokens": 362612364.0, + "step": 14009 + }, + { + "epoch": 1.538546013617395, + "grad_norm": 1.864203929901123, + "learning_rate": 5e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7364522814750671, + "num_tokens": 362635776.0, + "step": 14010 + }, + { + "epoch": 1.5386558313200087, + "grad_norm": 1.8785910606384277, + "learning_rate": 5e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7198000550270081, + "num_tokens": 362661259.0, + "step": 14011 + }, + { + "epoch": 1.5387656490226225, + "grad_norm": 1.891403079032898, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.732434868812561, + "num_tokens": 362686452.0, + "step": 14012 + }, + { + "epoch": 1.5388754667252362, + "grad_norm": 1.7122931480407715, + "learning_rate": 5e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7363147735595703, + "num_tokens": 362713842.0, + "step": 14013 + }, + { + "epoch": 1.5389852844278498, + "grad_norm": 1.9388681650161743, + "learning_rate": 5e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.738903284072876, + "num_tokens": 362736740.0, + "step": 14014 + }, + { + "epoch": 1.5390951021304633, + "grad_norm": 1.7203450202941895, + "learning_rate": 5e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7086515426635742, + "num_tokens": 362765547.0, + "step": 14015 + }, + { + "epoch": 1.539204919833077, + "grad_norm": 1.6004879474639893, + "learning_rate": 5e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7254164814949036, + "num_tokens": 362797663.0, + "step": 14016 + }, + { + "epoch": 1.5393147375356908, + "grad_norm": 1.7769047021865845, + "learning_rate": 5e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.751223623752594, + "num_tokens": 362824416.0, + "step": 14017 + }, + { + "epoch": 1.5394245552383046, + "grad_norm": 1.7609866857528687, + "learning_rate": 5e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7448731660842896, + "num_tokens": 362850697.0, + "step": 14018 + }, + { + "epoch": 1.539534372940918, + "grad_norm": 2.0165634155273438, + "learning_rate": 5e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7380654215812683, + "num_tokens": 362873805.0, + "step": 14019 + }, + { + "epoch": 1.5396441906435316, + "grad_norm": 1.8912127017974854, + "learning_rate": 5e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7351987361907959, + "num_tokens": 362898520.0, + "step": 14020 + }, + { + "epoch": 1.5397540083461454, + "grad_norm": 1.9841485023498535, + "learning_rate": 5e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7571786642074585, + "num_tokens": 362919748.0, + "step": 14021 + }, + { + "epoch": 1.5398638260487592, + "grad_norm": 2.1315665245056152, + "learning_rate": 5e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7341643571853638, + "num_tokens": 362940512.0, + "step": 14022 + }, + { + "epoch": 1.5399736437513727, + "grad_norm": 1.7740198373794556, + "learning_rate": 5e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7542590498924255, + "num_tokens": 362964869.0, + "step": 14023 + }, + { + "epoch": 1.5400834614539862, + "grad_norm": 2.123314142227173, + "learning_rate": 5e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7350836992263794, + "num_tokens": 362985418.0, + "step": 14024 + }, + { + "epoch": 1.5401932791566, + "grad_norm": 1.7604951858520508, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7155215740203857, + "num_tokens": 363012368.0, + "step": 14025 + }, + { + "epoch": 1.5403030968592137, + "grad_norm": 1.682739019393921, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7553114891052246, + "num_tokens": 363040900.0, + "step": 14026 + }, + { + "epoch": 1.5404129145618275, + "grad_norm": 1.9634833335876465, + "learning_rate": 5e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7373813986778259, + "num_tokens": 363063326.0, + "step": 14027 + }, + { + "epoch": 1.540522732264441, + "grad_norm": 2.121656894683838, + "learning_rate": 5e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7549280524253845, + "num_tokens": 363083397.0, + "step": 14028 + }, + { + "epoch": 1.5406325499670546, + "grad_norm": 1.8195085525512695, + "learning_rate": 5e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7352997064590454, + "num_tokens": 363111042.0, + "step": 14029 + }, + { + "epoch": 1.5407423676696683, + "grad_norm": 1.706364631652832, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7393573522567749, + "num_tokens": 363142124.0, + "step": 14030 + }, + { + "epoch": 1.540852185372282, + "grad_norm": 1.8350836038589478, + "learning_rate": 5e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7335019111633301, + "num_tokens": 363169391.0, + "step": 14031 + }, + { + "epoch": 1.5409620030748958, + "grad_norm": 1.9095330238342285, + "learning_rate": 5e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7424341440200806, + "num_tokens": 363192645.0, + "step": 14032 + }, + { + "epoch": 1.5410718207775094, + "grad_norm": 1.972418189048767, + "learning_rate": 5e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7309553027153015, + "num_tokens": 363215527.0, + "step": 14033 + }, + { + "epoch": 1.541181638480123, + "grad_norm": 1.8269262313842773, + "learning_rate": 5e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7163621783256531, + "num_tokens": 363245957.0, + "step": 14034 + }, + { + "epoch": 1.5412914561827367, + "grad_norm": 1.9539631605148315, + "learning_rate": 5e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7580569982528687, + "num_tokens": 363266765.0, + "step": 14035 + }, + { + "epoch": 1.5414012738853504, + "grad_norm": 1.891142725944519, + "learning_rate": 5e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7536348700523376, + "num_tokens": 363288534.0, + "step": 14036 + }, + { + "epoch": 1.541511091587964, + "grad_norm": 1.804534673690796, + "learning_rate": 5e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7217972874641418, + "num_tokens": 363314439.0, + "step": 14037 + }, + { + "epoch": 1.5416209092905775, + "grad_norm": 1.5816786289215088, + "learning_rate": 5e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7451056838035583, + "num_tokens": 363347549.0, + "step": 14038 + }, + { + "epoch": 1.5417307269931912, + "grad_norm": 2.0223276615142822, + "learning_rate": 5e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7318881750106812, + "num_tokens": 363369029.0, + "step": 14039 + }, + { + "epoch": 1.541840544695805, + "grad_norm": 1.841957688331604, + "learning_rate": 5e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7197058200836182, + "num_tokens": 363394599.0, + "step": 14040 + }, + { + "epoch": 1.5419503623984188, + "grad_norm": 1.5980241298675537, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7398300766944885, + "num_tokens": 363424657.0, + "step": 14041 + }, + { + "epoch": 1.5420601801010323, + "grad_norm": 1.9198976755142212, + "learning_rate": 5e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.7604146003723145, + "num_tokens": 363446719.0, + "step": 14042 + }, + { + "epoch": 1.5421699978036458, + "grad_norm": 1.917098045349121, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7215322256088257, + "num_tokens": 363470703.0, + "step": 14043 + }, + { + "epoch": 1.5422798155062596, + "grad_norm": 1.8597784042358398, + "learning_rate": 5e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.746137797832489, + "num_tokens": 363494185.0, + "step": 14044 + }, + { + "epoch": 1.5423896332088733, + "grad_norm": 1.828654170036316, + "learning_rate": 5e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7246198654174805, + "num_tokens": 363520621.0, + "step": 14045 + }, + { + "epoch": 1.5424994509114869, + "grad_norm": 2.1386184692382812, + "learning_rate": 5e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7488635778427124, + "num_tokens": 363540909.0, + "step": 14046 + }, + { + "epoch": 1.5426092686141006, + "grad_norm": 1.8542169332504272, + "learning_rate": 5e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7349320650100708, + "num_tokens": 363566355.0, + "step": 14047 + }, + { + "epoch": 1.5427190863167142, + "grad_norm": 1.684574007987976, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7136425971984863, + "num_tokens": 363595777.0, + "step": 14048 + }, + { + "epoch": 1.542828904019328, + "grad_norm": 1.9525405168533325, + "learning_rate": 5e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7340237498283386, + "num_tokens": 363618729.0, + "step": 14049 + }, + { + "epoch": 1.5429387217219417, + "grad_norm": 1.7421797513961792, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7183363437652588, + "num_tokens": 363647054.0, + "step": 14050 + }, + { + "epoch": 1.5430485394245552, + "grad_norm": 1.979499101638794, + "learning_rate": 5e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7278575897216797, + "num_tokens": 363671404.0, + "step": 14051 + }, + { + "epoch": 1.5431583571271688, + "grad_norm": 1.8872929811477661, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7298759818077087, + "num_tokens": 363696768.0, + "step": 14052 + }, + { + "epoch": 1.5432681748297825, + "grad_norm": 1.899735927581787, + "learning_rate": 5e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7224349975585938, + "num_tokens": 363726267.0, + "step": 14053 + }, + { + "epoch": 1.5433779925323963, + "grad_norm": 1.7509255409240723, + "learning_rate": 5e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7342863082885742, + "num_tokens": 363753758.0, + "step": 14054 + }, + { + "epoch": 1.54348781023501, + "grad_norm": 1.6596671342849731, + "learning_rate": 5e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7218736410140991, + "num_tokens": 363782116.0, + "step": 14055 + }, + { + "epoch": 1.5435976279376236, + "grad_norm": 2.146526575088501, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7438102960586548, + "num_tokens": 363801586.0, + "step": 14056 + }, + { + "epoch": 1.543707445640237, + "grad_norm": 1.8730194568634033, + "learning_rate": 5e-06, + "loss": 0.6939, + "mean_token_accuracy": 0.7690165042877197, + "num_tokens": 363823967.0, + "step": 14057 + }, + { + "epoch": 1.5438172633428509, + "grad_norm": 2.0364112854003906, + "learning_rate": 5e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.759323000907898, + "num_tokens": 363844563.0, + "step": 14058 + }, + { + "epoch": 1.5439270810454646, + "grad_norm": 2.475619077682495, + "learning_rate": 5e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7528448700904846, + "num_tokens": 363860559.0, + "step": 14059 + }, + { + "epoch": 1.5440368987480781, + "grad_norm": 2.0092835426330566, + "learning_rate": 5e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7350882291793823, + "num_tokens": 363882911.0, + "step": 14060 + }, + { + "epoch": 1.544146716450692, + "grad_norm": 1.821151614189148, + "learning_rate": 5e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7292817831039429, + "num_tokens": 363908593.0, + "step": 14061 + }, + { + "epoch": 1.5442565341533054, + "grad_norm": 1.9655447006225586, + "learning_rate": 5e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7388359904289246, + "num_tokens": 363932426.0, + "step": 14062 + }, + { + "epoch": 1.5443663518559192, + "grad_norm": 1.5723695755004883, + "learning_rate": 5e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7293845415115356, + "num_tokens": 363966761.0, + "step": 14063 + }, + { + "epoch": 1.544476169558533, + "grad_norm": 1.8976339101791382, + "learning_rate": 5e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.7580793499946594, + "num_tokens": 363990372.0, + "step": 14064 + }, + { + "epoch": 1.5445859872611465, + "grad_norm": 1.8428946733474731, + "learning_rate": 5e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7287790775299072, + "num_tokens": 364016386.0, + "step": 14065 + }, + { + "epoch": 1.54469580496376, + "grad_norm": 2.0661938190460205, + "learning_rate": 5e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7440745830535889, + "num_tokens": 364038821.0, + "step": 14066 + }, + { + "epoch": 1.5448056226663738, + "grad_norm": 1.910119891166687, + "learning_rate": 5e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7297800183296204, + "num_tokens": 364062930.0, + "step": 14067 + }, + { + "epoch": 1.5449154403689875, + "grad_norm": 1.9548554420471191, + "learning_rate": 5e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.749864935874939, + "num_tokens": 364084325.0, + "step": 14068 + }, + { + "epoch": 1.5450252580716013, + "grad_norm": 1.7795735597610474, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.716159999370575, + "num_tokens": 364114402.0, + "step": 14069 + }, + { + "epoch": 1.5451350757742148, + "grad_norm": 1.8498057126998901, + "learning_rate": 5e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7054555416107178, + "num_tokens": 364140527.0, + "step": 14070 + }, + { + "epoch": 1.5452448934768284, + "grad_norm": 1.8010913133621216, + "learning_rate": 5e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7187209725379944, + "num_tokens": 364167163.0, + "step": 14071 + }, + { + "epoch": 1.5453547111794421, + "grad_norm": 1.848949670791626, + "learning_rate": 5e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.727515459060669, + "num_tokens": 364196524.0, + "step": 14072 + }, + { + "epoch": 1.5454645288820559, + "grad_norm": 1.8437044620513916, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7395967245101929, + "num_tokens": 364220123.0, + "step": 14073 + }, + { + "epoch": 1.5455743465846694, + "grad_norm": 1.672569751739502, + "learning_rate": 5e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7154634594917297, + "num_tokens": 364251620.0, + "step": 14074 + }, + { + "epoch": 1.545684164287283, + "grad_norm": 1.9806277751922607, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7233780026435852, + "num_tokens": 364274977.0, + "step": 14075 + }, + { + "epoch": 1.5457939819898967, + "grad_norm": 2.0149261951446533, + "learning_rate": 5e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7327709197998047, + "num_tokens": 364295871.0, + "step": 14076 + }, + { + "epoch": 1.5459037996925105, + "grad_norm": 1.8456186056137085, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7416112422943115, + "num_tokens": 364319950.0, + "step": 14077 + }, + { + "epoch": 1.5460136173951242, + "grad_norm": 1.7406970262527466, + "learning_rate": 5e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.72156822681427, + "num_tokens": 364351576.0, + "step": 14078 + }, + { + "epoch": 1.5461234350977378, + "grad_norm": 1.816512942314148, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7362525463104248, + "num_tokens": 364376335.0, + "step": 14079 + }, + { + "epoch": 1.5462332528003513, + "grad_norm": 1.8085297346115112, + "learning_rate": 5e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7310906052589417, + "num_tokens": 364401823.0, + "step": 14080 + }, + { + "epoch": 1.546343070502965, + "grad_norm": 1.7776027917861938, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7300460338592529, + "num_tokens": 364428797.0, + "step": 14081 + }, + { + "epoch": 1.5464528882055788, + "grad_norm": 1.9010803699493408, + "learning_rate": 5e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.756411075592041, + "num_tokens": 364451239.0, + "step": 14082 + }, + { + "epoch": 1.5465627059081926, + "grad_norm": 1.8671658039093018, + "learning_rate": 5e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7485066652297974, + "num_tokens": 364476076.0, + "step": 14083 + }, + { + "epoch": 1.546672523610806, + "grad_norm": 1.6993334293365479, + "learning_rate": 5e-06, + "loss": 0.855, + "mean_token_accuracy": 0.730711817741394, + "num_tokens": 364505403.0, + "step": 14084 + }, + { + "epoch": 1.5467823413134196, + "grad_norm": 1.6728041172027588, + "learning_rate": 5e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.741706132888794, + "num_tokens": 364534166.0, + "step": 14085 + }, + { + "epoch": 1.5468921590160334, + "grad_norm": 2.0270471572875977, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7329698801040649, + "num_tokens": 364556500.0, + "step": 14086 + }, + { + "epoch": 1.5470019767186471, + "grad_norm": 1.8795777559280396, + "learning_rate": 5e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7356520891189575, + "num_tokens": 364579901.0, + "step": 14087 + }, + { + "epoch": 1.5471117944212607, + "grad_norm": 1.7081899642944336, + "learning_rate": 5e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7246567606925964, + "num_tokens": 364610965.0, + "step": 14088 + }, + { + "epoch": 1.5472216121238742, + "grad_norm": 2.075587511062622, + "learning_rate": 5e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7420299053192139, + "num_tokens": 364631670.0, + "step": 14089 + }, + { + "epoch": 1.547331429826488, + "grad_norm": 1.7514554262161255, + "learning_rate": 5e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7171213626861572, + "num_tokens": 364660777.0, + "step": 14090 + }, + { + "epoch": 1.5474412475291017, + "grad_norm": 1.866322636604309, + "learning_rate": 5e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7546032667160034, + "num_tokens": 364682521.0, + "step": 14091 + }, + { + "epoch": 1.5475510652317155, + "grad_norm": 1.6939553022384644, + "learning_rate": 5e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7367138862609863, + "num_tokens": 364714923.0, + "step": 14092 + }, + { + "epoch": 1.547660882934329, + "grad_norm": 1.791387677192688, + "learning_rate": 5e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7273194789886475, + "num_tokens": 364743138.0, + "step": 14093 + }, + { + "epoch": 1.5477707006369426, + "grad_norm": 2.321514844894409, + "learning_rate": 5e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7326256632804871, + "num_tokens": 364761505.0, + "step": 14094 + }, + { + "epoch": 1.5478805183395563, + "grad_norm": 2.017789840698242, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.723933219909668, + "num_tokens": 364784557.0, + "step": 14095 + }, + { + "epoch": 1.54799033604217, + "grad_norm": 1.8604800701141357, + "learning_rate": 5e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7111852765083313, + "num_tokens": 364811097.0, + "step": 14096 + }, + { + "epoch": 1.5481001537447838, + "grad_norm": 1.9881703853607178, + "learning_rate": 5e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.75782710313797, + "num_tokens": 364833730.0, + "step": 14097 + }, + { + "epoch": 1.5482099714473974, + "grad_norm": 1.8442643880844116, + "learning_rate": 5e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7183789014816284, + "num_tokens": 364861705.0, + "step": 14098 + }, + { + "epoch": 1.548319789150011, + "grad_norm": 1.7818422317504883, + "learning_rate": 5e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7359368205070496, + "num_tokens": 364886381.0, + "step": 14099 + }, + { + "epoch": 1.5484296068526247, + "grad_norm": 1.9041084051132202, + "learning_rate": 5e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7302722930908203, + "num_tokens": 364912769.0, + "step": 14100 + }, + { + "epoch": 1.5485394245552384, + "grad_norm": 1.7699007987976074, + "learning_rate": 5e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7294127941131592, + "num_tokens": 364939072.0, + "step": 14101 + }, + { + "epoch": 1.548649242257852, + "grad_norm": 1.848828911781311, + "learning_rate": 5e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7601955533027649, + "num_tokens": 364964099.0, + "step": 14102 + }, + { + "epoch": 1.5487590599604655, + "grad_norm": 1.8188824653625488, + "learning_rate": 5e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7545021176338196, + "num_tokens": 364991211.0, + "step": 14103 + }, + { + "epoch": 1.5488688776630792, + "grad_norm": 1.6872175931930542, + "learning_rate": 5e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7399393320083618, + "num_tokens": 365021745.0, + "step": 14104 + }, + { + "epoch": 1.548978695365693, + "grad_norm": 1.8380873203277588, + "learning_rate": 5e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7345262765884399, + "num_tokens": 365045993.0, + "step": 14105 + }, + { + "epoch": 1.5490885130683067, + "grad_norm": 1.823927879333496, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7332020998001099, + "num_tokens": 365071861.0, + "step": 14106 + }, + { + "epoch": 1.5491983307709203, + "grad_norm": 1.7243086099624634, + "learning_rate": 5e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7575220465660095, + "num_tokens": 365099028.0, + "step": 14107 + }, + { + "epoch": 1.5493081484735338, + "grad_norm": 1.621925950050354, + "learning_rate": 5e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7126135230064392, + "num_tokens": 365128468.0, + "step": 14108 + }, + { + "epoch": 1.5494179661761476, + "grad_norm": 1.6714907884597778, + "learning_rate": 5e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7271351218223572, + "num_tokens": 365158576.0, + "step": 14109 + }, + { + "epoch": 1.5495277838787613, + "grad_norm": 1.7863513231277466, + "learning_rate": 5e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7402029037475586, + "num_tokens": 365184587.0, + "step": 14110 + }, + { + "epoch": 1.5496376015813749, + "grad_norm": 1.8503963947296143, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7549327611923218, + "num_tokens": 365207453.0, + "step": 14111 + }, + { + "epoch": 1.5497474192839886, + "grad_norm": 1.7183388471603394, + "learning_rate": 5e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7362165451049805, + "num_tokens": 365235021.0, + "step": 14112 + }, + { + "epoch": 1.5498572369866022, + "grad_norm": 1.7655913829803467, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7202349901199341, + "num_tokens": 365262895.0, + "step": 14113 + }, + { + "epoch": 1.549967054689216, + "grad_norm": 1.9054677486419678, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7380625605583191, + "num_tokens": 365286677.0, + "step": 14114 + }, + { + "epoch": 1.5500768723918297, + "grad_norm": 1.8378705978393555, + "learning_rate": 5e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7421121001243591, + "num_tokens": 365312620.0, + "step": 14115 + }, + { + "epoch": 1.5501866900944432, + "grad_norm": 2.0110177993774414, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7647792100906372, + "num_tokens": 365334092.0, + "step": 14116 + }, + { + "epoch": 1.5502965077970567, + "grad_norm": 2.052539587020874, + "learning_rate": 5e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7154592275619507, + "num_tokens": 365359132.0, + "step": 14117 + }, + { + "epoch": 1.5504063254996705, + "grad_norm": 1.9337756633758545, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7399922609329224, + "num_tokens": 365383658.0, + "step": 14118 + }, + { + "epoch": 1.5505161432022843, + "grad_norm": 1.7345871925354004, + "learning_rate": 5e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7112630009651184, + "num_tokens": 365416034.0, + "step": 14119 + }, + { + "epoch": 1.550625960904898, + "grad_norm": 1.7586196660995483, + "learning_rate": 5e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.721695601940155, + "num_tokens": 365444532.0, + "step": 14120 + }, + { + "epoch": 1.5507357786075116, + "grad_norm": 1.8370686769485474, + "learning_rate": 5e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7378981113433838, + "num_tokens": 365469929.0, + "step": 14121 + }, + { + "epoch": 1.550845596310125, + "grad_norm": 2.060607433319092, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.742780327796936, + "num_tokens": 365490902.0, + "step": 14122 + }, + { + "epoch": 1.5509554140127388, + "grad_norm": 1.8921618461608887, + "learning_rate": 5e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7230071425437927, + "num_tokens": 365515288.0, + "step": 14123 + }, + { + "epoch": 1.5510652317153526, + "grad_norm": 1.8472260236740112, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7346117496490479, + "num_tokens": 365540697.0, + "step": 14124 + }, + { + "epoch": 1.5511750494179661, + "grad_norm": 1.988542914390564, + "learning_rate": 5e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7429428696632385, + "num_tokens": 365563531.0, + "step": 14125 + }, + { + "epoch": 1.55128486712058, + "grad_norm": 1.6827622652053833, + "learning_rate": 5e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7390124797821045, + "num_tokens": 365593565.0, + "step": 14126 + }, + { + "epoch": 1.5513946848231934, + "grad_norm": 1.800424575805664, + "learning_rate": 5e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7108945846557617, + "num_tokens": 365622599.0, + "step": 14127 + }, + { + "epoch": 1.5515045025258072, + "grad_norm": 1.7431490421295166, + "learning_rate": 5e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7245075702667236, + "num_tokens": 365649945.0, + "step": 14128 + }, + { + "epoch": 1.551614320228421, + "grad_norm": 1.7801848649978638, + "learning_rate": 5e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7168640494346619, + "num_tokens": 365678335.0, + "step": 14129 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 1.8970381021499634, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.766377329826355, + "num_tokens": 365700541.0, + "step": 14130 + }, + { + "epoch": 1.551833955633648, + "grad_norm": 2.065419912338257, + "learning_rate": 5e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7491052746772766, + "num_tokens": 365720663.0, + "step": 14131 + }, + { + "epoch": 1.5519437733362618, + "grad_norm": 2.008173942565918, + "learning_rate": 5e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7430657148361206, + "num_tokens": 365744239.0, + "step": 14132 + }, + { + "epoch": 1.5520535910388755, + "grad_norm": 1.9675602912902832, + "learning_rate": 5e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7225524187088013, + "num_tokens": 365767008.0, + "step": 14133 + }, + { + "epoch": 1.5521634087414893, + "grad_norm": 1.697427749633789, + "learning_rate": 5e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7455310225486755, + "num_tokens": 365796349.0, + "step": 14134 + }, + { + "epoch": 1.5522732264441028, + "grad_norm": 1.7150672674179077, + "learning_rate": 5e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.728983998298645, + "num_tokens": 365823323.0, + "step": 14135 + }, + { + "epoch": 1.5523830441467164, + "grad_norm": 1.9173212051391602, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7306027412414551, + "num_tokens": 365847284.0, + "step": 14136 + }, + { + "epoch": 1.55249286184933, + "grad_norm": 1.8541010618209839, + "learning_rate": 5e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7540956735610962, + "num_tokens": 365870291.0, + "step": 14137 + }, + { + "epoch": 1.5526026795519439, + "grad_norm": 1.8616914749145508, + "learning_rate": 5e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7140268087387085, + "num_tokens": 365897543.0, + "step": 14138 + }, + { + "epoch": 1.5527124972545574, + "grad_norm": 1.8027853965759277, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7371234893798828, + "num_tokens": 365923546.0, + "step": 14139 + }, + { + "epoch": 1.552822314957171, + "grad_norm": 1.8515127897262573, + "learning_rate": 5e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7344627976417542, + "num_tokens": 365947535.0, + "step": 14140 + }, + { + "epoch": 1.5529321326597847, + "grad_norm": 1.9149876832962036, + "learning_rate": 5e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7176308631896973, + "num_tokens": 365972064.0, + "step": 14141 + }, + { + "epoch": 1.5530419503623984, + "grad_norm": 1.9839534759521484, + "learning_rate": 5e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.713117241859436, + "num_tokens": 365994587.0, + "step": 14142 + }, + { + "epoch": 1.5531517680650122, + "grad_norm": 1.949070930480957, + "learning_rate": 5e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7086746692657471, + "num_tokens": 366019770.0, + "step": 14143 + }, + { + "epoch": 1.5532615857676257, + "grad_norm": 1.6136236190795898, + "learning_rate": 5e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7146286964416504, + "num_tokens": 366052155.0, + "step": 14144 + }, + { + "epoch": 1.5533714034702393, + "grad_norm": 1.6955758333206177, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7554860711097717, + "num_tokens": 366078883.0, + "step": 14145 + }, + { + "epoch": 1.553481221172853, + "grad_norm": 1.8255637884140015, + "learning_rate": 5e-06, + "loss": 0.917, + "mean_token_accuracy": 0.71607506275177, + "num_tokens": 366105248.0, + "step": 14146 + }, + { + "epoch": 1.5535910388754668, + "grad_norm": 1.9214855432510376, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.744890570640564, + "num_tokens": 366128788.0, + "step": 14147 + }, + { + "epoch": 1.5537008565780805, + "grad_norm": 1.6502500772476196, + "learning_rate": 5e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7305461168289185, + "num_tokens": 366158443.0, + "step": 14148 + }, + { + "epoch": 1.553810674280694, + "grad_norm": 1.7989990711212158, + "learning_rate": 5e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7136979103088379, + "num_tokens": 366188353.0, + "step": 14149 + }, + { + "epoch": 1.5539204919833076, + "grad_norm": 1.7809449434280396, + "learning_rate": 5e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.751839280128479, + "num_tokens": 366214142.0, + "step": 14150 + }, + { + "epoch": 1.5540303096859214, + "grad_norm": 1.9197869300842285, + "learning_rate": 5e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.750717043876648, + "num_tokens": 366235712.0, + "step": 14151 + }, + { + "epoch": 1.5541401273885351, + "grad_norm": 1.569693922996521, + "learning_rate": 5e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7179571986198425, + "num_tokens": 366268369.0, + "step": 14152 + }, + { + "epoch": 1.5542499450911487, + "grad_norm": 1.734072208404541, + "learning_rate": 5e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7312618494033813, + "num_tokens": 366294167.0, + "step": 14153 + }, + { + "epoch": 1.5543597627937622, + "grad_norm": 1.5140199661254883, + "learning_rate": 5e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7551650404930115, + "num_tokens": 366329547.0, + "step": 14154 + }, + { + "epoch": 1.554469580496376, + "grad_norm": 1.780461311340332, + "learning_rate": 5e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7369349598884583, + "num_tokens": 366359069.0, + "step": 14155 + }, + { + "epoch": 1.5545793981989897, + "grad_norm": 1.8386597633361816, + "learning_rate": 5e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7197755575180054, + "num_tokens": 366383860.0, + "step": 14156 + }, + { + "epoch": 1.5546892159016035, + "grad_norm": 2.282576322555542, + "learning_rate": 5e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7583813667297363, + "num_tokens": 366400540.0, + "step": 14157 + }, + { + "epoch": 1.554799033604217, + "grad_norm": 2.1151585578918457, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7352709770202637, + "num_tokens": 366420719.0, + "step": 14158 + }, + { + "epoch": 1.5549088513068305, + "grad_norm": 1.867364525794983, + "learning_rate": 5e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7458422183990479, + "num_tokens": 366444859.0, + "step": 14159 + }, + { + "epoch": 1.5550186690094443, + "grad_norm": 1.7073416709899902, + "learning_rate": 5e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7328633069992065, + "num_tokens": 366471725.0, + "step": 14160 + }, + { + "epoch": 1.555128486712058, + "grad_norm": 1.8178536891937256, + "learning_rate": 5e-06, + "loss": 0.967, + "mean_token_accuracy": 0.6991444230079651, + "num_tokens": 366500157.0, + "step": 14161 + }, + { + "epoch": 1.5552383044146716, + "grad_norm": 1.786990761756897, + "learning_rate": 5e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7444283366203308, + "num_tokens": 366529068.0, + "step": 14162 + }, + { + "epoch": 1.5553481221172853, + "grad_norm": 1.841169834136963, + "learning_rate": 5e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7205228209495544, + "num_tokens": 366555827.0, + "step": 14163 + }, + { + "epoch": 1.5554579398198989, + "grad_norm": 1.9221084117889404, + "learning_rate": 5e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7297942042350769, + "num_tokens": 366581685.0, + "step": 14164 + }, + { + "epoch": 1.5555677575225126, + "grad_norm": 1.9619537591934204, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7419674396514893, + "num_tokens": 366604826.0, + "step": 14165 + }, + { + "epoch": 1.5556775752251264, + "grad_norm": 1.990600347518921, + "learning_rate": 5e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7511094212532043, + "num_tokens": 366624070.0, + "step": 14166 + }, + { + "epoch": 1.55578739292774, + "grad_norm": 1.7227641344070435, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7260861396789551, + "num_tokens": 366651095.0, + "step": 14167 + }, + { + "epoch": 1.5558972106303535, + "grad_norm": 1.8618528842926025, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7331953048706055, + "num_tokens": 366676630.0, + "step": 14168 + }, + { + "epoch": 1.5560070283329672, + "grad_norm": 1.6280779838562012, + "learning_rate": 5e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.734792947769165, + "num_tokens": 366704132.0, + "step": 14169 + }, + { + "epoch": 1.556116846035581, + "grad_norm": 1.954771637916565, + "learning_rate": 5e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.7482684850692749, + "num_tokens": 366726407.0, + "step": 14170 + }, + { + "epoch": 1.5562266637381947, + "grad_norm": 1.9862382411956787, + "learning_rate": 5e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7333671450614929, + "num_tokens": 366749411.0, + "step": 14171 + }, + { + "epoch": 1.5563364814408083, + "grad_norm": 2.0331313610076904, + "learning_rate": 5e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7490298748016357, + "num_tokens": 366770767.0, + "step": 14172 + }, + { + "epoch": 1.5564462991434218, + "grad_norm": 1.7348697185516357, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7491989731788635, + "num_tokens": 366797694.0, + "step": 14173 + }, + { + "epoch": 1.5565561168460356, + "grad_norm": 1.6845698356628418, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7186875343322754, + "num_tokens": 366828569.0, + "step": 14174 + }, + { + "epoch": 1.5566659345486493, + "grad_norm": 1.6163344383239746, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7507519125938416, + "num_tokens": 366859570.0, + "step": 14175 + }, + { + "epoch": 1.5567757522512629, + "grad_norm": 1.9425041675567627, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7272622585296631, + "num_tokens": 366885207.0, + "step": 14176 + }, + { + "epoch": 1.5568855699538766, + "grad_norm": 1.835691213607788, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7244521975517273, + "num_tokens": 366912533.0, + "step": 14177 + }, + { + "epoch": 1.5569953876564901, + "grad_norm": 1.6894328594207764, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7173611521720886, + "num_tokens": 366942932.0, + "step": 14178 + }, + { + "epoch": 1.557105205359104, + "grad_norm": 1.901123046875, + "learning_rate": 5e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7475561499595642, + "num_tokens": 366967898.0, + "step": 14179 + }, + { + "epoch": 1.5572150230617177, + "grad_norm": 1.5500050783157349, + "learning_rate": 5e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7162337303161621, + "num_tokens": 367001042.0, + "step": 14180 + }, + { + "epoch": 1.5573248407643312, + "grad_norm": 1.716435194015503, + "learning_rate": 5e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7359161972999573, + "num_tokens": 367028148.0, + "step": 14181 + }, + { + "epoch": 1.5574346584669447, + "grad_norm": 2.060059070587158, + "learning_rate": 5e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.739159107208252, + "num_tokens": 367048753.0, + "step": 14182 + }, + { + "epoch": 1.5575444761695585, + "grad_norm": 2.0012590885162354, + "learning_rate": 5e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7419964075088501, + "num_tokens": 367069307.0, + "step": 14183 + }, + { + "epoch": 1.5576542938721722, + "grad_norm": 1.8813285827636719, + "learning_rate": 5e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7193735241889954, + "num_tokens": 367096020.0, + "step": 14184 + }, + { + "epoch": 1.557764111574786, + "grad_norm": 1.8074637651443481, + "learning_rate": 5e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.7374628186225891, + "num_tokens": 367120682.0, + "step": 14185 + }, + { + "epoch": 1.5578739292773995, + "grad_norm": 1.8657935857772827, + "learning_rate": 5e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7505590915679932, + "num_tokens": 367143608.0, + "step": 14186 + }, + { + "epoch": 1.557983746980013, + "grad_norm": 1.7345174551010132, + "learning_rate": 5e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7099716067314148, + "num_tokens": 367170976.0, + "step": 14187 + }, + { + "epoch": 1.5580935646826268, + "grad_norm": 1.8951653242111206, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7495177388191223, + "num_tokens": 367193928.0, + "step": 14188 + }, + { + "epoch": 1.5582033823852406, + "grad_norm": 2.1154911518096924, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7518465518951416, + "num_tokens": 367214859.0, + "step": 14189 + }, + { + "epoch": 1.5583132000878541, + "grad_norm": 1.826928973197937, + "learning_rate": 5e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7483401298522949, + "num_tokens": 367237427.0, + "step": 14190 + }, + { + "epoch": 1.5584230177904677, + "grad_norm": 1.7784737348556519, + "learning_rate": 5e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7374522089958191, + "num_tokens": 367264330.0, + "step": 14191 + }, + { + "epoch": 1.5585328354930814, + "grad_norm": 1.814812421798706, + "learning_rate": 5e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7201728224754333, + "num_tokens": 367295249.0, + "step": 14192 + }, + { + "epoch": 1.5586426531956952, + "grad_norm": 1.9581178426742554, + "learning_rate": 5e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7217717170715332, + "num_tokens": 367318819.0, + "step": 14193 + }, + { + "epoch": 1.558752470898309, + "grad_norm": 1.7958931922912598, + "learning_rate": 5e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7242791652679443, + "num_tokens": 367345786.0, + "step": 14194 + }, + { + "epoch": 1.5588622886009225, + "grad_norm": 1.8337575197219849, + "learning_rate": 5e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7385942339897156, + "num_tokens": 367368365.0, + "step": 14195 + }, + { + "epoch": 1.558972106303536, + "grad_norm": 1.9016757011413574, + "learning_rate": 5e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7166401147842407, + "num_tokens": 367393333.0, + "step": 14196 + }, + { + "epoch": 1.5590819240061498, + "grad_norm": 1.971888542175293, + "learning_rate": 5e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7617225050926208, + "num_tokens": 367416590.0, + "step": 14197 + }, + { + "epoch": 1.5591917417087635, + "grad_norm": 1.9680225849151611, + "learning_rate": 5e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7193991541862488, + "num_tokens": 367439619.0, + "step": 14198 + }, + { + "epoch": 1.5593015594113773, + "grad_norm": 1.9539618492126465, + "learning_rate": 5e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7406958341598511, + "num_tokens": 367463192.0, + "step": 14199 + }, + { + "epoch": 1.5594113771139908, + "grad_norm": 1.587396502494812, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7394670844078064, + "num_tokens": 367492538.0, + "step": 14200 + }, + { + "epoch": 1.5595211948166043, + "grad_norm": 1.7633777856826782, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7186481356620789, + "num_tokens": 367522695.0, + "step": 14201 + }, + { + "epoch": 1.559631012519218, + "grad_norm": 2.011281728744507, + "learning_rate": 5e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7306848168373108, + "num_tokens": 367545812.0, + "step": 14202 + }, + { + "epoch": 1.5597408302218319, + "grad_norm": 2.050429582595825, + "learning_rate": 5e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7574216723442078, + "num_tokens": 367566673.0, + "step": 14203 + }, + { + "epoch": 1.5598506479244454, + "grad_norm": 1.7354687452316284, + "learning_rate": 5e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7422688007354736, + "num_tokens": 367592499.0, + "step": 14204 + }, + { + "epoch": 1.559960465627059, + "grad_norm": 1.9005564451217651, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7315275073051453, + "num_tokens": 367615898.0, + "step": 14205 + }, + { + "epoch": 1.5600702833296727, + "grad_norm": 1.865682601928711, + "learning_rate": 5e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7560535073280334, + "num_tokens": 367641445.0, + "step": 14206 + }, + { + "epoch": 1.5601801010322864, + "grad_norm": 1.9271636009216309, + "learning_rate": 5e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.730593204498291, + "num_tokens": 367665866.0, + "step": 14207 + }, + { + "epoch": 1.5602899187349002, + "grad_norm": 1.9217106103897095, + "learning_rate": 5e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7328374981880188, + "num_tokens": 367689780.0, + "step": 14208 + }, + { + "epoch": 1.5603997364375137, + "grad_norm": 1.908128023147583, + "learning_rate": 5e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7311797142028809, + "num_tokens": 367714286.0, + "step": 14209 + }, + { + "epoch": 1.5605095541401273, + "grad_norm": 1.9121469259262085, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7404488325119019, + "num_tokens": 367738774.0, + "step": 14210 + }, + { + "epoch": 1.560619371842741, + "grad_norm": 1.8429516553878784, + "learning_rate": 5e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7450910210609436, + "num_tokens": 367761592.0, + "step": 14211 + }, + { + "epoch": 1.5607291895453548, + "grad_norm": 1.8047131299972534, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7341868877410889, + "num_tokens": 367789347.0, + "step": 14212 + }, + { + "epoch": 1.5608390072479685, + "grad_norm": 1.9792141914367676, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7395182847976685, + "num_tokens": 367812724.0, + "step": 14213 + }, + { + "epoch": 1.560948824950582, + "grad_norm": 1.7456151247024536, + "learning_rate": 5e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7119619846343994, + "num_tokens": 367842508.0, + "step": 14214 + }, + { + "epoch": 1.5610586426531956, + "grad_norm": 2.114675760269165, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7542242407798767, + "num_tokens": 367861616.0, + "step": 14215 + }, + { + "epoch": 1.5611684603558094, + "grad_norm": 1.8773510456085205, + "learning_rate": 5e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7570109367370605, + "num_tokens": 367886496.0, + "step": 14216 + }, + { + "epoch": 1.5612782780584231, + "grad_norm": 1.850982427597046, + "learning_rate": 5e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7384904623031616, + "num_tokens": 367913059.0, + "step": 14217 + }, + { + "epoch": 1.5613880957610367, + "grad_norm": 1.7504591941833496, + "learning_rate": 5e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7454575300216675, + "num_tokens": 367939768.0, + "step": 14218 + }, + { + "epoch": 1.5614979134636502, + "grad_norm": 1.8840395212173462, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7354827523231506, + "num_tokens": 367965891.0, + "step": 14219 + }, + { + "epoch": 1.561607731166264, + "grad_norm": 1.7335346937179565, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7182353734970093, + "num_tokens": 367994713.0, + "step": 14220 + }, + { + "epoch": 1.5617175488688777, + "grad_norm": 2.053215980529785, + "learning_rate": 5e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7323037385940552, + "num_tokens": 368018041.0, + "step": 14221 + }, + { + "epoch": 1.5618273665714915, + "grad_norm": 2.288813829421997, + "learning_rate": 5e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7510663270950317, + "num_tokens": 368036930.0, + "step": 14222 + }, + { + "epoch": 1.561937184274105, + "grad_norm": 1.900975227355957, + "learning_rate": 5e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7455759048461914, + "num_tokens": 368059192.0, + "step": 14223 + }, + { + "epoch": 1.5620470019767185, + "grad_norm": 1.696900725364685, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7241455316543579, + "num_tokens": 368092845.0, + "step": 14224 + }, + { + "epoch": 1.5621568196793323, + "grad_norm": 1.819825291633606, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7407401204109192, + "num_tokens": 368117945.0, + "step": 14225 + }, + { + "epoch": 1.562266637381946, + "grad_norm": 1.9705647230148315, + "learning_rate": 5e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7221759557723999, + "num_tokens": 368142493.0, + "step": 14226 + }, + { + "epoch": 1.5623764550845596, + "grad_norm": 1.9431449174880981, + "learning_rate": 5e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7429730892181396, + "num_tokens": 368165612.0, + "step": 14227 + }, + { + "epoch": 1.5624862727871733, + "grad_norm": 1.781508207321167, + "learning_rate": 5e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7176522016525269, + "num_tokens": 368194215.0, + "step": 14228 + }, + { + "epoch": 1.5625960904897869, + "grad_norm": 1.7462127208709717, + "learning_rate": 5e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7446997761726379, + "num_tokens": 368223584.0, + "step": 14229 + }, + { + "epoch": 1.5627059081924006, + "grad_norm": 1.6768712997436523, + "learning_rate": 5e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7527939081192017, + "num_tokens": 368253121.0, + "step": 14230 + }, + { + "epoch": 1.5628157258950144, + "grad_norm": 2.0653507709503174, + "learning_rate": 5e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7586359977722168, + "num_tokens": 368273149.0, + "step": 14231 + }, + { + "epoch": 1.562925543597628, + "grad_norm": 1.7010666131973267, + "learning_rate": 5e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7436187267303467, + "num_tokens": 368300554.0, + "step": 14232 + }, + { + "epoch": 1.5630353613002415, + "grad_norm": 1.885087013244629, + "learning_rate": 5e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7027170062065125, + "num_tokens": 368326700.0, + "step": 14233 + }, + { + "epoch": 1.5631451790028552, + "grad_norm": 1.7157080173492432, + "learning_rate": 5e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7350074052810669, + "num_tokens": 368353496.0, + "step": 14234 + }, + { + "epoch": 1.563254996705469, + "grad_norm": 1.9147059917449951, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7295231819152832, + "num_tokens": 368379051.0, + "step": 14235 + }, + { + "epoch": 1.5633648144080827, + "grad_norm": 1.7133896350860596, + "learning_rate": 5e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7121485471725464, + "num_tokens": 368407597.0, + "step": 14236 + }, + { + "epoch": 1.5634746321106963, + "grad_norm": 1.797825813293457, + "learning_rate": 5e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7241643071174622, + "num_tokens": 368432519.0, + "step": 14237 + }, + { + "epoch": 1.5635844498133098, + "grad_norm": 1.9803131818771362, + "learning_rate": 5e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7433795928955078, + "num_tokens": 368453262.0, + "step": 14238 + }, + { + "epoch": 1.5636942675159236, + "grad_norm": 1.7291673421859741, + "learning_rate": 5e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7184240221977234, + "num_tokens": 368479024.0, + "step": 14239 + }, + { + "epoch": 1.5638040852185373, + "grad_norm": 1.7760815620422363, + "learning_rate": 5e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.694951593875885, + "num_tokens": 368506723.0, + "step": 14240 + }, + { + "epoch": 1.5639139029211508, + "grad_norm": 1.773584008216858, + "learning_rate": 5e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7225755453109741, + "num_tokens": 368534862.0, + "step": 14241 + }, + { + "epoch": 1.5640237206237646, + "grad_norm": 2.0378925800323486, + "learning_rate": 5e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7099746465682983, + "num_tokens": 368557266.0, + "step": 14242 + }, + { + "epoch": 1.5641335383263781, + "grad_norm": 1.6889710426330566, + "learning_rate": 5e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7230405807495117, + "num_tokens": 368587271.0, + "step": 14243 + }, + { + "epoch": 1.564243356028992, + "grad_norm": 2.1718082427978516, + "learning_rate": 5e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7235766649246216, + "num_tokens": 368607516.0, + "step": 14244 + }, + { + "epoch": 1.5643531737316057, + "grad_norm": 1.890546202659607, + "learning_rate": 5e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7179603576660156, + "num_tokens": 368634108.0, + "step": 14245 + }, + { + "epoch": 1.5644629914342192, + "grad_norm": 2.0302023887634277, + "learning_rate": 5e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7509552240371704, + "num_tokens": 368655505.0, + "step": 14246 + }, + { + "epoch": 1.5645728091368327, + "grad_norm": 1.7266852855682373, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7583743333816528, + "num_tokens": 368680887.0, + "step": 14247 + }, + { + "epoch": 1.5646826268394465, + "grad_norm": 1.8255151510238647, + "learning_rate": 5e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7365878820419312, + "num_tokens": 368707070.0, + "step": 14248 + }, + { + "epoch": 1.5647924445420602, + "grad_norm": 1.864821434020996, + "learning_rate": 5e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7343434691429138, + "num_tokens": 368732189.0, + "step": 14249 + }, + { + "epoch": 1.564902262244674, + "grad_norm": 1.8857299089431763, + "learning_rate": 5e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7415128946304321, + "num_tokens": 368755856.0, + "step": 14250 + }, + { + "epoch": 1.5650120799472875, + "grad_norm": 1.971826434135437, + "learning_rate": 5e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7425404787063599, + "num_tokens": 368776805.0, + "step": 14251 + }, + { + "epoch": 1.565121897649901, + "grad_norm": 1.8120605945587158, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7472982406616211, + "num_tokens": 368801228.0, + "step": 14252 + }, + { + "epoch": 1.5652317153525148, + "grad_norm": 1.8902804851531982, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7342785596847534, + "num_tokens": 368826242.0, + "step": 14253 + }, + { + "epoch": 1.5653415330551286, + "grad_norm": 2.014378547668457, + "learning_rate": 5e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7192437648773193, + "num_tokens": 368848966.0, + "step": 14254 + }, + { + "epoch": 1.565451350757742, + "grad_norm": 1.7062172889709473, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7298251390457153, + "num_tokens": 368878096.0, + "step": 14255 + }, + { + "epoch": 1.5655611684603556, + "grad_norm": 1.843850016593933, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7485541105270386, + "num_tokens": 368901644.0, + "step": 14256 + }, + { + "epoch": 1.5656709861629694, + "grad_norm": 2.036494493484497, + "learning_rate": 5e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.719834566116333, + "num_tokens": 368925864.0, + "step": 14257 + }, + { + "epoch": 1.5657808038655832, + "grad_norm": 2.2338244915008545, + "learning_rate": 5e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7423387765884399, + "num_tokens": 368942878.0, + "step": 14258 + }, + { + "epoch": 1.565890621568197, + "grad_norm": 1.5852243900299072, + "learning_rate": 5e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7267411947250366, + "num_tokens": 368976892.0, + "step": 14259 + }, + { + "epoch": 1.5660004392708105, + "grad_norm": 1.7253764867782593, + "learning_rate": 5e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7390952110290527, + "num_tokens": 369004216.0, + "step": 14260 + }, + { + "epoch": 1.566110256973424, + "grad_norm": 1.8634874820709229, + "learning_rate": 5e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7202781438827515, + "num_tokens": 369030707.0, + "step": 14261 + }, + { + "epoch": 1.5662200746760377, + "grad_norm": 1.7870525121688843, + "learning_rate": 5e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7419642210006714, + "num_tokens": 369057970.0, + "step": 14262 + }, + { + "epoch": 1.5663298923786515, + "grad_norm": 2.269435167312622, + "learning_rate": 5e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7451507449150085, + "num_tokens": 369075058.0, + "step": 14263 + }, + { + "epoch": 1.5664397100812653, + "grad_norm": 1.831856608390808, + "learning_rate": 5e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7381102442741394, + "num_tokens": 369099635.0, + "step": 14264 + }, + { + "epoch": 1.5665495277838788, + "grad_norm": 1.9636834859848022, + "learning_rate": 5e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.717849850654602, + "num_tokens": 369123918.0, + "step": 14265 + }, + { + "epoch": 1.5666593454864923, + "grad_norm": 1.7672971487045288, + "learning_rate": 5e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7250140309333801, + "num_tokens": 369151753.0, + "step": 14266 + }, + { + "epoch": 1.566769163189106, + "grad_norm": 1.8717753887176514, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7537834644317627, + "num_tokens": 369177790.0, + "step": 14267 + }, + { + "epoch": 1.5668789808917198, + "grad_norm": 1.9151265621185303, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7316864728927612, + "num_tokens": 369202775.0, + "step": 14268 + }, + { + "epoch": 1.5669887985943334, + "grad_norm": 1.872629165649414, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7334288954734802, + "num_tokens": 369226395.0, + "step": 14269 + }, + { + "epoch": 1.567098616296947, + "grad_norm": 1.6070219278335571, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7360437512397766, + "num_tokens": 369260427.0, + "step": 14270 + }, + { + "epoch": 1.5672084339995607, + "grad_norm": 1.77646005153656, + "learning_rate": 5e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7031270265579224, + "num_tokens": 369289448.0, + "step": 14271 + }, + { + "epoch": 1.5673182517021744, + "grad_norm": 1.6921905279159546, + "learning_rate": 5e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7259114980697632, + "num_tokens": 369318070.0, + "step": 14272 + }, + { + "epoch": 1.5674280694047882, + "grad_norm": 1.7789288759231567, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7271329164505005, + "num_tokens": 369346206.0, + "step": 14273 + }, + { + "epoch": 1.5675378871074017, + "grad_norm": 1.9165500402450562, + "learning_rate": 5e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7254738211631775, + "num_tokens": 369375119.0, + "step": 14274 + }, + { + "epoch": 1.5676477048100153, + "grad_norm": 2.0446741580963135, + "learning_rate": 5e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.713281512260437, + "num_tokens": 369399441.0, + "step": 14275 + }, + { + "epoch": 1.567757522512629, + "grad_norm": 1.7320311069488525, + "learning_rate": 5e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7135680913925171, + "num_tokens": 369429229.0, + "step": 14276 + }, + { + "epoch": 1.5678673402152428, + "grad_norm": 1.9187626838684082, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7349081039428711, + "num_tokens": 369452563.0, + "step": 14277 + }, + { + "epoch": 1.5679771579178565, + "grad_norm": 1.7164020538330078, + "learning_rate": 5e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.735040009021759, + "num_tokens": 369481768.0, + "step": 14278 + }, + { + "epoch": 1.56808697562047, + "grad_norm": 1.909862756729126, + "learning_rate": 5e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7421740293502808, + "num_tokens": 369506227.0, + "step": 14279 + }, + { + "epoch": 1.5681967933230836, + "grad_norm": 1.6722623109817505, + "learning_rate": 5e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7520079016685486, + "num_tokens": 369532736.0, + "step": 14280 + }, + { + "epoch": 1.5683066110256974, + "grad_norm": 2.182591438293457, + "learning_rate": 5e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7444816827774048, + "num_tokens": 369550965.0, + "step": 14281 + }, + { + "epoch": 1.568416428728311, + "grad_norm": 1.84745192527771, + "learning_rate": 5e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.728138267993927, + "num_tokens": 369576990.0, + "step": 14282 + }, + { + "epoch": 1.5685262464309246, + "grad_norm": 2.142857074737549, + "learning_rate": 5e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7475066781044006, + "num_tokens": 369597962.0, + "step": 14283 + }, + { + "epoch": 1.5686360641335382, + "grad_norm": 1.8799961805343628, + "learning_rate": 5e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7592285871505737, + "num_tokens": 369621498.0, + "step": 14284 + }, + { + "epoch": 1.568745881836152, + "grad_norm": 2.0193593502044678, + "learning_rate": 5e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7418147921562195, + "num_tokens": 369642800.0, + "step": 14285 + }, + { + "epoch": 1.5688556995387657, + "grad_norm": 1.9702790975570679, + "learning_rate": 5e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7081202864646912, + "num_tokens": 369668214.0, + "step": 14286 + }, + { + "epoch": 1.5689655172413794, + "grad_norm": 2.0184216499328613, + "learning_rate": 5e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7361108660697937, + "num_tokens": 369691578.0, + "step": 14287 + }, + { + "epoch": 1.569075334943993, + "grad_norm": 1.876213550567627, + "learning_rate": 5e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7294448614120483, + "num_tokens": 369717049.0, + "step": 14288 + }, + { + "epoch": 1.5691851526466065, + "grad_norm": 1.786428689956665, + "learning_rate": 5e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7402458190917969, + "num_tokens": 369743214.0, + "step": 14289 + }, + { + "epoch": 1.5692949703492203, + "grad_norm": 1.9808303117752075, + "learning_rate": 5e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.7461223006248474, + "num_tokens": 369766257.0, + "step": 14290 + }, + { + "epoch": 1.569404788051834, + "grad_norm": 1.927876353263855, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7349734902381897, + "num_tokens": 369788679.0, + "step": 14291 + }, + { + "epoch": 1.5695146057544476, + "grad_norm": 1.7560148239135742, + "learning_rate": 5e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7107856869697571, + "num_tokens": 369817036.0, + "step": 14292 + }, + { + "epoch": 1.5696244234570613, + "grad_norm": 1.650526523590088, + "learning_rate": 5e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7207907438278198, + "num_tokens": 369850715.0, + "step": 14293 + }, + { + "epoch": 1.5697342411596749, + "grad_norm": 1.8136708736419678, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7334085702896118, + "num_tokens": 369878682.0, + "step": 14294 + }, + { + "epoch": 1.5698440588622886, + "grad_norm": 1.7386839389801025, + "learning_rate": 5e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7348840236663818, + "num_tokens": 369906099.0, + "step": 14295 + }, + { + "epoch": 1.5699538765649024, + "grad_norm": 1.8271034955978394, + "learning_rate": 5e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7500449419021606, + "num_tokens": 369930500.0, + "step": 14296 + }, + { + "epoch": 1.570063694267516, + "grad_norm": 1.9316470623016357, + "learning_rate": 5e-06, + "loss": 0.894, + "mean_token_accuracy": 0.712464451789856, + "num_tokens": 369955103.0, + "step": 14297 + }, + { + "epoch": 1.5701735119701294, + "grad_norm": 1.9015921354293823, + "learning_rate": 5e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7275063991546631, + "num_tokens": 369980837.0, + "step": 14298 + }, + { + "epoch": 1.5702833296727432, + "grad_norm": 1.8099702596664429, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7334651947021484, + "num_tokens": 370006734.0, + "step": 14299 + }, + { + "epoch": 1.570393147375357, + "grad_norm": 1.8064568042755127, + "learning_rate": 5e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.737642765045166, + "num_tokens": 370031542.0, + "step": 14300 + }, + { + "epoch": 1.5705029650779707, + "grad_norm": 1.5133153200149536, + "learning_rate": 5e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7075401544570923, + "num_tokens": 370066876.0, + "step": 14301 + }, + { + "epoch": 1.5706127827805842, + "grad_norm": 1.784372329711914, + "learning_rate": 5e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7260211706161499, + "num_tokens": 370094215.0, + "step": 14302 + }, + { + "epoch": 1.5707226004831978, + "grad_norm": 1.737868309020996, + "learning_rate": 5e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7337170243263245, + "num_tokens": 370123203.0, + "step": 14303 + }, + { + "epoch": 1.5708324181858115, + "grad_norm": 1.9138617515563965, + "learning_rate": 5e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7394036650657654, + "num_tokens": 370147021.0, + "step": 14304 + }, + { + "epoch": 1.5709422358884253, + "grad_norm": 1.8741930723190308, + "learning_rate": 5e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7598257064819336, + "num_tokens": 370169258.0, + "step": 14305 + }, + { + "epoch": 1.5710520535910388, + "grad_norm": 1.8075510263442993, + "learning_rate": 5e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.714428186416626, + "num_tokens": 370198197.0, + "step": 14306 + }, + { + "epoch": 1.5711618712936526, + "grad_norm": 1.859259009361267, + "learning_rate": 5e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7338406443595886, + "num_tokens": 370223953.0, + "step": 14307 + }, + { + "epoch": 1.5712716889962661, + "grad_norm": 1.7025766372680664, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7175683975219727, + "num_tokens": 370254763.0, + "step": 14308 + }, + { + "epoch": 1.5713815066988799, + "grad_norm": 1.752402663230896, + "learning_rate": 5e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7129872441291809, + "num_tokens": 370285070.0, + "step": 14309 + }, + { + "epoch": 1.5714913244014936, + "grad_norm": 2.0135817527770996, + "learning_rate": 5e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7323317527770996, + "num_tokens": 370309205.0, + "step": 14310 + }, + { + "epoch": 1.5716011421041072, + "grad_norm": 1.8035224676132202, + "learning_rate": 5e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7439961433410645, + "num_tokens": 370333798.0, + "step": 14311 + }, + { + "epoch": 1.5717109598067207, + "grad_norm": 1.7772918939590454, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7351478338241577, + "num_tokens": 370360794.0, + "step": 14312 + }, + { + "epoch": 1.5718207775093345, + "grad_norm": 1.801213264465332, + "learning_rate": 5e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7283408045768738, + "num_tokens": 370387134.0, + "step": 14313 + }, + { + "epoch": 1.5719305952119482, + "grad_norm": 1.820081114768982, + "learning_rate": 5e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7282714247703552, + "num_tokens": 370414687.0, + "step": 14314 + }, + { + "epoch": 1.572040412914562, + "grad_norm": 1.728130578994751, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7358360886573792, + "num_tokens": 370440704.0, + "step": 14315 + }, + { + "epoch": 1.5721502306171755, + "grad_norm": 1.965167760848999, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.724219560623169, + "num_tokens": 370466690.0, + "step": 14316 + }, + { + "epoch": 1.572260048319789, + "grad_norm": 1.7634180784225464, + "learning_rate": 5e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7356356382369995, + "num_tokens": 370492630.0, + "step": 14317 + }, + { + "epoch": 1.5723698660224028, + "grad_norm": 1.7056795358657837, + "learning_rate": 5e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7180463671684265, + "num_tokens": 370523505.0, + "step": 14318 + }, + { + "epoch": 1.5724796837250166, + "grad_norm": 1.5390827655792236, + "learning_rate": 5e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7085849046707153, + "num_tokens": 370558222.0, + "step": 14319 + }, + { + "epoch": 1.57258950142763, + "grad_norm": 2.03666353225708, + "learning_rate": 5e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7448545694351196, + "num_tokens": 370578144.0, + "step": 14320 + }, + { + "epoch": 1.5726993191302436, + "grad_norm": 2.176727771759033, + "learning_rate": 5e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7441189885139465, + "num_tokens": 370598461.0, + "step": 14321 + }, + { + "epoch": 1.5728091368328574, + "grad_norm": 1.50178062915802, + "learning_rate": 5e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7300435900688171, + "num_tokens": 370631743.0, + "step": 14322 + }, + { + "epoch": 1.5729189545354711, + "grad_norm": 1.804492473602295, + "learning_rate": 5e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7234678268432617, + "num_tokens": 370656624.0, + "step": 14323 + }, + { + "epoch": 1.573028772238085, + "grad_norm": 2.0663089752197266, + "learning_rate": 5e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7613453269004822, + "num_tokens": 370674976.0, + "step": 14324 + }, + { + "epoch": 1.5731385899406984, + "grad_norm": 1.652551531791687, + "learning_rate": 5e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.727324903011322, + "num_tokens": 370705516.0, + "step": 14325 + }, + { + "epoch": 1.573248407643312, + "grad_norm": 2.0589239597320557, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.737834095954895, + "num_tokens": 370725107.0, + "step": 14326 + }, + { + "epoch": 1.5733582253459257, + "grad_norm": 1.7811139822006226, + "learning_rate": 5e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7376115918159485, + "num_tokens": 370749572.0, + "step": 14327 + }, + { + "epoch": 1.5734680430485395, + "grad_norm": 1.8592883348464966, + "learning_rate": 5e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7222722768783569, + "num_tokens": 370778084.0, + "step": 14328 + }, + { + "epoch": 1.5735778607511532, + "grad_norm": 1.5787792205810547, + "learning_rate": 5e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7518647313117981, + "num_tokens": 370809027.0, + "step": 14329 + }, + { + "epoch": 1.5736876784537668, + "grad_norm": 1.9101676940917969, + "learning_rate": 5e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7513059377670288, + "num_tokens": 370832757.0, + "step": 14330 + }, + { + "epoch": 1.5737974961563803, + "grad_norm": 1.8477996587753296, + "learning_rate": 5e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7330102324485779, + "num_tokens": 370860391.0, + "step": 14331 + }, + { + "epoch": 1.573907313858994, + "grad_norm": 1.7620677947998047, + "learning_rate": 5e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7107131481170654, + "num_tokens": 370889067.0, + "step": 14332 + }, + { + "epoch": 1.5740171315616078, + "grad_norm": 1.7934826612472534, + "learning_rate": 5e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7148126363754272, + "num_tokens": 370917100.0, + "step": 14333 + }, + { + "epoch": 1.5741269492642214, + "grad_norm": 1.771355390548706, + "learning_rate": 5e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7426736950874329, + "num_tokens": 370941642.0, + "step": 14334 + }, + { + "epoch": 1.574236766966835, + "grad_norm": 1.6775436401367188, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7274954319000244, + "num_tokens": 370971907.0, + "step": 14335 + }, + { + "epoch": 1.5743465846694487, + "grad_norm": 1.812147617340088, + "learning_rate": 5e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7520150542259216, + "num_tokens": 370996020.0, + "step": 14336 + }, + { + "epoch": 1.5744564023720624, + "grad_norm": 2.2023673057556152, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7337199449539185, + "num_tokens": 371015588.0, + "step": 14337 + }, + { + "epoch": 1.5745662200746762, + "grad_norm": 1.9372076988220215, + "learning_rate": 5e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7263236045837402, + "num_tokens": 371037923.0, + "step": 14338 + }, + { + "epoch": 1.5746760377772897, + "grad_norm": 1.8206015825271606, + "learning_rate": 5e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7269682288169861, + "num_tokens": 371064801.0, + "step": 14339 + }, + { + "epoch": 1.5747858554799032, + "grad_norm": 1.8116368055343628, + "learning_rate": 5e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7274336814880371, + "num_tokens": 371089620.0, + "step": 14340 + }, + { + "epoch": 1.574895673182517, + "grad_norm": 1.931033968925476, + "learning_rate": 5e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.7595660090446472, + "num_tokens": 371113257.0, + "step": 14341 + }, + { + "epoch": 1.5750054908851308, + "grad_norm": 1.7802435159683228, + "learning_rate": 5e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7119514346122742, + "num_tokens": 371141357.0, + "step": 14342 + }, + { + "epoch": 1.5751153085877443, + "grad_norm": 1.7112624645233154, + "learning_rate": 5e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7212880849838257, + "num_tokens": 371171108.0, + "step": 14343 + }, + { + "epoch": 1.575225126290358, + "grad_norm": 1.8072004318237305, + "learning_rate": 5e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7343648672103882, + "num_tokens": 371199010.0, + "step": 14344 + }, + { + "epoch": 1.5753349439929716, + "grad_norm": 1.832422137260437, + "learning_rate": 5e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7215761542320251, + "num_tokens": 371225327.0, + "step": 14345 + }, + { + "epoch": 1.5754447616955853, + "grad_norm": 1.8668276071548462, + "learning_rate": 5e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7306064963340759, + "num_tokens": 371252679.0, + "step": 14346 + }, + { + "epoch": 1.575554579398199, + "grad_norm": 1.8040918111801147, + "learning_rate": 5e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7472407817840576, + "num_tokens": 371279297.0, + "step": 14347 + }, + { + "epoch": 1.5756643971008126, + "grad_norm": 1.9805976152420044, + "learning_rate": 5e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7513975501060486, + "num_tokens": 371300555.0, + "step": 14348 + }, + { + "epoch": 1.5757742148034262, + "grad_norm": 1.8190090656280518, + "learning_rate": 5e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7225835919380188, + "num_tokens": 371330308.0, + "step": 14349 + }, + { + "epoch": 1.57588403250604, + "grad_norm": 2.011625051498413, + "learning_rate": 5e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7440761923789978, + "num_tokens": 371352402.0, + "step": 14350 + }, + { + "epoch": 1.5759938502086537, + "grad_norm": 1.6811403036117554, + "learning_rate": 5e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7168468236923218, + "num_tokens": 371382385.0, + "step": 14351 + }, + { + "epoch": 1.5761036679112674, + "grad_norm": 1.67156982421875, + "learning_rate": 5e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7287595272064209, + "num_tokens": 371412468.0, + "step": 14352 + }, + { + "epoch": 1.576213485613881, + "grad_norm": 1.895270586013794, + "learning_rate": 5e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7368409633636475, + "num_tokens": 371436967.0, + "step": 14353 + }, + { + "epoch": 1.5763233033164945, + "grad_norm": 1.7796523571014404, + "learning_rate": 5e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7360423803329468, + "num_tokens": 371464404.0, + "step": 14354 + }, + { + "epoch": 1.5764331210191083, + "grad_norm": 1.81973397731781, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7404453754425049, + "num_tokens": 371489050.0, + "step": 14355 + }, + { + "epoch": 1.576542938721722, + "grad_norm": 1.59901762008667, + "learning_rate": 5e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7246870398521423, + "num_tokens": 371522659.0, + "step": 14356 + }, + { + "epoch": 1.5766527564243356, + "grad_norm": 1.8985469341278076, + "learning_rate": 5e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7134447693824768, + "num_tokens": 371548005.0, + "step": 14357 + }, + { + "epoch": 1.5767625741269493, + "grad_norm": 1.889526128768921, + "learning_rate": 5e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7328637838363647, + "num_tokens": 371575663.0, + "step": 14358 + }, + { + "epoch": 1.5768723918295628, + "grad_norm": 1.841221570968628, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7313678860664368, + "num_tokens": 371599643.0, + "step": 14359 + }, + { + "epoch": 1.5769822095321766, + "grad_norm": 1.9464629888534546, + "learning_rate": 5e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7555647492408752, + "num_tokens": 371623336.0, + "step": 14360 + }, + { + "epoch": 1.5770920272347904, + "grad_norm": 1.6542831659317017, + "learning_rate": 5e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7146430015563965, + "num_tokens": 371654403.0, + "step": 14361 + }, + { + "epoch": 1.577201844937404, + "grad_norm": 1.6994898319244385, + "learning_rate": 5e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7248080968856812, + "num_tokens": 371687594.0, + "step": 14362 + }, + { + "epoch": 1.5773116626400174, + "grad_norm": 2.0247347354888916, + "learning_rate": 5e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7480029463768005, + "num_tokens": 371707603.0, + "step": 14363 + }, + { + "epoch": 1.5774214803426312, + "grad_norm": 1.7830930948257446, + "learning_rate": 5e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7290521860122681, + "num_tokens": 371735320.0, + "step": 14364 + }, + { + "epoch": 1.577531298045245, + "grad_norm": 1.9148579835891724, + "learning_rate": 5e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.6950987577438354, + "num_tokens": 371760569.0, + "step": 14365 + }, + { + "epoch": 1.5776411157478587, + "grad_norm": 1.8194260597229004, + "learning_rate": 5e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7326211929321289, + "num_tokens": 371785488.0, + "step": 14366 + }, + { + "epoch": 1.5777509334504722, + "grad_norm": 1.8422973155975342, + "learning_rate": 5e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7353826761245728, + "num_tokens": 371812258.0, + "step": 14367 + }, + { + "epoch": 1.5778607511530858, + "grad_norm": 1.7374629974365234, + "learning_rate": 5e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7186594605445862, + "num_tokens": 371843050.0, + "step": 14368 + }, + { + "epoch": 1.5779705688556995, + "grad_norm": 1.8504259586334229, + "learning_rate": 5e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.713099479675293, + "num_tokens": 371871433.0, + "step": 14369 + }, + { + "epoch": 1.5780803865583133, + "grad_norm": 1.5877889394760132, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7267596125602722, + "num_tokens": 371902271.0, + "step": 14370 + }, + { + "epoch": 1.5781902042609268, + "grad_norm": 1.7394130229949951, + "learning_rate": 5e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7321720123291016, + "num_tokens": 371928022.0, + "step": 14371 + }, + { + "epoch": 1.5783000219635406, + "grad_norm": 1.8723469972610474, + "learning_rate": 5e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7600147724151611, + "num_tokens": 371950074.0, + "step": 14372 + }, + { + "epoch": 1.5784098396661541, + "grad_norm": 1.9917408227920532, + "learning_rate": 5e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7351092100143433, + "num_tokens": 371975256.0, + "step": 14373 + }, + { + "epoch": 1.5785196573687679, + "grad_norm": 1.6408181190490723, + "learning_rate": 5e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7009665369987488, + "num_tokens": 372006322.0, + "step": 14374 + }, + { + "epoch": 1.5786294750713816, + "grad_norm": 1.9176329374313354, + "learning_rate": 5e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7266525030136108, + "num_tokens": 372031755.0, + "step": 14375 + }, + { + "epoch": 1.5787392927739952, + "grad_norm": 1.9476711750030518, + "learning_rate": 5e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7298228740692139, + "num_tokens": 372054599.0, + "step": 14376 + }, + { + "epoch": 1.5788491104766087, + "grad_norm": 1.8505276441574097, + "learning_rate": 5e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7193008661270142, + "num_tokens": 372079633.0, + "step": 14377 + }, + { + "epoch": 1.5789589281792225, + "grad_norm": 1.745827078819275, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7288254499435425, + "num_tokens": 372108962.0, + "step": 14378 + }, + { + "epoch": 1.5790687458818362, + "grad_norm": 1.6465888023376465, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7285553216934204, + "num_tokens": 372140597.0, + "step": 14379 + }, + { + "epoch": 1.57917856358445, + "grad_norm": 1.7106263637542725, + "learning_rate": 5e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7379670143127441, + "num_tokens": 372166814.0, + "step": 14380 + }, + { + "epoch": 1.5792883812870635, + "grad_norm": 1.9805281162261963, + "learning_rate": 5e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.753929853439331, + "num_tokens": 372189083.0, + "step": 14381 + }, + { + "epoch": 1.579398198989677, + "grad_norm": 1.751246452331543, + "learning_rate": 5e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7201307415962219, + "num_tokens": 372218020.0, + "step": 14382 + }, + { + "epoch": 1.5795080166922908, + "grad_norm": 1.8005638122558594, + "learning_rate": 5e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7358132600784302, + "num_tokens": 372245175.0, + "step": 14383 + }, + { + "epoch": 1.5796178343949046, + "grad_norm": 1.9596083164215088, + "learning_rate": 5e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7326849699020386, + "num_tokens": 372269603.0, + "step": 14384 + }, + { + "epoch": 1.579727652097518, + "grad_norm": 2.180044412612915, + "learning_rate": 5e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7177973389625549, + "num_tokens": 372292402.0, + "step": 14385 + }, + { + "epoch": 1.5798374698001316, + "grad_norm": 1.8969098329544067, + "learning_rate": 5e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.722301721572876, + "num_tokens": 372315839.0, + "step": 14386 + }, + { + "epoch": 1.5799472875027454, + "grad_norm": 1.8727397918701172, + "learning_rate": 5e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7412303686141968, + "num_tokens": 372340095.0, + "step": 14387 + }, + { + "epoch": 1.5800571052053591, + "grad_norm": 1.8163577318191528, + "learning_rate": 5e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7316306829452515, + "num_tokens": 372367301.0, + "step": 14388 + }, + { + "epoch": 1.580166922907973, + "grad_norm": 2.0158162117004395, + "learning_rate": 5e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7515703439712524, + "num_tokens": 372389825.0, + "step": 14389 + }, + { + "epoch": 1.5802767406105864, + "grad_norm": 1.580614447593689, + "learning_rate": 5e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7167409658432007, + "num_tokens": 372424419.0, + "step": 14390 + }, + { + "epoch": 1.5803865583132, + "grad_norm": 2.030015468597412, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7702309489250183, + "num_tokens": 372443377.0, + "step": 14391 + }, + { + "epoch": 1.5804963760158137, + "grad_norm": 1.9335418939590454, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.729694128036499, + "num_tokens": 372467469.0, + "step": 14392 + }, + { + "epoch": 1.5806061937184275, + "grad_norm": 1.6620264053344727, + "learning_rate": 5e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7209689021110535, + "num_tokens": 372497436.0, + "step": 14393 + }, + { + "epoch": 1.5807160114210412, + "grad_norm": 1.7381280660629272, + "learning_rate": 5e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7137764692306519, + "num_tokens": 372527484.0, + "step": 14394 + }, + { + "epoch": 1.5808258291236548, + "grad_norm": 1.8576555252075195, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7328709959983826, + "num_tokens": 372553243.0, + "step": 14395 + }, + { + "epoch": 1.5809356468262683, + "grad_norm": 1.7143138647079468, + "learning_rate": 5e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7144932746887207, + "num_tokens": 372583141.0, + "step": 14396 + }, + { + "epoch": 1.581045464528882, + "grad_norm": 1.8551084995269775, + "learning_rate": 5e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7421778440475464, + "num_tokens": 372606712.0, + "step": 14397 + }, + { + "epoch": 1.5811552822314958, + "grad_norm": 1.7552365064620972, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7362422943115234, + "num_tokens": 372633129.0, + "step": 14398 + }, + { + "epoch": 1.5812650999341094, + "grad_norm": 2.0213112831115723, + "learning_rate": 5e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7365609407424927, + "num_tokens": 372655806.0, + "step": 14399 + }, + { + "epoch": 1.5813749176367229, + "grad_norm": 1.775933861732483, + "learning_rate": 5e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7517462372779846, + "num_tokens": 372679218.0, + "step": 14400 + }, + { + "epoch": 1.5814847353393366, + "grad_norm": 1.956710696220398, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7349196672439575, + "num_tokens": 372700934.0, + "step": 14401 + }, + { + "epoch": 1.5815945530419504, + "grad_norm": 1.8131829500198364, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.726272463798523, + "num_tokens": 372725443.0, + "step": 14402 + }, + { + "epoch": 1.5817043707445642, + "grad_norm": 2.1023826599121094, + "learning_rate": 5e-06, + "loss": 0.6738, + "mean_token_accuracy": 0.7782648801803589, + "num_tokens": 372743242.0, + "step": 14403 + }, + { + "epoch": 1.5818141884471777, + "grad_norm": 1.9079475402832031, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7369146943092346, + "num_tokens": 372766047.0, + "step": 14404 + }, + { + "epoch": 1.5819240061497912, + "grad_norm": 1.9132076501846313, + "learning_rate": 5e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7352904677391052, + "num_tokens": 372790225.0, + "step": 14405 + }, + { + "epoch": 1.582033823852405, + "grad_norm": 1.7109088897705078, + "learning_rate": 5e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7288713455200195, + "num_tokens": 372820118.0, + "step": 14406 + }, + { + "epoch": 1.5821436415550187, + "grad_norm": 1.836791753768921, + "learning_rate": 5e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7488422393798828, + "num_tokens": 372845157.0, + "step": 14407 + }, + { + "epoch": 1.5822534592576323, + "grad_norm": 1.9626576900482178, + "learning_rate": 5e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7317490577697754, + "num_tokens": 372868176.0, + "step": 14408 + }, + { + "epoch": 1.582363276960246, + "grad_norm": 2.027751922607422, + "learning_rate": 5e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7463502883911133, + "num_tokens": 372889732.0, + "step": 14409 + }, + { + "epoch": 1.5824730946628596, + "grad_norm": 1.923214077949524, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7275640964508057, + "num_tokens": 372913868.0, + "step": 14410 + }, + { + "epoch": 1.5825829123654733, + "grad_norm": 1.8331058025360107, + "learning_rate": 5e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7482708096504211, + "num_tokens": 372940473.0, + "step": 14411 + }, + { + "epoch": 1.582692730068087, + "grad_norm": 1.6360756158828735, + "learning_rate": 5e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7346261739730835, + "num_tokens": 372972800.0, + "step": 14412 + }, + { + "epoch": 1.5828025477707006, + "grad_norm": 2.089834451675415, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.73655766248703, + "num_tokens": 372992872.0, + "step": 14413 + }, + { + "epoch": 1.5829123654733142, + "grad_norm": 1.7514270544052124, + "learning_rate": 5e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.7420544624328613, + "num_tokens": 373019559.0, + "step": 14414 + }, + { + "epoch": 1.583022183175928, + "grad_norm": 1.6721822023391724, + "learning_rate": 5e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7158465385437012, + "num_tokens": 373051367.0, + "step": 14415 + }, + { + "epoch": 1.5831320008785417, + "grad_norm": 2.0305685997009277, + "learning_rate": 5e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7581251263618469, + "num_tokens": 373070893.0, + "step": 14416 + }, + { + "epoch": 1.5832418185811554, + "grad_norm": 2.1205079555511475, + "learning_rate": 5e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7708240151405334, + "num_tokens": 373087529.0, + "step": 14417 + }, + { + "epoch": 1.583351636283769, + "grad_norm": 2.031906843185425, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7278789281845093, + "num_tokens": 373108544.0, + "step": 14418 + }, + { + "epoch": 1.5834614539863825, + "grad_norm": 1.9395761489868164, + "learning_rate": 5e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7218065857887268, + "num_tokens": 373133596.0, + "step": 14419 + }, + { + "epoch": 1.5835712716889963, + "grad_norm": 1.8272899389266968, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7387109994888306, + "num_tokens": 373157468.0, + "step": 14420 + }, + { + "epoch": 1.58368108939161, + "grad_norm": 1.7102502584457397, + "learning_rate": 5e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7314965724945068, + "num_tokens": 373183957.0, + "step": 14421 + }, + { + "epoch": 1.5837909070942235, + "grad_norm": 1.9933676719665527, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7391747832298279, + "num_tokens": 373206453.0, + "step": 14422 + }, + { + "epoch": 1.5839007247968373, + "grad_norm": 1.9686779975891113, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.737582802772522, + "num_tokens": 373230803.0, + "step": 14423 + }, + { + "epoch": 1.5840105424994508, + "grad_norm": 1.8439005613327026, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7460652589797974, + "num_tokens": 373257460.0, + "step": 14424 + }, + { + "epoch": 1.5841203602020646, + "grad_norm": 1.706449270248413, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7187477350234985, + "num_tokens": 373288052.0, + "step": 14425 + }, + { + "epoch": 1.5842301779046783, + "grad_norm": 1.6829838752746582, + "learning_rate": 5e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7252802848815918, + "num_tokens": 373318410.0, + "step": 14426 + }, + { + "epoch": 1.5843399956072919, + "grad_norm": 1.8998924493789673, + "learning_rate": 5e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7302477359771729, + "num_tokens": 373343291.0, + "step": 14427 + }, + { + "epoch": 1.5844498133099054, + "grad_norm": 1.7823318243026733, + "learning_rate": 5e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7129164338111877, + "num_tokens": 373373278.0, + "step": 14428 + }, + { + "epoch": 1.5845596310125192, + "grad_norm": 1.775234580039978, + "learning_rate": 5e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7360514998435974, + "num_tokens": 373398692.0, + "step": 14429 + }, + { + "epoch": 1.584669448715133, + "grad_norm": 1.667891502380371, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7229149341583252, + "num_tokens": 373426950.0, + "step": 14430 + }, + { + "epoch": 1.5847792664177467, + "grad_norm": 1.5749435424804688, + "learning_rate": 5e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7176491618156433, + "num_tokens": 373460926.0, + "step": 14431 + }, + { + "epoch": 1.5848890841203602, + "grad_norm": 1.7016422748565674, + "learning_rate": 5e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7330105900764465, + "num_tokens": 373488625.0, + "step": 14432 + }, + { + "epoch": 1.5849989018229738, + "grad_norm": 1.7146198749542236, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7369693517684937, + "num_tokens": 373517296.0, + "step": 14433 + }, + { + "epoch": 1.5851087195255875, + "grad_norm": 1.8834054470062256, + "learning_rate": 5e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.727965235710144, + "num_tokens": 373543749.0, + "step": 14434 + }, + { + "epoch": 1.5852185372282013, + "grad_norm": 2.019331693649292, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7216250896453857, + "num_tokens": 373565927.0, + "step": 14435 + }, + { + "epoch": 1.5853283549308148, + "grad_norm": 1.748727798461914, + "learning_rate": 5e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7207534313201904, + "num_tokens": 373593781.0, + "step": 14436 + }, + { + "epoch": 1.5854381726334283, + "grad_norm": 1.8063859939575195, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7326192259788513, + "num_tokens": 373617495.0, + "step": 14437 + }, + { + "epoch": 1.585547990336042, + "grad_norm": 2.0600082874298096, + "learning_rate": 5e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7316163182258606, + "num_tokens": 373640687.0, + "step": 14438 + }, + { + "epoch": 1.5856578080386559, + "grad_norm": 1.6593637466430664, + "learning_rate": 5e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.719052791595459, + "num_tokens": 373671164.0, + "step": 14439 + }, + { + "epoch": 1.5857676257412696, + "grad_norm": 1.6298812627792358, + "learning_rate": 5e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7320133447647095, + "num_tokens": 373701496.0, + "step": 14440 + }, + { + "epoch": 1.5858774434438832, + "grad_norm": 1.6180883646011353, + "learning_rate": 5e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.7365447282791138, + "num_tokens": 373732705.0, + "step": 14441 + }, + { + "epoch": 1.5859872611464967, + "grad_norm": 1.8246756792068481, + "learning_rate": 5e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7256091833114624, + "num_tokens": 373758788.0, + "step": 14442 + }, + { + "epoch": 1.5860970788491104, + "grad_norm": 1.8532599210739136, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7564967274665833, + "num_tokens": 373783467.0, + "step": 14443 + }, + { + "epoch": 1.5862068965517242, + "grad_norm": 1.8386811017990112, + "learning_rate": 5e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7146741151809692, + "num_tokens": 373809372.0, + "step": 14444 + }, + { + "epoch": 1.586316714254338, + "grad_norm": 1.7400351762771606, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7175250053405762, + "num_tokens": 373837000.0, + "step": 14445 + }, + { + "epoch": 1.5864265319569515, + "grad_norm": 1.8691082000732422, + "learning_rate": 5e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7425219416618347, + "num_tokens": 373861695.0, + "step": 14446 + }, + { + "epoch": 1.586536349659565, + "grad_norm": 1.881732702255249, + "learning_rate": 5e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.722602128982544, + "num_tokens": 373887927.0, + "step": 14447 + }, + { + "epoch": 1.5866461673621788, + "grad_norm": 1.7087297439575195, + "learning_rate": 5e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7399088144302368, + "num_tokens": 373917124.0, + "step": 14448 + }, + { + "epoch": 1.5867559850647925, + "grad_norm": 1.803373098373413, + "learning_rate": 5e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7172123193740845, + "num_tokens": 373943101.0, + "step": 14449 + }, + { + "epoch": 1.586865802767406, + "grad_norm": 1.7021210193634033, + "learning_rate": 5e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7418414354324341, + "num_tokens": 373972097.0, + "step": 14450 + }, + { + "epoch": 1.5869756204700196, + "grad_norm": 1.659985899925232, + "learning_rate": 5e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7147668600082397, + "num_tokens": 374003145.0, + "step": 14451 + }, + { + "epoch": 1.5870854381726334, + "grad_norm": 1.5735421180725098, + "learning_rate": 5e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7154049873352051, + "num_tokens": 374035120.0, + "step": 14452 + }, + { + "epoch": 1.5871952558752471, + "grad_norm": 1.7491395473480225, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7441362738609314, + "num_tokens": 374062229.0, + "step": 14453 + }, + { + "epoch": 1.5873050735778609, + "grad_norm": 2.0851962566375732, + "learning_rate": 5e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7451351881027222, + "num_tokens": 374084036.0, + "step": 14454 + }, + { + "epoch": 1.5874148912804744, + "grad_norm": 1.8385578393936157, + "learning_rate": 5e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7538655400276184, + "num_tokens": 374108863.0, + "step": 14455 + }, + { + "epoch": 1.587524708983088, + "grad_norm": 1.9385451078414917, + "learning_rate": 5e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7359435558319092, + "num_tokens": 374132118.0, + "step": 14456 + }, + { + "epoch": 1.5876345266857017, + "grad_norm": 1.8900824785232544, + "learning_rate": 5e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7228832244873047, + "num_tokens": 374156713.0, + "step": 14457 + }, + { + "epoch": 1.5877443443883155, + "grad_norm": 1.6072031259536743, + "learning_rate": 5e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7074329853057861, + "num_tokens": 374191232.0, + "step": 14458 + }, + { + "epoch": 1.5878541620909292, + "grad_norm": 1.942476511001587, + "learning_rate": 5e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7532109022140503, + "num_tokens": 374213569.0, + "step": 14459 + }, + { + "epoch": 1.5879639797935428, + "grad_norm": 1.7326685190200806, + "learning_rate": 5e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7209922075271606, + "num_tokens": 374241868.0, + "step": 14460 + }, + { + "epoch": 1.5880737974961563, + "grad_norm": 1.977891445159912, + "learning_rate": 5e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7297215461730957, + "num_tokens": 374264003.0, + "step": 14461 + }, + { + "epoch": 1.58818361519877, + "grad_norm": 1.8899658918380737, + "learning_rate": 5e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7373250126838684, + "num_tokens": 374287421.0, + "step": 14462 + }, + { + "epoch": 1.5882934329013838, + "grad_norm": 1.7813562154769897, + "learning_rate": 5e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7189440727233887, + "num_tokens": 374316194.0, + "step": 14463 + }, + { + "epoch": 1.5884032506039973, + "grad_norm": 1.6702356338500977, + "learning_rate": 5e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7290185689926147, + "num_tokens": 374346214.0, + "step": 14464 + }, + { + "epoch": 1.5885130683066109, + "grad_norm": 1.9115383625030518, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7617619037628174, + "num_tokens": 374368056.0, + "step": 14465 + }, + { + "epoch": 1.5886228860092246, + "grad_norm": 1.7775745391845703, + "learning_rate": 5e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7552252411842346, + "num_tokens": 374391642.0, + "step": 14466 + }, + { + "epoch": 1.5887327037118384, + "grad_norm": 2.204453945159912, + "learning_rate": 5e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.6994925737380981, + "num_tokens": 374419011.0, + "step": 14467 + }, + { + "epoch": 1.5888425214144521, + "grad_norm": 1.7555075883865356, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7458707690238953, + "num_tokens": 374447098.0, + "step": 14468 + }, + { + "epoch": 1.5889523391170657, + "grad_norm": 1.8822822570800781, + "learning_rate": 5e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7296149134635925, + "num_tokens": 374472196.0, + "step": 14469 + }, + { + "epoch": 1.5890621568196792, + "grad_norm": 1.4998117685317993, + "learning_rate": 5e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7190889120101929, + "num_tokens": 374507885.0, + "step": 14470 + }, + { + "epoch": 1.589171974522293, + "grad_norm": 1.787996768951416, + "learning_rate": 5e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7490915060043335, + "num_tokens": 374532465.0, + "step": 14471 + }, + { + "epoch": 1.5892817922249067, + "grad_norm": 1.8671104907989502, + "learning_rate": 5e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7253418564796448, + "num_tokens": 374558522.0, + "step": 14472 + }, + { + "epoch": 1.5893916099275203, + "grad_norm": 1.6329798698425293, + "learning_rate": 5e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7331463098526001, + "num_tokens": 374589242.0, + "step": 14473 + }, + { + "epoch": 1.589501427630134, + "grad_norm": 1.943994402885437, + "learning_rate": 5e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7173083424568176, + "num_tokens": 374613758.0, + "step": 14474 + }, + { + "epoch": 1.5896112453327476, + "grad_norm": 2.0838308334350586, + "learning_rate": 5e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.7529593706130981, + "num_tokens": 374633341.0, + "step": 14475 + }, + { + "epoch": 1.5897210630353613, + "grad_norm": 1.8197532892227173, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7346263527870178, + "num_tokens": 374659450.0, + "step": 14476 + }, + { + "epoch": 1.589830880737975, + "grad_norm": 2.0575032234191895, + "learning_rate": 5e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7166720032691956, + "num_tokens": 374680628.0, + "step": 14477 + }, + { + "epoch": 1.5899406984405886, + "grad_norm": 1.8645023107528687, + "learning_rate": 5e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7593483924865723, + "num_tokens": 374704157.0, + "step": 14478 + }, + { + "epoch": 1.5900505161432021, + "grad_norm": 2.007675886154175, + "learning_rate": 5e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7202601432800293, + "num_tokens": 374730894.0, + "step": 14479 + }, + { + "epoch": 1.590160333845816, + "grad_norm": 1.7322900295257568, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.74411940574646, + "num_tokens": 374757666.0, + "step": 14480 + }, + { + "epoch": 1.5902701515484297, + "grad_norm": 2.006946325302124, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7385199666023254, + "num_tokens": 374777552.0, + "step": 14481 + }, + { + "epoch": 1.5903799692510434, + "grad_norm": 1.9056520462036133, + "learning_rate": 5e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7268185615539551, + "num_tokens": 374802443.0, + "step": 14482 + }, + { + "epoch": 1.590489786953657, + "grad_norm": 1.897727608680725, + "learning_rate": 5e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7248093485832214, + "num_tokens": 374829635.0, + "step": 14483 + }, + { + "epoch": 1.5905996046562705, + "grad_norm": 2.036844491958618, + "learning_rate": 5e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7200570106506348, + "num_tokens": 374853876.0, + "step": 14484 + }, + { + "epoch": 1.5907094223588842, + "grad_norm": 1.709258794784546, + "learning_rate": 5e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7151788473129272, + "num_tokens": 374882784.0, + "step": 14485 + }, + { + "epoch": 1.590819240061498, + "grad_norm": 1.5982460975646973, + "learning_rate": 5e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7350045442581177, + "num_tokens": 374913671.0, + "step": 14486 + }, + { + "epoch": 1.5909290577641115, + "grad_norm": 1.7545121908187866, + "learning_rate": 5e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7161588072776794, + "num_tokens": 374942774.0, + "step": 14487 + }, + { + "epoch": 1.5910388754667253, + "grad_norm": 1.899620771408081, + "learning_rate": 5e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7394940257072449, + "num_tokens": 374967340.0, + "step": 14488 + }, + { + "epoch": 1.5911486931693388, + "grad_norm": 2.0680885314941406, + "learning_rate": 5e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7400892972946167, + "num_tokens": 374990196.0, + "step": 14489 + }, + { + "epoch": 1.5912585108719526, + "grad_norm": 1.7533313035964966, + "learning_rate": 5e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7286222577095032, + "num_tokens": 375017745.0, + "step": 14490 + }, + { + "epoch": 1.5913683285745663, + "grad_norm": 1.9506325721740723, + "learning_rate": 5e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7434822916984558, + "num_tokens": 375039701.0, + "step": 14491 + }, + { + "epoch": 1.5914781462771799, + "grad_norm": 1.832504153251648, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7379048466682434, + "num_tokens": 375065109.0, + "step": 14492 + }, + { + "epoch": 1.5915879639797934, + "grad_norm": 1.844679355621338, + "learning_rate": 5e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7111221551895142, + "num_tokens": 375091504.0, + "step": 14493 + }, + { + "epoch": 1.5916977816824072, + "grad_norm": 1.6827298402786255, + "learning_rate": 5e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7553970217704773, + "num_tokens": 375118434.0, + "step": 14494 + }, + { + "epoch": 1.591807599385021, + "grad_norm": 1.9262709617614746, + "learning_rate": 5e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7215715050697327, + "num_tokens": 375143831.0, + "step": 14495 + }, + { + "epoch": 1.5919174170876347, + "grad_norm": 1.8190375566482544, + "learning_rate": 5e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7261248826980591, + "num_tokens": 375170061.0, + "step": 14496 + }, + { + "epoch": 1.5920272347902482, + "grad_norm": 1.8623172044754028, + "learning_rate": 5e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7559138536453247, + "num_tokens": 375193480.0, + "step": 14497 + }, + { + "epoch": 1.5921370524928617, + "grad_norm": 1.9830915927886963, + "learning_rate": 5e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7371450662612915, + "num_tokens": 375214941.0, + "step": 14498 + }, + { + "epoch": 1.5922468701954755, + "grad_norm": 1.8082590103149414, + "learning_rate": 5e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.724170446395874, + "num_tokens": 375239596.0, + "step": 14499 + }, + { + "epoch": 1.5923566878980893, + "grad_norm": 1.962004542350769, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7389965057373047, + "num_tokens": 375264797.0, + "step": 14500 + }, + { + "epoch": 1.5924665056007028, + "grad_norm": 1.9198843240737915, + "learning_rate": 5e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7395414113998413, + "num_tokens": 375288987.0, + "step": 14501 + }, + { + "epoch": 1.5925763233033163, + "grad_norm": 1.8650012016296387, + "learning_rate": 5e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.739755392074585, + "num_tokens": 375314933.0, + "step": 14502 + }, + { + "epoch": 1.59268614100593, + "grad_norm": 1.8323655128479004, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7290533781051636, + "num_tokens": 375343534.0, + "step": 14503 + }, + { + "epoch": 1.5927959587085438, + "grad_norm": 1.9054484367370605, + "learning_rate": 5e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7160682678222656, + "num_tokens": 375367997.0, + "step": 14504 + }, + { + "epoch": 1.5929057764111576, + "grad_norm": 1.979669451713562, + "learning_rate": 5e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.727403998374939, + "num_tokens": 375388853.0, + "step": 14505 + }, + { + "epoch": 1.5930155941137711, + "grad_norm": 1.7972476482391357, + "learning_rate": 5e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7410680651664734, + "num_tokens": 375414519.0, + "step": 14506 + }, + { + "epoch": 1.5931254118163847, + "grad_norm": 1.9686272144317627, + "learning_rate": 5e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7142992615699768, + "num_tokens": 375439139.0, + "step": 14507 + }, + { + "epoch": 1.5932352295189984, + "grad_norm": 1.7033417224884033, + "learning_rate": 5e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7332632541656494, + "num_tokens": 375471400.0, + "step": 14508 + }, + { + "epoch": 1.5933450472216122, + "grad_norm": 1.6678564548492432, + "learning_rate": 5e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7140024900436401, + "num_tokens": 375499654.0, + "step": 14509 + }, + { + "epoch": 1.593454864924226, + "grad_norm": 1.780273199081421, + "learning_rate": 5e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7139065265655518, + "num_tokens": 375525979.0, + "step": 14510 + }, + { + "epoch": 1.5935646826268395, + "grad_norm": 1.8726674318313599, + "learning_rate": 5e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7390346527099609, + "num_tokens": 375549757.0, + "step": 14511 + }, + { + "epoch": 1.593674500329453, + "grad_norm": 1.6306192874908447, + "learning_rate": 5e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7342909574508667, + "num_tokens": 375579802.0, + "step": 14512 + }, + { + "epoch": 1.5937843180320668, + "grad_norm": 2.034139633178711, + "learning_rate": 5e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.759928286075592, + "num_tokens": 375600119.0, + "step": 14513 + }, + { + "epoch": 1.5938941357346805, + "grad_norm": 1.8786777257919312, + "learning_rate": 5e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7141876816749573, + "num_tokens": 375626771.0, + "step": 14514 + }, + { + "epoch": 1.594003953437294, + "grad_norm": 1.8753023147583008, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.727226734161377, + "num_tokens": 375651113.0, + "step": 14515 + }, + { + "epoch": 1.5941137711399076, + "grad_norm": 1.6934727430343628, + "learning_rate": 5e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7288262844085693, + "num_tokens": 375677958.0, + "step": 14516 + }, + { + "epoch": 1.5942235888425214, + "grad_norm": 1.6818174123764038, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.71980881690979, + "num_tokens": 375709741.0, + "step": 14517 + }, + { + "epoch": 1.5943334065451351, + "grad_norm": 1.903430700302124, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.749237596988678, + "num_tokens": 375732200.0, + "step": 14518 + }, + { + "epoch": 1.5944432242477489, + "grad_norm": 1.7968766689300537, + "learning_rate": 5e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.70835280418396, + "num_tokens": 375762829.0, + "step": 14519 + }, + { + "epoch": 1.5945530419503624, + "grad_norm": 1.7580409049987793, + "learning_rate": 5e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7401295900344849, + "num_tokens": 375792364.0, + "step": 14520 + }, + { + "epoch": 1.594662859652976, + "grad_norm": 1.7846641540527344, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7276510000228882, + "num_tokens": 375819677.0, + "step": 14521 + }, + { + "epoch": 1.5947726773555897, + "grad_norm": 1.5812543630599976, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7441227436065674, + "num_tokens": 375852446.0, + "step": 14522 + }, + { + "epoch": 1.5948824950582035, + "grad_norm": 1.5112055540084839, + "learning_rate": 5e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7204443216323853, + "num_tokens": 375888180.0, + "step": 14523 + }, + { + "epoch": 1.5949923127608172, + "grad_norm": 1.8682122230529785, + "learning_rate": 5e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7220877408981323, + "num_tokens": 375915153.0, + "step": 14524 + }, + { + "epoch": 1.5951021304634307, + "grad_norm": 1.9381968975067139, + "learning_rate": 5e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.730392336845398, + "num_tokens": 375937453.0, + "step": 14525 + }, + { + "epoch": 1.5952119481660443, + "grad_norm": 1.8151471614837646, + "learning_rate": 5e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7097609043121338, + "num_tokens": 375966020.0, + "step": 14526 + }, + { + "epoch": 1.595321765868658, + "grad_norm": 1.9621944427490234, + "learning_rate": 5e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7507872581481934, + "num_tokens": 375987660.0, + "step": 14527 + }, + { + "epoch": 1.5954315835712718, + "grad_norm": 1.7879081964492798, + "learning_rate": 5e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7453714609146118, + "num_tokens": 376013069.0, + "step": 14528 + }, + { + "epoch": 1.5955414012738853, + "grad_norm": 1.9870742559432983, + "learning_rate": 5e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7201244831085205, + "num_tokens": 376037379.0, + "step": 14529 + }, + { + "epoch": 1.5956512189764989, + "grad_norm": 1.8266935348510742, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7284666299819946, + "num_tokens": 376064765.0, + "step": 14530 + }, + { + "epoch": 1.5957610366791126, + "grad_norm": 2.1813530921936035, + "learning_rate": 5e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7465227246284485, + "num_tokens": 376083898.0, + "step": 14531 + }, + { + "epoch": 1.5958708543817264, + "grad_norm": 2.237689733505249, + "learning_rate": 5e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.7471523284912109, + "num_tokens": 376101811.0, + "step": 14532 + }, + { + "epoch": 1.5959806720843401, + "grad_norm": 1.9457738399505615, + "learning_rate": 5e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7412599325180054, + "num_tokens": 376123646.0, + "step": 14533 + }, + { + "epoch": 1.5960904897869537, + "grad_norm": 1.6839749813079834, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7172038555145264, + "num_tokens": 376152130.0, + "step": 14534 + }, + { + "epoch": 1.5962003074895672, + "grad_norm": 1.815869927406311, + "learning_rate": 5e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.732970118522644, + "num_tokens": 376177395.0, + "step": 14535 + }, + { + "epoch": 1.596310125192181, + "grad_norm": 1.8108805418014526, + "learning_rate": 5e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.742254912853241, + "num_tokens": 376202842.0, + "step": 14536 + }, + { + "epoch": 1.5964199428947947, + "grad_norm": 1.8771616220474243, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7188144326210022, + "num_tokens": 376228058.0, + "step": 14537 + }, + { + "epoch": 1.5965297605974083, + "grad_norm": 2.0152411460876465, + "learning_rate": 5e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7166366577148438, + "num_tokens": 376252575.0, + "step": 14538 + }, + { + "epoch": 1.596639578300022, + "grad_norm": 1.8367429971694946, + "learning_rate": 5e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7142215967178345, + "num_tokens": 376279655.0, + "step": 14539 + }, + { + "epoch": 1.5967493960026355, + "grad_norm": 1.831377625465393, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7192708849906921, + "num_tokens": 376303992.0, + "step": 14540 + }, + { + "epoch": 1.5968592137052493, + "grad_norm": 1.6842596530914307, + "learning_rate": 5e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7242968082427979, + "num_tokens": 376332962.0, + "step": 14541 + }, + { + "epoch": 1.596969031407863, + "grad_norm": 1.853048324584961, + "learning_rate": 5e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7437705993652344, + "num_tokens": 376359838.0, + "step": 14542 + }, + { + "epoch": 1.5970788491104766, + "grad_norm": 1.746638536453247, + "learning_rate": 5e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7403590679168701, + "num_tokens": 376386938.0, + "step": 14543 + }, + { + "epoch": 1.5971886668130901, + "grad_norm": 1.5086617469787598, + "learning_rate": 5e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7067298293113708, + "num_tokens": 376422764.0, + "step": 14544 + }, + { + "epoch": 1.5972984845157039, + "grad_norm": 1.546712875366211, + "learning_rate": 5e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.743453860282898, + "num_tokens": 376454791.0, + "step": 14545 + }, + { + "epoch": 1.5974083022183176, + "grad_norm": 1.9053629636764526, + "learning_rate": 5e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7200139164924622, + "num_tokens": 376479417.0, + "step": 14546 + }, + { + "epoch": 1.5975181199209314, + "grad_norm": 1.7697534561157227, + "learning_rate": 5e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7132304310798645, + "num_tokens": 376507395.0, + "step": 14547 + }, + { + "epoch": 1.597627937623545, + "grad_norm": 1.8441245555877686, + "learning_rate": 5e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7345174551010132, + "num_tokens": 376532163.0, + "step": 14548 + }, + { + "epoch": 1.5977377553261585, + "grad_norm": 1.6965841054916382, + "learning_rate": 5e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7108199596405029, + "num_tokens": 376559420.0, + "step": 14549 + }, + { + "epoch": 1.5978475730287722, + "grad_norm": 1.8467161655426025, + "learning_rate": 5e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7292846441268921, + "num_tokens": 376584828.0, + "step": 14550 + }, + { + "epoch": 1.597957390731386, + "grad_norm": 1.7050949335098267, + "learning_rate": 5e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7241831421852112, + "num_tokens": 376615539.0, + "step": 14551 + }, + { + "epoch": 1.5980672084339995, + "grad_norm": 1.876839518547058, + "learning_rate": 5e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7400907278060913, + "num_tokens": 376641696.0, + "step": 14552 + }, + { + "epoch": 1.5981770261366133, + "grad_norm": 1.7477631568908691, + "learning_rate": 5e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7065304517745972, + "num_tokens": 376673000.0, + "step": 14553 + }, + { + "epoch": 1.5982868438392268, + "grad_norm": 2.0723073482513428, + "learning_rate": 5e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.748620867729187, + "num_tokens": 376693445.0, + "step": 14554 + }, + { + "epoch": 1.5983966615418406, + "grad_norm": 2.043750762939453, + "learning_rate": 5e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7354445457458496, + "num_tokens": 376714541.0, + "step": 14555 + }, + { + "epoch": 1.5985064792444543, + "grad_norm": 1.96846342086792, + "learning_rate": 5e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7256582379341125, + "num_tokens": 376736503.0, + "step": 14556 + }, + { + "epoch": 1.5986162969470679, + "grad_norm": 1.7437846660614014, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7196661829948425, + "num_tokens": 376763147.0, + "step": 14557 + }, + { + "epoch": 1.5987261146496814, + "grad_norm": 1.907220482826233, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7361827492713928, + "num_tokens": 376789195.0, + "step": 14558 + }, + { + "epoch": 1.5988359323522952, + "grad_norm": 1.908054232597351, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7678653001785278, + "num_tokens": 376810833.0, + "step": 14559 + }, + { + "epoch": 1.598945750054909, + "grad_norm": 1.8459423780441284, + "learning_rate": 5e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7487773299217224, + "num_tokens": 376834129.0, + "step": 14560 + }, + { + "epoch": 1.5990555677575227, + "grad_norm": 1.7474393844604492, + "learning_rate": 5e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7404930591583252, + "num_tokens": 376859334.0, + "step": 14561 + }, + { + "epoch": 1.5991653854601362, + "grad_norm": 1.85596764087677, + "learning_rate": 5e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7177204489707947, + "num_tokens": 376886094.0, + "step": 14562 + }, + { + "epoch": 1.5992752031627497, + "grad_norm": 1.7689011096954346, + "learning_rate": 5e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7241735458374023, + "num_tokens": 376912685.0, + "step": 14563 + }, + { + "epoch": 1.5993850208653635, + "grad_norm": 1.9185764789581299, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7519265413284302, + "num_tokens": 376937151.0, + "step": 14564 + }, + { + "epoch": 1.5994948385679773, + "grad_norm": 1.8497802019119263, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7204121351242065, + "num_tokens": 376963471.0, + "step": 14565 + }, + { + "epoch": 1.5996046562705908, + "grad_norm": 1.7768707275390625, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7330925464630127, + "num_tokens": 376990980.0, + "step": 14566 + }, + { + "epoch": 1.5997144739732043, + "grad_norm": 1.917966604232788, + "learning_rate": 5e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7285329103469849, + "num_tokens": 377017474.0, + "step": 14567 + }, + { + "epoch": 1.599824291675818, + "grad_norm": 1.9006928205490112, + "learning_rate": 5e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7279449701309204, + "num_tokens": 377044981.0, + "step": 14568 + }, + { + "epoch": 1.5999341093784318, + "grad_norm": 1.7801344394683838, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7699512243270874, + "num_tokens": 377067403.0, + "step": 14569 + }, + { + "epoch": 1.6000439270810456, + "grad_norm": 1.7965348958969116, + "learning_rate": 5e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7135438919067383, + "num_tokens": 377093977.0, + "step": 14570 + }, + { + "epoch": 1.6001537447836591, + "grad_norm": 1.9441505670547485, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7425042390823364, + "num_tokens": 377117077.0, + "step": 14571 + }, + { + "epoch": 1.6002635624862727, + "grad_norm": 1.9849258661270142, + "learning_rate": 5e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7344518899917603, + "num_tokens": 377141294.0, + "step": 14572 + }, + { + "epoch": 1.6003733801888864, + "grad_norm": 1.6892683506011963, + "learning_rate": 5e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7304366827011108, + "num_tokens": 377169265.0, + "step": 14573 + }, + { + "epoch": 1.6004831978915002, + "grad_norm": 1.8901498317718506, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.738197386264801, + "num_tokens": 377192665.0, + "step": 14574 + }, + { + "epoch": 1.600593015594114, + "grad_norm": 1.6948928833007812, + "learning_rate": 5e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7504990696907043, + "num_tokens": 377220459.0, + "step": 14575 + }, + { + "epoch": 1.6007028332967275, + "grad_norm": 2.0464420318603516, + "learning_rate": 5e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7483856678009033, + "num_tokens": 377242235.0, + "step": 14576 + }, + { + "epoch": 1.600812650999341, + "grad_norm": 1.7359349727630615, + "learning_rate": 5e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.7406954765319824, + "num_tokens": 377266836.0, + "step": 14577 + }, + { + "epoch": 1.6009224687019548, + "grad_norm": 1.6399606466293335, + "learning_rate": 5e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7303687334060669, + "num_tokens": 377298149.0, + "step": 14578 + }, + { + "epoch": 1.6010322864045685, + "grad_norm": 1.8437920808792114, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7438501119613647, + "num_tokens": 377322344.0, + "step": 14579 + }, + { + "epoch": 1.601142104107182, + "grad_norm": 1.891019344329834, + "learning_rate": 5e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7215591669082642, + "num_tokens": 377347401.0, + "step": 14580 + }, + { + "epoch": 1.6012519218097956, + "grad_norm": 1.8938469886779785, + "learning_rate": 5e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7365862727165222, + "num_tokens": 377373197.0, + "step": 14581 + }, + { + "epoch": 1.6013617395124093, + "grad_norm": 1.7169243097305298, + "learning_rate": 5e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7414976358413696, + "num_tokens": 377402682.0, + "step": 14582 + }, + { + "epoch": 1.601471557215023, + "grad_norm": 1.7810801267623901, + "learning_rate": 5e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.736558198928833, + "num_tokens": 377431352.0, + "step": 14583 + }, + { + "epoch": 1.6015813749176369, + "grad_norm": 1.6748948097229004, + "learning_rate": 5e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7097696661949158, + "num_tokens": 377465197.0, + "step": 14584 + }, + { + "epoch": 1.6016911926202504, + "grad_norm": 1.7216070890426636, + "learning_rate": 5e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7124961614608765, + "num_tokens": 377495113.0, + "step": 14585 + }, + { + "epoch": 1.601801010322864, + "grad_norm": 1.7804704904556274, + "learning_rate": 5e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7301433086395264, + "num_tokens": 377522196.0, + "step": 14586 + }, + { + "epoch": 1.6019108280254777, + "grad_norm": 1.728338360786438, + "learning_rate": 5e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7067297101020813, + "num_tokens": 377553712.0, + "step": 14587 + }, + { + "epoch": 1.6020206457280914, + "grad_norm": 1.720483660697937, + "learning_rate": 5e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7338863015174866, + "num_tokens": 377582091.0, + "step": 14588 + }, + { + "epoch": 1.602130463430705, + "grad_norm": 2.0078818798065186, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7286754846572876, + "num_tokens": 377601911.0, + "step": 14589 + }, + { + "epoch": 1.6022402811333187, + "grad_norm": 1.696974515914917, + "learning_rate": 5e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7332721948623657, + "num_tokens": 377629416.0, + "step": 14590 + }, + { + "epoch": 1.6023500988359323, + "grad_norm": 1.8503273725509644, + "learning_rate": 5e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7596873044967651, + "num_tokens": 377652554.0, + "step": 14591 + }, + { + "epoch": 1.602459916538546, + "grad_norm": 1.8998759984970093, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7664336562156677, + "num_tokens": 377672799.0, + "step": 14592 + }, + { + "epoch": 1.6025697342411598, + "grad_norm": 1.801975131034851, + "learning_rate": 5e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7448030710220337, + "num_tokens": 377697255.0, + "step": 14593 + }, + { + "epoch": 1.6026795519437733, + "grad_norm": 1.693162202835083, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.730772852897644, + "num_tokens": 377724914.0, + "step": 14594 + }, + { + "epoch": 1.6027893696463869, + "grad_norm": 1.7479408979415894, + "learning_rate": 5e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7311071753501892, + "num_tokens": 377752341.0, + "step": 14595 + }, + { + "epoch": 1.6028991873490006, + "grad_norm": 1.975000262260437, + "learning_rate": 5e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7151386737823486, + "num_tokens": 377776699.0, + "step": 14596 + }, + { + "epoch": 1.6030090050516144, + "grad_norm": 1.659170150756836, + "learning_rate": 5e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7117487192153931, + "num_tokens": 377810910.0, + "step": 14597 + }, + { + "epoch": 1.6031188227542281, + "grad_norm": 1.871383547782898, + "learning_rate": 5e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.71998131275177, + "num_tokens": 377836657.0, + "step": 14598 + }, + { + "epoch": 1.6032286404568417, + "grad_norm": 1.7760080099105835, + "learning_rate": 5e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7241606116294861, + "num_tokens": 377865443.0, + "step": 14599 + }, + { + "epoch": 1.6033384581594552, + "grad_norm": 1.812403678894043, + "learning_rate": 5e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7346140146255493, + "num_tokens": 377888401.0, + "step": 14600 + }, + { + "epoch": 1.603448275862069, + "grad_norm": 1.8363358974456787, + "learning_rate": 5e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7401882410049438, + "num_tokens": 377911824.0, + "step": 14601 + }, + { + "epoch": 1.6035580935646827, + "grad_norm": 1.7037744522094727, + "learning_rate": 5e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7341834902763367, + "num_tokens": 377939401.0, + "step": 14602 + }, + { + "epoch": 1.6036679112672962, + "grad_norm": 1.7265141010284424, + "learning_rate": 5e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7413192391395569, + "num_tokens": 377968225.0, + "step": 14603 + }, + { + "epoch": 1.60377772896991, + "grad_norm": 1.9853920936584473, + "learning_rate": 5e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7514365315437317, + "num_tokens": 377987790.0, + "step": 14604 + }, + { + "epoch": 1.6038875466725235, + "grad_norm": 1.5969865322113037, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7406654357910156, + "num_tokens": 378018290.0, + "step": 14605 + }, + { + "epoch": 1.6039973643751373, + "grad_norm": 1.8770896196365356, + "learning_rate": 5e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7375811338424683, + "num_tokens": 378041338.0, + "step": 14606 + }, + { + "epoch": 1.604107182077751, + "grad_norm": 1.7130460739135742, + "learning_rate": 5e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7519038915634155, + "num_tokens": 378069260.0, + "step": 14607 + }, + { + "epoch": 1.6042169997803646, + "grad_norm": 1.7146399021148682, + "learning_rate": 5e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7065133452415466, + "num_tokens": 378097456.0, + "step": 14608 + }, + { + "epoch": 1.6043268174829781, + "grad_norm": 1.9392863512039185, + "learning_rate": 5e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7325999140739441, + "num_tokens": 378120496.0, + "step": 14609 + }, + { + "epoch": 1.6044366351855919, + "grad_norm": 1.8793351650238037, + "learning_rate": 5e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7279433608055115, + "num_tokens": 378143345.0, + "step": 14610 + }, + { + "epoch": 1.6045464528882056, + "grad_norm": 1.5821032524108887, + "learning_rate": 5e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7232412695884705, + "num_tokens": 378175117.0, + "step": 14611 + }, + { + "epoch": 1.6046562705908194, + "grad_norm": 1.9495240449905396, + "learning_rate": 5e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7535274028778076, + "num_tokens": 378197344.0, + "step": 14612 + }, + { + "epoch": 1.604766088293433, + "grad_norm": 1.8575407266616821, + "learning_rate": 5e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7486073970794678, + "num_tokens": 378221113.0, + "step": 14613 + }, + { + "epoch": 1.6048759059960465, + "grad_norm": 1.8479607105255127, + "learning_rate": 5e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7139546871185303, + "num_tokens": 378249122.0, + "step": 14614 + }, + { + "epoch": 1.6049857236986602, + "grad_norm": 1.76747727394104, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7278343439102173, + "num_tokens": 378276587.0, + "step": 14615 + }, + { + "epoch": 1.605095541401274, + "grad_norm": 1.699693202972412, + "learning_rate": 5e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.7443517446517944, + "num_tokens": 378304989.0, + "step": 14616 + }, + { + "epoch": 1.6052053591038875, + "grad_norm": 1.9136593341827393, + "learning_rate": 5e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7402713894844055, + "num_tokens": 378329401.0, + "step": 14617 + }, + { + "epoch": 1.605315176806501, + "grad_norm": 1.7628954648971558, + "learning_rate": 5e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7546743154525757, + "num_tokens": 378355879.0, + "step": 14618 + }, + { + "epoch": 1.6054249945091148, + "grad_norm": 1.7216366529464722, + "learning_rate": 5e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7255858182907104, + "num_tokens": 378386045.0, + "step": 14619 + }, + { + "epoch": 1.6055348122117286, + "grad_norm": 1.6863365173339844, + "learning_rate": 5e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7453817129135132, + "num_tokens": 378414675.0, + "step": 14620 + }, + { + "epoch": 1.6056446299143423, + "grad_norm": 1.8800181150436401, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7364041805267334, + "num_tokens": 378438651.0, + "step": 14621 + }, + { + "epoch": 1.6057544476169558, + "grad_norm": 1.8366339206695557, + "learning_rate": 5e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7195562124252319, + "num_tokens": 378466126.0, + "step": 14622 + }, + { + "epoch": 1.6058642653195694, + "grad_norm": 2.026034355163574, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7350417375564575, + "num_tokens": 378487076.0, + "step": 14623 + }, + { + "epoch": 1.6059740830221831, + "grad_norm": 2.0399303436279297, + "learning_rate": 5e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7309144735336304, + "num_tokens": 378510112.0, + "step": 14624 + }, + { + "epoch": 1.606083900724797, + "grad_norm": 1.9441689252853394, + "learning_rate": 5e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7286241054534912, + "num_tokens": 378533721.0, + "step": 14625 + }, + { + "epoch": 1.6061937184274107, + "grad_norm": 2.205836534500122, + "learning_rate": 5e-06, + "loss": 0.7898, + "mean_token_accuracy": 0.7492353320121765, + "num_tokens": 378554383.0, + "step": 14626 + }, + { + "epoch": 1.6063035361300242, + "grad_norm": 1.6359367370605469, + "learning_rate": 5e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.702643871307373, + "num_tokens": 378588803.0, + "step": 14627 + }, + { + "epoch": 1.6064133538326377, + "grad_norm": 1.712811827659607, + "learning_rate": 5e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7231793403625488, + "num_tokens": 378616529.0, + "step": 14628 + }, + { + "epoch": 1.6065231715352515, + "grad_norm": 1.7056740522384644, + "learning_rate": 5e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7160939574241638, + "num_tokens": 378644092.0, + "step": 14629 + }, + { + "epoch": 1.6066329892378652, + "grad_norm": 1.8428066968917847, + "learning_rate": 5e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7236955761909485, + "num_tokens": 378671279.0, + "step": 14630 + }, + { + "epoch": 1.6067428069404788, + "grad_norm": 1.6841180324554443, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7293765544891357, + "num_tokens": 378700809.0, + "step": 14631 + }, + { + "epoch": 1.6068526246430923, + "grad_norm": 1.854788064956665, + "learning_rate": 5e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7498378753662109, + "num_tokens": 378726538.0, + "step": 14632 + }, + { + "epoch": 1.606962442345706, + "grad_norm": 1.8065898418426514, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7361913919448853, + "num_tokens": 378753002.0, + "step": 14633 + }, + { + "epoch": 1.6070722600483198, + "grad_norm": 1.7034766674041748, + "learning_rate": 5e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7219890356063843, + "num_tokens": 378782608.0, + "step": 14634 + }, + { + "epoch": 1.6071820777509336, + "grad_norm": 1.6564667224884033, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7260071635246277, + "num_tokens": 378813405.0, + "step": 14635 + }, + { + "epoch": 1.6072918954535471, + "grad_norm": 1.7398948669433594, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.73731529712677, + "num_tokens": 378840786.0, + "step": 14636 + }, + { + "epoch": 1.6074017131561606, + "grad_norm": 1.7980549335479736, + "learning_rate": 5e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7234591245651245, + "num_tokens": 378865620.0, + "step": 14637 + }, + { + "epoch": 1.6075115308587744, + "grad_norm": 1.9462004899978638, + "learning_rate": 5e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.705482006072998, + "num_tokens": 378892329.0, + "step": 14638 + }, + { + "epoch": 1.6076213485613882, + "grad_norm": 1.829228401184082, + "learning_rate": 5e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7497459650039673, + "num_tokens": 378918698.0, + "step": 14639 + }, + { + "epoch": 1.607731166264002, + "grad_norm": 1.8695634603500366, + "learning_rate": 5e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7202777862548828, + "num_tokens": 378943555.0, + "step": 14640 + }, + { + "epoch": 1.6078409839666155, + "grad_norm": 1.7196437120437622, + "learning_rate": 5e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7096884250640869, + "num_tokens": 378977153.0, + "step": 14641 + }, + { + "epoch": 1.607950801669229, + "grad_norm": 1.7563937902450562, + "learning_rate": 5e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7289565205574036, + "num_tokens": 379004154.0, + "step": 14642 + }, + { + "epoch": 1.6080606193718427, + "grad_norm": 1.9646365642547607, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7520670294761658, + "num_tokens": 379026934.0, + "step": 14643 + }, + { + "epoch": 1.6081704370744565, + "grad_norm": 1.8834233283996582, + "learning_rate": 5e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7185885310173035, + "num_tokens": 379052168.0, + "step": 14644 + }, + { + "epoch": 1.60828025477707, + "grad_norm": 1.803615689277649, + "learning_rate": 5e-06, + "loss": 0.785, + "mean_token_accuracy": 0.747833251953125, + "num_tokens": 379078258.0, + "step": 14645 + }, + { + "epoch": 1.6083900724796836, + "grad_norm": 2.0170140266418457, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.741243839263916, + "num_tokens": 379100138.0, + "step": 14646 + }, + { + "epoch": 1.6084998901822973, + "grad_norm": 1.8782671689987183, + "learning_rate": 5e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7489983439445496, + "num_tokens": 379123070.0, + "step": 14647 + }, + { + "epoch": 1.608609707884911, + "grad_norm": 1.6969726085662842, + "learning_rate": 5e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7583388686180115, + "num_tokens": 379149899.0, + "step": 14648 + }, + { + "epoch": 1.6087195255875248, + "grad_norm": 1.630950927734375, + "learning_rate": 5e-06, + "loss": 0.959, + "mean_token_accuracy": 0.699225902557373, + "num_tokens": 379182673.0, + "step": 14649 + }, + { + "epoch": 1.6088293432901384, + "grad_norm": 2.0559439659118652, + "learning_rate": 5e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.7437407374382019, + "num_tokens": 379202720.0, + "step": 14650 + }, + { + "epoch": 1.608939160992752, + "grad_norm": 1.7577134370803833, + "learning_rate": 5e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7326469421386719, + "num_tokens": 379232266.0, + "step": 14651 + }, + { + "epoch": 1.6090489786953657, + "grad_norm": 2.0060653686523438, + "learning_rate": 5e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7438170909881592, + "num_tokens": 379253663.0, + "step": 14652 + }, + { + "epoch": 1.6091587963979794, + "grad_norm": 1.9740865230560303, + "learning_rate": 5e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7449982762336731, + "num_tokens": 379276906.0, + "step": 14653 + }, + { + "epoch": 1.609268614100593, + "grad_norm": 1.70086669921875, + "learning_rate": 5e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7402292490005493, + "num_tokens": 379304208.0, + "step": 14654 + }, + { + "epoch": 1.6093784318032067, + "grad_norm": 1.8566267490386963, + "learning_rate": 5e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7454403638839722, + "num_tokens": 379327822.0, + "step": 14655 + }, + { + "epoch": 1.6094882495058203, + "grad_norm": 1.6385725736618042, + "learning_rate": 5e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.70681232213974, + "num_tokens": 379361526.0, + "step": 14656 + }, + { + "epoch": 1.609598067208434, + "grad_norm": 1.7676401138305664, + "learning_rate": 5e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7460818290710449, + "num_tokens": 379388346.0, + "step": 14657 + }, + { + "epoch": 1.6097078849110478, + "grad_norm": 1.7924861907958984, + "learning_rate": 5e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7268772125244141, + "num_tokens": 379413787.0, + "step": 14658 + }, + { + "epoch": 1.6098177026136613, + "grad_norm": 1.873020052909851, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7335506677627563, + "num_tokens": 379439161.0, + "step": 14659 + }, + { + "epoch": 1.6099275203162748, + "grad_norm": 1.969113826751709, + "learning_rate": 5e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7371665239334106, + "num_tokens": 379460947.0, + "step": 14660 + }, + { + "epoch": 1.6100373380188886, + "grad_norm": 1.8132703304290771, + "learning_rate": 5e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.730548083782196, + "num_tokens": 379486051.0, + "step": 14661 + }, + { + "epoch": 1.6101471557215024, + "grad_norm": 1.9579875469207764, + "learning_rate": 5e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7517393827438354, + "num_tokens": 379507849.0, + "step": 14662 + }, + { + "epoch": 1.6102569734241161, + "grad_norm": 1.8359580039978027, + "learning_rate": 5e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7237036228179932, + "num_tokens": 379533563.0, + "step": 14663 + }, + { + "epoch": 1.6103667911267296, + "grad_norm": 1.7021195888519287, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7224441766738892, + "num_tokens": 379561486.0, + "step": 14664 + }, + { + "epoch": 1.6104766088293432, + "grad_norm": 1.9056191444396973, + "learning_rate": 5e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7621477842330933, + "num_tokens": 379582595.0, + "step": 14665 + }, + { + "epoch": 1.610586426531957, + "grad_norm": 2.01422381401062, + "learning_rate": 5e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7298798561096191, + "num_tokens": 379603539.0, + "step": 14666 + }, + { + "epoch": 1.6106962442345707, + "grad_norm": 1.795800805091858, + "learning_rate": 5e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7557560205459595, + "num_tokens": 379628462.0, + "step": 14667 + }, + { + "epoch": 1.6108060619371842, + "grad_norm": 1.8146309852600098, + "learning_rate": 5e-06, + "loss": 0.891, + "mean_token_accuracy": 0.723233699798584, + "num_tokens": 379654739.0, + "step": 14668 + }, + { + "epoch": 1.610915879639798, + "grad_norm": 1.6746255159378052, + "learning_rate": 5e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7235619425773621, + "num_tokens": 379684500.0, + "step": 14669 + }, + { + "epoch": 1.6110256973424115, + "grad_norm": 1.9667798280715942, + "learning_rate": 5e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7152267694473267, + "num_tokens": 379708918.0, + "step": 14670 + }, + { + "epoch": 1.6111355150450253, + "grad_norm": 1.7416883707046509, + "learning_rate": 5e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7255926728248596, + "num_tokens": 379735528.0, + "step": 14671 + }, + { + "epoch": 1.611245332747639, + "grad_norm": 1.6178909540176392, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7208181023597717, + "num_tokens": 379767802.0, + "step": 14672 + }, + { + "epoch": 1.6113551504502526, + "grad_norm": 1.794542908668518, + "learning_rate": 5e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7317489385604858, + "num_tokens": 379793294.0, + "step": 14673 + }, + { + "epoch": 1.611464968152866, + "grad_norm": 1.8199570178985596, + "learning_rate": 5e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7433147430419922, + "num_tokens": 379817036.0, + "step": 14674 + }, + { + "epoch": 1.6115747858554799, + "grad_norm": 1.8783398866653442, + "learning_rate": 5e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7450501918792725, + "num_tokens": 379840839.0, + "step": 14675 + }, + { + "epoch": 1.6116846035580936, + "grad_norm": 1.8624969720840454, + "learning_rate": 5e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.745550274848938, + "num_tokens": 379865282.0, + "step": 14676 + }, + { + "epoch": 1.6117944212607074, + "grad_norm": 2.1485676765441895, + "learning_rate": 5e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7362574338912964, + "num_tokens": 379885619.0, + "step": 14677 + }, + { + "epoch": 1.611904238963321, + "grad_norm": 1.9930633306503296, + "learning_rate": 5e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7377310991287231, + "num_tokens": 379910716.0, + "step": 14678 + }, + { + "epoch": 1.6120140566659344, + "grad_norm": 1.6371166706085205, + "learning_rate": 5e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7008814215660095, + "num_tokens": 379943434.0, + "step": 14679 + }, + { + "epoch": 1.6121238743685482, + "grad_norm": 1.7730491161346436, + "learning_rate": 5e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7385939359664917, + "num_tokens": 379969728.0, + "step": 14680 + }, + { + "epoch": 1.612233692071162, + "grad_norm": 1.686231255531311, + "learning_rate": 5e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7251778841018677, + "num_tokens": 380001201.0, + "step": 14681 + }, + { + "epoch": 1.6123435097737755, + "grad_norm": 1.7510892152786255, + "learning_rate": 5e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7485647201538086, + "num_tokens": 380028178.0, + "step": 14682 + }, + { + "epoch": 1.612453327476389, + "grad_norm": 1.9969102144241333, + "learning_rate": 5e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7435230016708374, + "num_tokens": 380050281.0, + "step": 14683 + }, + { + "epoch": 1.6125631451790028, + "grad_norm": 1.9643992185592651, + "learning_rate": 5e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7440502643585205, + "num_tokens": 380072722.0, + "step": 14684 + }, + { + "epoch": 1.6126729628816165, + "grad_norm": 1.7706629037857056, + "learning_rate": 5e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7114869356155396, + "num_tokens": 380101949.0, + "step": 14685 + }, + { + "epoch": 1.6127827805842303, + "grad_norm": 1.9319230318069458, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7224892973899841, + "num_tokens": 380127281.0, + "step": 14686 + }, + { + "epoch": 1.6128925982868438, + "grad_norm": 1.9404948949813843, + "learning_rate": 5e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7603530883789062, + "num_tokens": 380149579.0, + "step": 14687 + }, + { + "epoch": 1.6130024159894574, + "grad_norm": 1.695701241493225, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7303099632263184, + "num_tokens": 380180461.0, + "step": 14688 + }, + { + "epoch": 1.6131122336920711, + "grad_norm": 2.077305793762207, + "learning_rate": 5e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7401803731918335, + "num_tokens": 380204656.0, + "step": 14689 + }, + { + "epoch": 1.6132220513946849, + "grad_norm": 1.8747416734695435, + "learning_rate": 5e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7275364398956299, + "num_tokens": 380230406.0, + "step": 14690 + }, + { + "epoch": 1.6133318690972986, + "grad_norm": 2.1180830001831055, + "learning_rate": 5e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7509723901748657, + "num_tokens": 380249928.0, + "step": 14691 + }, + { + "epoch": 1.6134416867999122, + "grad_norm": 1.9461722373962402, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7221299409866333, + "num_tokens": 380275775.0, + "step": 14692 + }, + { + "epoch": 1.6135515045025257, + "grad_norm": 1.982935905456543, + "learning_rate": 5e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7378761768341064, + "num_tokens": 380302317.0, + "step": 14693 + }, + { + "epoch": 1.6136613222051395, + "grad_norm": 2.1047964096069336, + "learning_rate": 5e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7327990531921387, + "num_tokens": 380324051.0, + "step": 14694 + }, + { + "epoch": 1.6137711399077532, + "grad_norm": 1.847659707069397, + "learning_rate": 5e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7164345383644104, + "num_tokens": 380350602.0, + "step": 14695 + }, + { + "epoch": 1.6138809576103668, + "grad_norm": 1.9939489364624023, + "learning_rate": 5e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7437776923179626, + "num_tokens": 380371760.0, + "step": 14696 + }, + { + "epoch": 1.6139907753129803, + "grad_norm": 1.7907272577285767, + "learning_rate": 5e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.742077112197876, + "num_tokens": 380399465.0, + "step": 14697 + }, + { + "epoch": 1.614100593015594, + "grad_norm": 2.078108787536621, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7372102737426758, + "num_tokens": 380420619.0, + "step": 14698 + }, + { + "epoch": 1.6142104107182078, + "grad_norm": 1.895833134651184, + "learning_rate": 5e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7093737721443176, + "num_tokens": 380446243.0, + "step": 14699 + }, + { + "epoch": 1.6143202284208216, + "grad_norm": 1.7621526718139648, + "learning_rate": 5e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7232271432876587, + "num_tokens": 380475061.0, + "step": 14700 + }, + { + "epoch": 1.614430046123435, + "grad_norm": 1.7995191812515259, + "learning_rate": 5e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7192888259887695, + "num_tokens": 380500270.0, + "step": 14701 + }, + { + "epoch": 1.6145398638260486, + "grad_norm": 1.6318635940551758, + "learning_rate": 5e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7336580753326416, + "num_tokens": 380531232.0, + "step": 14702 + }, + { + "epoch": 1.6146496815286624, + "grad_norm": 1.8802274465560913, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7254513502120972, + "num_tokens": 380555479.0, + "step": 14703 + }, + { + "epoch": 1.6147594992312762, + "grad_norm": 1.997444748878479, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.730130136013031, + "num_tokens": 380577497.0, + "step": 14704 + }, + { + "epoch": 1.61486931693389, + "grad_norm": 1.8677281141281128, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7218599319458008, + "num_tokens": 380605116.0, + "step": 14705 + }, + { + "epoch": 1.6149791346365034, + "grad_norm": 2.0342838764190674, + "learning_rate": 5e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7476903796195984, + "num_tokens": 380624779.0, + "step": 14706 + }, + { + "epoch": 1.615088952339117, + "grad_norm": 2.1211869716644287, + "learning_rate": 5e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7670749425888062, + "num_tokens": 380643202.0, + "step": 14707 + }, + { + "epoch": 1.6151987700417307, + "grad_norm": 1.8300070762634277, + "learning_rate": 5e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7627975940704346, + "num_tokens": 380668074.0, + "step": 14708 + }, + { + "epoch": 1.6153085877443445, + "grad_norm": 2.0153684616088867, + "learning_rate": 5e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7052054405212402, + "num_tokens": 380693100.0, + "step": 14709 + }, + { + "epoch": 1.615418405446958, + "grad_norm": 1.9643408060073853, + "learning_rate": 5e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7486319541931152, + "num_tokens": 380715424.0, + "step": 14710 + }, + { + "epoch": 1.6155282231495716, + "grad_norm": 2.035210609436035, + "learning_rate": 5e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7318717241287231, + "num_tokens": 380740487.0, + "step": 14711 + }, + { + "epoch": 1.6156380408521853, + "grad_norm": 1.9454081058502197, + "learning_rate": 5e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7219565510749817, + "num_tokens": 380764858.0, + "step": 14712 + }, + { + "epoch": 1.615747858554799, + "grad_norm": 1.6062010526657104, + "learning_rate": 5e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7215839624404907, + "num_tokens": 380798454.0, + "step": 14713 + }, + { + "epoch": 1.6158576762574128, + "grad_norm": 1.5387294292449951, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7312389016151428, + "num_tokens": 380832495.0, + "step": 14714 + }, + { + "epoch": 1.6159674939600264, + "grad_norm": 1.69419527053833, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.724897027015686, + "num_tokens": 380862004.0, + "step": 14715 + }, + { + "epoch": 1.61607731166264, + "grad_norm": 1.9293204545974731, + "learning_rate": 5e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7649794220924377, + "num_tokens": 380882248.0, + "step": 14716 + }, + { + "epoch": 1.6161871293652537, + "grad_norm": 2.0722756385803223, + "learning_rate": 5e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7368787527084351, + "num_tokens": 380906223.0, + "step": 14717 + }, + { + "epoch": 1.6162969470678674, + "grad_norm": 1.8426083326339722, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7217432260513306, + "num_tokens": 380931458.0, + "step": 14718 + }, + { + "epoch": 1.616406764770481, + "grad_norm": 1.8536248207092285, + "learning_rate": 5e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7290030121803284, + "num_tokens": 380956076.0, + "step": 14719 + }, + { + "epoch": 1.6165165824730947, + "grad_norm": 1.8303600549697876, + "learning_rate": 5e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.7552028894424438, + "num_tokens": 380979946.0, + "step": 14720 + }, + { + "epoch": 1.6166264001757082, + "grad_norm": 1.8912748098373413, + "learning_rate": 5e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7410100102424622, + "num_tokens": 381003539.0, + "step": 14721 + }, + { + "epoch": 1.616736217878322, + "grad_norm": 1.7554306983947754, + "learning_rate": 5e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7340978980064392, + "num_tokens": 381032062.0, + "step": 14722 + }, + { + "epoch": 1.6168460355809358, + "grad_norm": 1.8070602416992188, + "learning_rate": 5e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7256674766540527, + "num_tokens": 381056350.0, + "step": 14723 + }, + { + "epoch": 1.6169558532835493, + "grad_norm": 1.8786031007766724, + "learning_rate": 5e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7324491739273071, + "num_tokens": 381082251.0, + "step": 14724 + }, + { + "epoch": 1.6170656709861628, + "grad_norm": 1.8358676433563232, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7419747710227966, + "num_tokens": 381109273.0, + "step": 14725 + }, + { + "epoch": 1.6171754886887766, + "grad_norm": 1.8023630380630493, + "learning_rate": 5e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7067366242408752, + "num_tokens": 381134287.0, + "step": 14726 + }, + { + "epoch": 1.6172853063913903, + "grad_norm": 2.036752223968506, + "learning_rate": 5e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7305203080177307, + "num_tokens": 381156785.0, + "step": 14727 + }, + { + "epoch": 1.617395124094004, + "grad_norm": 1.9411349296569824, + "learning_rate": 5e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7460956573486328, + "num_tokens": 381177877.0, + "step": 14728 + }, + { + "epoch": 1.6175049417966176, + "grad_norm": 1.9212969541549683, + "learning_rate": 5e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.7451078295707703, + "num_tokens": 381200375.0, + "step": 14729 + }, + { + "epoch": 1.6176147594992312, + "grad_norm": 1.7895585298538208, + "learning_rate": 5e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7426930665969849, + "num_tokens": 381225005.0, + "step": 14730 + }, + { + "epoch": 1.617724577201845, + "grad_norm": 1.8447380065917969, + "learning_rate": 5e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7341436147689819, + "num_tokens": 381248914.0, + "step": 14731 + }, + { + "epoch": 1.6178343949044587, + "grad_norm": 2.045767068862915, + "learning_rate": 5e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7376342415809631, + "num_tokens": 381270685.0, + "step": 14732 + }, + { + "epoch": 1.6179442126070722, + "grad_norm": 2.017333984375, + "learning_rate": 5e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7425261735916138, + "num_tokens": 381292422.0, + "step": 14733 + }, + { + "epoch": 1.618054030309686, + "grad_norm": 1.917235255241394, + "learning_rate": 5e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7080760598182678, + "num_tokens": 381320610.0, + "step": 14734 + }, + { + "epoch": 1.6181638480122995, + "grad_norm": 2.0121207237243652, + "learning_rate": 5e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7441405057907104, + "num_tokens": 381342015.0, + "step": 14735 + }, + { + "epoch": 1.6182736657149133, + "grad_norm": 1.9512816667556763, + "learning_rate": 5e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.7684780359268188, + "num_tokens": 381363480.0, + "step": 14736 + }, + { + "epoch": 1.618383483417527, + "grad_norm": 1.9203276634216309, + "learning_rate": 5e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7354124188423157, + "num_tokens": 381387336.0, + "step": 14737 + }, + { + "epoch": 1.6184933011201406, + "grad_norm": 1.7829747200012207, + "learning_rate": 5e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7204094529151917, + "num_tokens": 381415585.0, + "step": 14738 + }, + { + "epoch": 1.618603118822754, + "grad_norm": 1.9022560119628906, + "learning_rate": 5e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7337247133255005, + "num_tokens": 381439478.0, + "step": 14739 + }, + { + "epoch": 1.6187129365253679, + "grad_norm": 1.8474395275115967, + "learning_rate": 5e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7199647426605225, + "num_tokens": 381465761.0, + "step": 14740 + }, + { + "epoch": 1.6188227542279816, + "grad_norm": 1.8046472072601318, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7405210733413696, + "num_tokens": 381490563.0, + "step": 14741 + }, + { + "epoch": 1.6189325719305954, + "grad_norm": 1.5944573879241943, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7475877404212952, + "num_tokens": 381520098.0, + "step": 14742 + }, + { + "epoch": 1.619042389633209, + "grad_norm": 1.6305099725723267, + "learning_rate": 5e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7169951796531677, + "num_tokens": 381553965.0, + "step": 14743 + }, + { + "epoch": 1.6191522073358224, + "grad_norm": 1.6811705827713013, + "learning_rate": 5e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.724158525466919, + "num_tokens": 381584954.0, + "step": 14744 + }, + { + "epoch": 1.6192620250384362, + "grad_norm": 1.7107021808624268, + "learning_rate": 5e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7294081449508667, + "num_tokens": 381617150.0, + "step": 14745 + }, + { + "epoch": 1.61937184274105, + "grad_norm": 1.9622336626052856, + "learning_rate": 5e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.74782395362854, + "num_tokens": 381639537.0, + "step": 14746 + }, + { + "epoch": 1.6194816604436635, + "grad_norm": 2.0765655040740967, + "learning_rate": 5e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7216619849205017, + "num_tokens": 381661068.0, + "step": 14747 + }, + { + "epoch": 1.619591478146277, + "grad_norm": 1.7211397886276245, + "learning_rate": 5e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7232370376586914, + "num_tokens": 381689995.0, + "step": 14748 + }, + { + "epoch": 1.6197012958488908, + "grad_norm": 1.900335431098938, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7250168323516846, + "num_tokens": 381715116.0, + "step": 14749 + }, + { + "epoch": 1.6198111135515045, + "grad_norm": 1.6882708072662354, + "learning_rate": 5e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7316510677337646, + "num_tokens": 381745651.0, + "step": 14750 + }, + { + "epoch": 1.6199209312541183, + "grad_norm": 1.7795867919921875, + "learning_rate": 5e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7242963314056396, + "num_tokens": 381772769.0, + "step": 14751 + }, + { + "epoch": 1.6200307489567318, + "grad_norm": 1.717659831047058, + "learning_rate": 5e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7296627759933472, + "num_tokens": 381799245.0, + "step": 14752 + }, + { + "epoch": 1.6201405666593454, + "grad_norm": 1.706601619720459, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.716580331325531, + "num_tokens": 381827576.0, + "step": 14753 + }, + { + "epoch": 1.6202503843619591, + "grad_norm": 1.710520625114441, + "learning_rate": 5e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7140088081359863, + "num_tokens": 381854752.0, + "step": 14754 + }, + { + "epoch": 1.6203602020645729, + "grad_norm": 1.6321849822998047, + "learning_rate": 5e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7305609583854675, + "num_tokens": 381884527.0, + "step": 14755 + }, + { + "epoch": 1.6204700197671866, + "grad_norm": 2.1096060276031494, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7163589596748352, + "num_tokens": 381906223.0, + "step": 14756 + }, + { + "epoch": 1.6205798374698002, + "grad_norm": 1.8934015035629272, + "learning_rate": 5e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7297112345695496, + "num_tokens": 381931287.0, + "step": 14757 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 1.811977744102478, + "learning_rate": 5e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7418834567070007, + "num_tokens": 381956609.0, + "step": 14758 + }, + { + "epoch": 1.6207994728750275, + "grad_norm": 1.7623543739318848, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7236728668212891, + "num_tokens": 381982047.0, + "step": 14759 + }, + { + "epoch": 1.6209092905776412, + "grad_norm": 1.6999123096466064, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7541896104812622, + "num_tokens": 382009899.0, + "step": 14760 + }, + { + "epoch": 1.6210191082802548, + "grad_norm": 1.5253572463989258, + "learning_rate": 5e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7077928781509399, + "num_tokens": 382045306.0, + "step": 14761 + }, + { + "epoch": 1.6211289259828683, + "grad_norm": 1.7924190759658813, + "learning_rate": 5e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.708409309387207, + "num_tokens": 382072107.0, + "step": 14762 + }, + { + "epoch": 1.621238743685482, + "grad_norm": 1.6829291582107544, + "learning_rate": 5e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7579807043075562, + "num_tokens": 382099038.0, + "step": 14763 + }, + { + "epoch": 1.6213485613880958, + "grad_norm": 1.8701964616775513, + "learning_rate": 5e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7414131164550781, + "num_tokens": 382126234.0, + "step": 14764 + }, + { + "epoch": 1.6214583790907096, + "grad_norm": 1.8221114873886108, + "learning_rate": 5e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7122644186019897, + "num_tokens": 382153885.0, + "step": 14765 + }, + { + "epoch": 1.621568196793323, + "grad_norm": 2.295207977294922, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7330620288848877, + "num_tokens": 382172610.0, + "step": 14766 + }, + { + "epoch": 1.6216780144959366, + "grad_norm": 2.1282925605773926, + "learning_rate": 5e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7398653626441956, + "num_tokens": 382192284.0, + "step": 14767 + }, + { + "epoch": 1.6217878321985504, + "grad_norm": 1.8406012058258057, + "learning_rate": 5e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7124499082565308, + "num_tokens": 382219903.0, + "step": 14768 + }, + { + "epoch": 1.6218976499011641, + "grad_norm": 1.824200987815857, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7182837724685669, + "num_tokens": 382247471.0, + "step": 14769 + }, + { + "epoch": 1.6220074676037777, + "grad_norm": 1.70112144947052, + "learning_rate": 5e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.6950794458389282, + "num_tokens": 382279422.0, + "step": 14770 + }, + { + "epoch": 1.6221172853063914, + "grad_norm": 2.0732932090759277, + "learning_rate": 5e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7553172707557678, + "num_tokens": 382298601.0, + "step": 14771 + }, + { + "epoch": 1.622227103009005, + "grad_norm": 1.8710501194000244, + "learning_rate": 5e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7391265630722046, + "num_tokens": 382322873.0, + "step": 14772 + }, + { + "epoch": 1.6223369207116187, + "grad_norm": 1.8236889839172363, + "learning_rate": 5e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.7541804313659668, + "num_tokens": 382345281.0, + "step": 14773 + }, + { + "epoch": 1.6224467384142325, + "grad_norm": 1.8607630729675293, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7036909461021423, + "num_tokens": 382371545.0, + "step": 14774 + }, + { + "epoch": 1.622556556116846, + "grad_norm": 1.9846984148025513, + "learning_rate": 5e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7147634029388428, + "num_tokens": 382396715.0, + "step": 14775 + }, + { + "epoch": 1.6226663738194596, + "grad_norm": 1.946460485458374, + "learning_rate": 5e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7240527868270874, + "num_tokens": 382421154.0, + "step": 14776 + }, + { + "epoch": 1.6227761915220733, + "grad_norm": 1.924384355545044, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.726012110710144, + "num_tokens": 382443503.0, + "step": 14777 + }, + { + "epoch": 1.622886009224687, + "grad_norm": 2.23239803314209, + "learning_rate": 5e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7487179040908813, + "num_tokens": 382460753.0, + "step": 14778 + }, + { + "epoch": 1.6229958269273008, + "grad_norm": 1.6053013801574707, + "learning_rate": 5e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7057696580886841, + "num_tokens": 382491883.0, + "step": 14779 + }, + { + "epoch": 1.6231056446299144, + "grad_norm": 1.672351360321045, + "learning_rate": 5e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7360813617706299, + "num_tokens": 382520737.0, + "step": 14780 + }, + { + "epoch": 1.623215462332528, + "grad_norm": 1.845322608947754, + "learning_rate": 5e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7516445517539978, + "num_tokens": 382546370.0, + "step": 14781 + }, + { + "epoch": 1.6233252800351416, + "grad_norm": 1.9149541854858398, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7323902249336243, + "num_tokens": 382572945.0, + "step": 14782 + }, + { + "epoch": 1.6234350977377554, + "grad_norm": 1.8095297813415527, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7609906792640686, + "num_tokens": 382596332.0, + "step": 14783 + }, + { + "epoch": 1.623544915440369, + "grad_norm": 1.8546936511993408, + "learning_rate": 5e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.744022011756897, + "num_tokens": 382621871.0, + "step": 14784 + }, + { + "epoch": 1.6236547331429827, + "grad_norm": 1.765313982963562, + "learning_rate": 5e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7109915018081665, + "num_tokens": 382652654.0, + "step": 14785 + }, + { + "epoch": 1.6237645508455962, + "grad_norm": 1.8346139192581177, + "learning_rate": 5e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7287071347236633, + "num_tokens": 382681107.0, + "step": 14786 + }, + { + "epoch": 1.62387436854821, + "grad_norm": 1.8019959926605225, + "learning_rate": 5e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7558084726333618, + "num_tokens": 382705886.0, + "step": 14787 + }, + { + "epoch": 1.6239841862508237, + "grad_norm": 1.9187278747558594, + "learning_rate": 5e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7252211570739746, + "num_tokens": 382731738.0, + "step": 14788 + }, + { + "epoch": 1.6240940039534373, + "grad_norm": 1.821124792098999, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7521650791168213, + "num_tokens": 382757740.0, + "step": 14789 + }, + { + "epoch": 1.6242038216560508, + "grad_norm": 1.8033353090286255, + "learning_rate": 5e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7358624339103699, + "num_tokens": 382784233.0, + "step": 14790 + }, + { + "epoch": 1.6243136393586646, + "grad_norm": 1.9648748636245728, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7457016706466675, + "num_tokens": 382808470.0, + "step": 14791 + }, + { + "epoch": 1.6244234570612783, + "grad_norm": 1.8094700574874878, + "learning_rate": 5e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7347795963287354, + "num_tokens": 382833336.0, + "step": 14792 + }, + { + "epoch": 1.624533274763892, + "grad_norm": 1.836510419845581, + "learning_rate": 5e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7146764993667603, + "num_tokens": 382859008.0, + "step": 14793 + }, + { + "epoch": 1.6246430924665056, + "grad_norm": 1.98225736618042, + "learning_rate": 5e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7357568144798279, + "num_tokens": 382881885.0, + "step": 14794 + }, + { + "epoch": 1.6247529101691192, + "grad_norm": 1.637749195098877, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7199052572250366, + "num_tokens": 382914577.0, + "step": 14795 + }, + { + "epoch": 1.624862727871733, + "grad_norm": 1.9549528360366821, + "learning_rate": 5e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7489345073699951, + "num_tokens": 382935542.0, + "step": 14796 + }, + { + "epoch": 1.6249725455743467, + "grad_norm": 2.253985643386841, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7504334449768066, + "num_tokens": 382954654.0, + "step": 14797 + }, + { + "epoch": 1.6250823632769602, + "grad_norm": 1.9854152202606201, + "learning_rate": 5e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7353860139846802, + "num_tokens": 382977010.0, + "step": 14798 + }, + { + "epoch": 1.6251921809795737, + "grad_norm": 2.11299991607666, + "learning_rate": 5e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7436853647232056, + "num_tokens": 382996938.0, + "step": 14799 + }, + { + "epoch": 1.6253019986821875, + "grad_norm": 1.720755934715271, + "learning_rate": 5e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7187015414237976, + "num_tokens": 383026061.0, + "step": 14800 + }, + { + "epoch": 1.6254118163848013, + "grad_norm": 1.860770583152771, + "learning_rate": 5e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7258864045143127, + "num_tokens": 383050770.0, + "step": 14801 + }, + { + "epoch": 1.625521634087415, + "grad_norm": 1.7658264636993408, + "learning_rate": 5e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7367624044418335, + "num_tokens": 383074489.0, + "step": 14802 + }, + { + "epoch": 1.6256314517900285, + "grad_norm": 1.8719271421432495, + "learning_rate": 5e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7126555442810059, + "num_tokens": 383101323.0, + "step": 14803 + }, + { + "epoch": 1.625741269492642, + "grad_norm": 1.870741367340088, + "learning_rate": 5e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7399656176567078, + "num_tokens": 383125418.0, + "step": 14804 + }, + { + "epoch": 1.6258510871952558, + "grad_norm": 1.7334626913070679, + "learning_rate": 5e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.742244303226471, + "num_tokens": 383152325.0, + "step": 14805 + }, + { + "epoch": 1.6259609048978696, + "grad_norm": 1.583588719367981, + "learning_rate": 5e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7162356376647949, + "num_tokens": 383186966.0, + "step": 14806 + }, + { + "epoch": 1.6260707226004834, + "grad_norm": 1.78895902633667, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7400588989257812, + "num_tokens": 383212012.0, + "step": 14807 + }, + { + "epoch": 1.6261805403030969, + "grad_norm": 1.5925581455230713, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7142132520675659, + "num_tokens": 383243907.0, + "step": 14808 + }, + { + "epoch": 1.6262903580057104, + "grad_norm": 2.0318679809570312, + "learning_rate": 5e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7510741949081421, + "num_tokens": 383263853.0, + "step": 14809 + }, + { + "epoch": 1.6264001757083242, + "grad_norm": 1.8843159675598145, + "learning_rate": 5e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7421577572822571, + "num_tokens": 383288226.0, + "step": 14810 + }, + { + "epoch": 1.626509993410938, + "grad_norm": 1.9890989065170288, + "learning_rate": 5e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7305509448051453, + "num_tokens": 383310229.0, + "step": 14811 + }, + { + "epoch": 1.6266198111135515, + "grad_norm": 1.6760919094085693, + "learning_rate": 5e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7577035427093506, + "num_tokens": 383339118.0, + "step": 14812 + }, + { + "epoch": 1.626729628816165, + "grad_norm": 1.6047124862670898, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7344117164611816, + "num_tokens": 383368722.0, + "step": 14813 + }, + { + "epoch": 1.6268394465187788, + "grad_norm": 1.9043500423431396, + "learning_rate": 5e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.702460527420044, + "num_tokens": 383394390.0, + "step": 14814 + }, + { + "epoch": 1.6269492642213925, + "grad_norm": 1.797951340675354, + "learning_rate": 5e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7132426500320435, + "num_tokens": 383423722.0, + "step": 14815 + }, + { + "epoch": 1.6270590819240063, + "grad_norm": 1.974829077720642, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7284445762634277, + "num_tokens": 383448480.0, + "step": 14816 + }, + { + "epoch": 1.6271688996266198, + "grad_norm": 2.0674381256103516, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7560538053512573, + "num_tokens": 383467831.0, + "step": 14817 + }, + { + "epoch": 1.6272787173292333, + "grad_norm": 1.8673224449157715, + "learning_rate": 5e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7367098331451416, + "num_tokens": 383492135.0, + "step": 14818 + }, + { + "epoch": 1.627388535031847, + "grad_norm": 1.6290706396102905, + "learning_rate": 5e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7233869433403015, + "num_tokens": 383523889.0, + "step": 14819 + }, + { + "epoch": 1.6274983527344609, + "grad_norm": 2.056218385696411, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7533460855484009, + "num_tokens": 383544010.0, + "step": 14820 + }, + { + "epoch": 1.6276081704370746, + "grad_norm": 2.0097036361694336, + "learning_rate": 5e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7402048110961914, + "num_tokens": 383564821.0, + "step": 14821 + }, + { + "epoch": 1.6277179881396882, + "grad_norm": 1.6973828077316284, + "learning_rate": 5e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7389910221099854, + "num_tokens": 383593518.0, + "step": 14822 + }, + { + "epoch": 1.6278278058423017, + "grad_norm": 1.8380827903747559, + "learning_rate": 5e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7313896417617798, + "num_tokens": 383618475.0, + "step": 14823 + }, + { + "epoch": 1.6279376235449154, + "grad_norm": 1.9667178392410278, + "learning_rate": 5e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7452769875526428, + "num_tokens": 383640428.0, + "step": 14824 + }, + { + "epoch": 1.6280474412475292, + "grad_norm": 2.151864767074585, + "learning_rate": 5e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7536939978599548, + "num_tokens": 383658326.0, + "step": 14825 + }, + { + "epoch": 1.6281572589501427, + "grad_norm": 1.8951804637908936, + "learning_rate": 5e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7271236181259155, + "num_tokens": 383682870.0, + "step": 14826 + }, + { + "epoch": 1.6282670766527563, + "grad_norm": 1.8612444400787354, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7248800992965698, + "num_tokens": 383711368.0, + "step": 14827 + }, + { + "epoch": 1.62837689435537, + "grad_norm": 1.9675679206848145, + "learning_rate": 5e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7319673299789429, + "num_tokens": 383737670.0, + "step": 14828 + }, + { + "epoch": 1.6284867120579838, + "grad_norm": 1.7030164003372192, + "learning_rate": 5e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.719336986541748, + "num_tokens": 383767202.0, + "step": 14829 + }, + { + "epoch": 1.6285965297605975, + "grad_norm": 1.9337170124053955, + "learning_rate": 5e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7119712233543396, + "num_tokens": 383791887.0, + "step": 14830 + }, + { + "epoch": 1.628706347463211, + "grad_norm": 1.8833281993865967, + "learning_rate": 5e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7369675636291504, + "num_tokens": 383815270.0, + "step": 14831 + }, + { + "epoch": 1.6288161651658246, + "grad_norm": 1.6982468366622925, + "learning_rate": 5e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7351288795471191, + "num_tokens": 383846242.0, + "step": 14832 + }, + { + "epoch": 1.6289259828684384, + "grad_norm": 1.8910495042800903, + "learning_rate": 5e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7365148067474365, + "num_tokens": 383869721.0, + "step": 14833 + }, + { + "epoch": 1.6290358005710521, + "grad_norm": 1.9045164585113525, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7467529773712158, + "num_tokens": 383893391.0, + "step": 14834 + }, + { + "epoch": 1.6291456182736657, + "grad_norm": 1.7993043661117554, + "learning_rate": 5e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7400000095367432, + "num_tokens": 383920190.0, + "step": 14835 + }, + { + "epoch": 1.6292554359762794, + "grad_norm": 1.7967960834503174, + "learning_rate": 5e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.744820237159729, + "num_tokens": 383943711.0, + "step": 14836 + }, + { + "epoch": 1.629365253678893, + "grad_norm": 1.7297173738479614, + "learning_rate": 5e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7021933197975159, + "num_tokens": 383974415.0, + "step": 14837 + }, + { + "epoch": 1.6294750713815067, + "grad_norm": 1.966299295425415, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7217282056808472, + "num_tokens": 383999070.0, + "step": 14838 + }, + { + "epoch": 1.6295848890841205, + "grad_norm": 1.7191470861434937, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7435706257820129, + "num_tokens": 384027128.0, + "step": 14839 + }, + { + "epoch": 1.629694706786734, + "grad_norm": 1.9891471862792969, + "learning_rate": 5e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7308825850486755, + "num_tokens": 384049012.0, + "step": 14840 + }, + { + "epoch": 1.6298045244893475, + "grad_norm": 1.736979365348816, + "learning_rate": 5e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.734093427658081, + "num_tokens": 384075878.0, + "step": 14841 + }, + { + "epoch": 1.6299143421919613, + "grad_norm": 1.929164171218872, + "learning_rate": 5e-06, + "loss": 0.7821, + "mean_token_accuracy": 0.7537868618965149, + "num_tokens": 384099285.0, + "step": 14842 + }, + { + "epoch": 1.630024159894575, + "grad_norm": 2.0201828479766846, + "learning_rate": 5e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7351036667823792, + "num_tokens": 384122012.0, + "step": 14843 + }, + { + "epoch": 1.6301339775971888, + "grad_norm": 2.3371169567108154, + "learning_rate": 5e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7367376089096069, + "num_tokens": 384141230.0, + "step": 14844 + }, + { + "epoch": 1.6302437952998023, + "grad_norm": 1.8842408657073975, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7399004697799683, + "num_tokens": 384166471.0, + "step": 14845 + }, + { + "epoch": 1.6303536130024159, + "grad_norm": 1.7843286991119385, + "learning_rate": 5e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7152135968208313, + "num_tokens": 384195944.0, + "step": 14846 + }, + { + "epoch": 1.6304634307050296, + "grad_norm": 1.7146062850952148, + "learning_rate": 5e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7356835007667542, + "num_tokens": 384224844.0, + "step": 14847 + }, + { + "epoch": 1.6305732484076434, + "grad_norm": 1.8966373205184937, + "learning_rate": 5e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7007489800453186, + "num_tokens": 384249793.0, + "step": 14848 + }, + { + "epoch": 1.630683066110257, + "grad_norm": 2.074566125869751, + "learning_rate": 5e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7377157211303711, + "num_tokens": 384268956.0, + "step": 14849 + }, + { + "epoch": 1.6307928838128707, + "grad_norm": 1.9448637962341309, + "learning_rate": 5e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7379546761512756, + "num_tokens": 384290250.0, + "step": 14850 + }, + { + "epoch": 1.6309027015154842, + "grad_norm": 1.8388193845748901, + "learning_rate": 5e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7725110054016113, + "num_tokens": 384313297.0, + "step": 14851 + }, + { + "epoch": 1.631012519218098, + "grad_norm": 1.7577414512634277, + "learning_rate": 5e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7311422824859619, + "num_tokens": 384339722.0, + "step": 14852 + }, + { + "epoch": 1.6311223369207117, + "grad_norm": 1.7942266464233398, + "learning_rate": 5e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7341927886009216, + "num_tokens": 384366128.0, + "step": 14853 + }, + { + "epoch": 1.6312321546233253, + "grad_norm": 1.7394191026687622, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.737153947353363, + "num_tokens": 384392674.0, + "step": 14854 + }, + { + "epoch": 1.6313419723259388, + "grad_norm": 1.676931381225586, + "learning_rate": 5e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7071787118911743, + "num_tokens": 384421955.0, + "step": 14855 + }, + { + "epoch": 1.6314517900285526, + "grad_norm": 2.15861177444458, + "learning_rate": 5e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7364000082015991, + "num_tokens": 384441873.0, + "step": 14856 + }, + { + "epoch": 1.6315616077311663, + "grad_norm": 1.9364798069000244, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7473577857017517, + "num_tokens": 384464871.0, + "step": 14857 + }, + { + "epoch": 1.63167142543378, + "grad_norm": 1.8765558004379272, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.74638831615448, + "num_tokens": 384489253.0, + "step": 14858 + }, + { + "epoch": 1.6317812431363936, + "grad_norm": 1.6974802017211914, + "learning_rate": 5e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7150459289550781, + "num_tokens": 384520103.0, + "step": 14859 + }, + { + "epoch": 1.6318910608390071, + "grad_norm": 1.9894379377365112, + "learning_rate": 5e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7385225892066956, + "num_tokens": 384542223.0, + "step": 14860 + }, + { + "epoch": 1.632000878541621, + "grad_norm": 1.9353028535842896, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7587652206420898, + "num_tokens": 384563142.0, + "step": 14861 + }, + { + "epoch": 1.6321106962442347, + "grad_norm": 1.770959734916687, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7330070734024048, + "num_tokens": 384590173.0, + "step": 14862 + }, + { + "epoch": 1.6322205139468482, + "grad_norm": 1.898478388786316, + "learning_rate": 5e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7370362281799316, + "num_tokens": 384614626.0, + "step": 14863 + }, + { + "epoch": 1.6323303316494617, + "grad_norm": 2.0644965171813965, + "learning_rate": 5e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7325693964958191, + "num_tokens": 384636761.0, + "step": 14864 + }, + { + "epoch": 1.6324401493520755, + "grad_norm": 2.151263952255249, + "learning_rate": 5e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7292137145996094, + "num_tokens": 384656798.0, + "step": 14865 + }, + { + "epoch": 1.6325499670546892, + "grad_norm": 1.8981231451034546, + "learning_rate": 5e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7335196733474731, + "num_tokens": 384680474.0, + "step": 14866 + }, + { + "epoch": 1.632659784757303, + "grad_norm": 1.858469009399414, + "learning_rate": 5e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7211931943893433, + "num_tokens": 384706091.0, + "step": 14867 + }, + { + "epoch": 1.6327696024599165, + "grad_norm": 1.8919581174850464, + "learning_rate": 5e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7557447552680969, + "num_tokens": 384729559.0, + "step": 14868 + }, + { + "epoch": 1.63287942016253, + "grad_norm": 2.013648748397827, + "learning_rate": 5e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7358410358428955, + "num_tokens": 384751822.0, + "step": 14869 + }, + { + "epoch": 1.6329892378651438, + "grad_norm": 1.5442774295806885, + "learning_rate": 5e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7307344675064087, + "num_tokens": 384785603.0, + "step": 14870 + }, + { + "epoch": 1.6330990555677576, + "grad_norm": 1.8970321416854858, + "learning_rate": 5e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7217164635658264, + "num_tokens": 384808740.0, + "step": 14871 + }, + { + "epoch": 1.6332088732703713, + "grad_norm": 1.8291759490966797, + "learning_rate": 5e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7458945512771606, + "num_tokens": 384834645.0, + "step": 14872 + }, + { + "epoch": 1.6333186909729849, + "grad_norm": 1.757959008216858, + "learning_rate": 5e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7296090126037598, + "num_tokens": 384861738.0, + "step": 14873 + }, + { + "epoch": 1.6334285086755984, + "grad_norm": 1.8869812488555908, + "learning_rate": 5e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7380682229995728, + "num_tokens": 384888466.0, + "step": 14874 + }, + { + "epoch": 1.6335383263782122, + "grad_norm": 1.5184357166290283, + "learning_rate": 5e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7010127305984497, + "num_tokens": 384927595.0, + "step": 14875 + }, + { + "epoch": 1.633648144080826, + "grad_norm": 2.0018062591552734, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7189632654190063, + "num_tokens": 384950704.0, + "step": 14876 + }, + { + "epoch": 1.6337579617834395, + "grad_norm": 1.9243133068084717, + "learning_rate": 5e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.726525604724884, + "num_tokens": 384975049.0, + "step": 14877 + }, + { + "epoch": 1.633867779486053, + "grad_norm": 2.087279796600342, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7511723041534424, + "num_tokens": 384996835.0, + "step": 14878 + }, + { + "epoch": 1.6339775971886668, + "grad_norm": 1.7129273414611816, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7191509008407593, + "num_tokens": 385027540.0, + "step": 14879 + }, + { + "epoch": 1.6340874148912805, + "grad_norm": 1.8808244466781616, + "learning_rate": 5e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7269055247306824, + "num_tokens": 385053315.0, + "step": 14880 + }, + { + "epoch": 1.6341972325938943, + "grad_norm": 1.8269234895706177, + "learning_rate": 5e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7375489473342896, + "num_tokens": 385078595.0, + "step": 14881 + }, + { + "epoch": 1.6343070502965078, + "grad_norm": 1.894212245941162, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7300182580947876, + "num_tokens": 385103197.0, + "step": 14882 + }, + { + "epoch": 1.6344168679991213, + "grad_norm": 1.8378649950027466, + "learning_rate": 5e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7220878601074219, + "num_tokens": 385130393.0, + "step": 14883 + }, + { + "epoch": 1.634526685701735, + "grad_norm": 1.8760570287704468, + "learning_rate": 5e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7310611605644226, + "num_tokens": 385154339.0, + "step": 14884 + }, + { + "epoch": 1.6346365034043489, + "grad_norm": 1.5433584451675415, + "learning_rate": 5e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7389308214187622, + "num_tokens": 385186042.0, + "step": 14885 + }, + { + "epoch": 1.6347463211069626, + "grad_norm": 1.7197781801223755, + "learning_rate": 5e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7078319787979126, + "num_tokens": 385216889.0, + "step": 14886 + }, + { + "epoch": 1.6348561388095761, + "grad_norm": 1.95335054397583, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7181560397148132, + "num_tokens": 385242055.0, + "step": 14887 + }, + { + "epoch": 1.6349659565121897, + "grad_norm": 1.9883216619491577, + "learning_rate": 5e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7519316673278809, + "num_tokens": 385262089.0, + "step": 14888 + }, + { + "epoch": 1.6350757742148034, + "grad_norm": 1.7514301538467407, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7516816258430481, + "num_tokens": 385289805.0, + "step": 14889 + }, + { + "epoch": 1.6351855919174172, + "grad_norm": 1.9519004821777344, + "learning_rate": 5e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7236013412475586, + "num_tokens": 385314769.0, + "step": 14890 + }, + { + "epoch": 1.6352954096200307, + "grad_norm": 1.643377661705017, + "learning_rate": 5e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7184790372848511, + "num_tokens": 385345596.0, + "step": 14891 + }, + { + "epoch": 1.6354052273226443, + "grad_norm": 1.7754133939743042, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7438303232192993, + "num_tokens": 385372705.0, + "step": 14892 + }, + { + "epoch": 1.635515045025258, + "grad_norm": 1.8525371551513672, + "learning_rate": 5e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7554173469543457, + "num_tokens": 385395106.0, + "step": 14893 + }, + { + "epoch": 1.6356248627278718, + "grad_norm": 1.8528168201446533, + "learning_rate": 5e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7150720953941345, + "num_tokens": 385421303.0, + "step": 14894 + }, + { + "epoch": 1.6357346804304855, + "grad_norm": 1.8319960832595825, + "learning_rate": 5e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7519824504852295, + "num_tokens": 385445315.0, + "step": 14895 + }, + { + "epoch": 1.635844498133099, + "grad_norm": 1.8560658693313599, + "learning_rate": 5e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7200416326522827, + "num_tokens": 385473329.0, + "step": 14896 + }, + { + "epoch": 1.6359543158357126, + "grad_norm": 1.9365631341934204, + "learning_rate": 5e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7510759830474854, + "num_tokens": 385497141.0, + "step": 14897 + }, + { + "epoch": 1.6360641335383264, + "grad_norm": 1.9170829057693481, + "learning_rate": 5e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.740982174873352, + "num_tokens": 385519822.0, + "step": 14898 + }, + { + "epoch": 1.6361739512409401, + "grad_norm": 1.9192966222763062, + "learning_rate": 5e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7058940529823303, + "num_tokens": 385546422.0, + "step": 14899 + }, + { + "epoch": 1.6362837689435537, + "grad_norm": 1.8247863054275513, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7303881049156189, + "num_tokens": 385572329.0, + "step": 14900 + }, + { + "epoch": 1.6363935866461674, + "grad_norm": 1.800990104675293, + "learning_rate": 5e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7619494199752808, + "num_tokens": 385596774.0, + "step": 14901 + }, + { + "epoch": 1.636503404348781, + "grad_norm": 1.8196163177490234, + "learning_rate": 5e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7339845895767212, + "num_tokens": 385623235.0, + "step": 14902 + }, + { + "epoch": 1.6366132220513947, + "grad_norm": 1.8769078254699707, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7152004837989807, + "num_tokens": 385651191.0, + "step": 14903 + }, + { + "epoch": 1.6367230397540085, + "grad_norm": 2.1484341621398926, + "learning_rate": 5e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7470066547393799, + "num_tokens": 385670909.0, + "step": 14904 + }, + { + "epoch": 1.636832857456622, + "grad_norm": 2.3152689933776855, + "learning_rate": 5e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7349302768707275, + "num_tokens": 385690109.0, + "step": 14905 + }, + { + "epoch": 1.6369426751592355, + "grad_norm": 1.9336268901824951, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.731400728225708, + "num_tokens": 385713552.0, + "step": 14906 + }, + { + "epoch": 1.6370524928618493, + "grad_norm": 1.7249263525009155, + "learning_rate": 5e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7511304020881653, + "num_tokens": 385740089.0, + "step": 14907 + }, + { + "epoch": 1.637162310564463, + "grad_norm": 1.9652742147445679, + "learning_rate": 5e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7314375638961792, + "num_tokens": 385763748.0, + "step": 14908 + }, + { + "epoch": 1.6372721282670768, + "grad_norm": 1.778061866760254, + "learning_rate": 5e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7066121101379395, + "num_tokens": 385791905.0, + "step": 14909 + }, + { + "epoch": 1.6373819459696903, + "grad_norm": 1.9569579362869263, + "learning_rate": 5e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7456334233283997, + "num_tokens": 385814981.0, + "step": 14910 + }, + { + "epoch": 1.6374917636723039, + "grad_norm": 1.9748783111572266, + "learning_rate": 5e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7382866144180298, + "num_tokens": 385838374.0, + "step": 14911 + }, + { + "epoch": 1.6376015813749176, + "grad_norm": 1.9036387205123901, + "learning_rate": 5e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7434121966362, + "num_tokens": 385862462.0, + "step": 14912 + }, + { + "epoch": 1.6377113990775314, + "grad_norm": 1.911171555519104, + "learning_rate": 5e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7152482271194458, + "num_tokens": 385892038.0, + "step": 14913 + }, + { + "epoch": 1.637821216780145, + "grad_norm": 1.8177456855773926, + "learning_rate": 5e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7011585235595703, + "num_tokens": 385922553.0, + "step": 14914 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 2.268508195877075, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7310321927070618, + "num_tokens": 385942999.0, + "step": 14915 + }, + { + "epoch": 1.6380408521853722, + "grad_norm": 1.8069851398468018, + "learning_rate": 5e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7237145900726318, + "num_tokens": 385971217.0, + "step": 14916 + }, + { + "epoch": 1.638150669887986, + "grad_norm": 1.829365611076355, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.724594235420227, + "num_tokens": 385996625.0, + "step": 14917 + }, + { + "epoch": 1.6382604875905997, + "grad_norm": 2.170989751815796, + "learning_rate": 5e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7219676971435547, + "num_tokens": 386018202.0, + "step": 14918 + }, + { + "epoch": 1.6383703052932133, + "grad_norm": 1.8779908418655396, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7325918674468994, + "num_tokens": 386043368.0, + "step": 14919 + }, + { + "epoch": 1.6384801229958268, + "grad_norm": 1.9483017921447754, + "learning_rate": 5e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7301207780838013, + "num_tokens": 386066478.0, + "step": 14920 + }, + { + "epoch": 1.6385899406984406, + "grad_norm": 1.7587230205535889, + "learning_rate": 5e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7469461560249329, + "num_tokens": 386093015.0, + "step": 14921 + }, + { + "epoch": 1.6386997584010543, + "grad_norm": 1.9688669443130493, + "learning_rate": 5e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7475138902664185, + "num_tokens": 386114983.0, + "step": 14922 + }, + { + "epoch": 1.638809576103668, + "grad_norm": 1.9576083421707153, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7625295519828796, + "num_tokens": 386138113.0, + "step": 14923 + }, + { + "epoch": 1.6389193938062816, + "grad_norm": 1.7535191774368286, + "learning_rate": 5e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7499185800552368, + "num_tokens": 386164728.0, + "step": 14924 + }, + { + "epoch": 1.6390292115088951, + "grad_norm": 1.89975106716156, + "learning_rate": 5e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7206448316574097, + "num_tokens": 386189383.0, + "step": 14925 + }, + { + "epoch": 1.639139029211509, + "grad_norm": 2.017092227935791, + "learning_rate": 5e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7422277331352234, + "num_tokens": 386212783.0, + "step": 14926 + }, + { + "epoch": 1.6392488469141226, + "grad_norm": 1.7232136726379395, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7481205463409424, + "num_tokens": 386239323.0, + "step": 14927 + }, + { + "epoch": 1.6393586646167362, + "grad_norm": 1.8048183917999268, + "learning_rate": 5e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7197083830833435, + "num_tokens": 386265268.0, + "step": 14928 + }, + { + "epoch": 1.6394684823193497, + "grad_norm": 1.8051331043243408, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7352222204208374, + "num_tokens": 386290650.0, + "step": 14929 + }, + { + "epoch": 1.6395783000219635, + "grad_norm": 1.826013445854187, + "learning_rate": 5e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7429619431495667, + "num_tokens": 386315671.0, + "step": 14930 + }, + { + "epoch": 1.6396881177245772, + "grad_norm": 1.8924959897994995, + "learning_rate": 5e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.710076630115509, + "num_tokens": 386339954.0, + "step": 14931 + }, + { + "epoch": 1.639797935427191, + "grad_norm": 1.909816026687622, + "learning_rate": 5e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.7456044554710388, + "num_tokens": 386364032.0, + "step": 14932 + }, + { + "epoch": 1.6399077531298045, + "grad_norm": 2.1461551189422607, + "learning_rate": 5e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7554556131362915, + "num_tokens": 386382966.0, + "step": 14933 + }, + { + "epoch": 1.640017570832418, + "grad_norm": 1.991256594657898, + "learning_rate": 5e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.746764063835144, + "num_tokens": 386402697.0, + "step": 14934 + }, + { + "epoch": 1.6401273885350318, + "grad_norm": 1.7384006977081299, + "learning_rate": 5e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7354085445404053, + "num_tokens": 386430935.0, + "step": 14935 + }, + { + "epoch": 1.6402372062376456, + "grad_norm": 1.8880671262741089, + "learning_rate": 5e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7412540912628174, + "num_tokens": 386453961.0, + "step": 14936 + }, + { + "epoch": 1.6403470239402593, + "grad_norm": 1.7113434076309204, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7252554893493652, + "num_tokens": 386484210.0, + "step": 14937 + }, + { + "epoch": 1.6404568416428729, + "grad_norm": 1.7053544521331787, + "learning_rate": 5e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7554911375045776, + "num_tokens": 386512785.0, + "step": 14938 + }, + { + "epoch": 1.6405666593454864, + "grad_norm": 1.902777075767517, + "learning_rate": 5e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7563695907592773, + "num_tokens": 386534608.0, + "step": 14939 + }, + { + "epoch": 1.6406764770481002, + "grad_norm": 1.8788728713989258, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.73298180103302, + "num_tokens": 386559736.0, + "step": 14940 + }, + { + "epoch": 1.640786294750714, + "grad_norm": 1.8333882093429565, + "learning_rate": 5e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7225123643875122, + "num_tokens": 386584676.0, + "step": 14941 + }, + { + "epoch": 1.6408961124533274, + "grad_norm": 1.760014295578003, + "learning_rate": 5e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7220261096954346, + "num_tokens": 386614124.0, + "step": 14942 + }, + { + "epoch": 1.641005930155941, + "grad_norm": 1.9676707983016968, + "learning_rate": 5e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.737591028213501, + "num_tokens": 386636723.0, + "step": 14943 + }, + { + "epoch": 1.6411157478585547, + "grad_norm": 1.5978304147720337, + "learning_rate": 5e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7254840731620789, + "num_tokens": 386669612.0, + "step": 14944 + }, + { + "epoch": 1.6412255655611685, + "grad_norm": 1.7679047584533691, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7302074432373047, + "num_tokens": 386695459.0, + "step": 14945 + }, + { + "epoch": 1.6413353832637823, + "grad_norm": 1.814003586769104, + "learning_rate": 5e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7010582685470581, + "num_tokens": 386724134.0, + "step": 14946 + }, + { + "epoch": 1.6414452009663958, + "grad_norm": 1.6765087842941284, + "learning_rate": 5e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7193549871444702, + "num_tokens": 386753824.0, + "step": 14947 + }, + { + "epoch": 1.6415550186690093, + "grad_norm": 2.111236333847046, + "learning_rate": 5e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7356780767440796, + "num_tokens": 386774814.0, + "step": 14948 + }, + { + "epoch": 1.641664836371623, + "grad_norm": 1.7378740310668945, + "learning_rate": 5e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7529770135879517, + "num_tokens": 386804504.0, + "step": 14949 + }, + { + "epoch": 1.6417746540742368, + "grad_norm": 1.8524892330169678, + "learning_rate": 5e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7315549850463867, + "num_tokens": 386829785.0, + "step": 14950 + }, + { + "epoch": 1.6418844717768504, + "grad_norm": 1.8484705686569214, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7469661235809326, + "num_tokens": 386855101.0, + "step": 14951 + }, + { + "epoch": 1.6419942894794641, + "grad_norm": 1.7097223997116089, + "learning_rate": 5e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7080141305923462, + "num_tokens": 386887857.0, + "step": 14952 + }, + { + "epoch": 1.6421041071820777, + "grad_norm": 1.6710076332092285, + "learning_rate": 5e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7369670867919922, + "num_tokens": 386916712.0, + "step": 14953 + }, + { + "epoch": 1.6422139248846914, + "grad_norm": 1.6761837005615234, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7362688779830933, + "num_tokens": 386944403.0, + "step": 14954 + }, + { + "epoch": 1.6423237425873052, + "grad_norm": 1.9458742141723633, + "learning_rate": 5e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7163175940513611, + "num_tokens": 386969956.0, + "step": 14955 + }, + { + "epoch": 1.6424335602899187, + "grad_norm": 1.7753089666366577, + "learning_rate": 5e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7526734471321106, + "num_tokens": 386995233.0, + "step": 14956 + }, + { + "epoch": 1.6425433779925322, + "grad_norm": 1.7512894868850708, + "learning_rate": 5e-06, + "loss": 0.7578, + "mean_token_accuracy": 0.7551662921905518, + "num_tokens": 387018971.0, + "step": 14957 + }, + { + "epoch": 1.642653195695146, + "grad_norm": 1.9872325658798218, + "learning_rate": 5e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.736069917678833, + "num_tokens": 387040294.0, + "step": 14958 + }, + { + "epoch": 1.6427630133977598, + "grad_norm": 1.660887598991394, + "learning_rate": 5e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7241254448890686, + "num_tokens": 387069993.0, + "step": 14959 + }, + { + "epoch": 1.6428728311003735, + "grad_norm": 1.8584132194519043, + "learning_rate": 5e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7526372671127319, + "num_tokens": 387092443.0, + "step": 14960 + }, + { + "epoch": 1.642982648802987, + "grad_norm": 1.7951146364212036, + "learning_rate": 5e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.710142195224762, + "num_tokens": 387122040.0, + "step": 14961 + }, + { + "epoch": 1.6430924665056006, + "grad_norm": 1.7824333906173706, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7199309468269348, + "num_tokens": 387148482.0, + "step": 14962 + }, + { + "epoch": 1.6432022842082143, + "grad_norm": 2.0625290870666504, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7439348697662354, + "num_tokens": 387168312.0, + "step": 14963 + }, + { + "epoch": 1.643312101910828, + "grad_norm": 1.7465764284133911, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7262730598449707, + "num_tokens": 387195841.0, + "step": 14964 + }, + { + "epoch": 1.6434219196134416, + "grad_norm": 1.8897547721862793, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7423363924026489, + "num_tokens": 387219322.0, + "step": 14965 + }, + { + "epoch": 1.6435317373160554, + "grad_norm": 1.7809122800827026, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7500171065330505, + "num_tokens": 387244541.0, + "step": 14966 + }, + { + "epoch": 1.643641555018669, + "grad_norm": 1.7490081787109375, + "learning_rate": 5e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7393398284912109, + "num_tokens": 387271757.0, + "step": 14967 + }, + { + "epoch": 1.6437513727212827, + "grad_norm": 1.915865421295166, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7633958458900452, + "num_tokens": 387293496.0, + "step": 14968 + }, + { + "epoch": 1.6438611904238964, + "grad_norm": 2.1282801628112793, + "learning_rate": 5e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7314169406890869, + "num_tokens": 387314925.0, + "step": 14969 + }, + { + "epoch": 1.64397100812651, + "grad_norm": 1.969388723373413, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7464468479156494, + "num_tokens": 387338824.0, + "step": 14970 + }, + { + "epoch": 1.6440808258291235, + "grad_norm": 1.7537946701049805, + "learning_rate": 5e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.741804838180542, + "num_tokens": 387366329.0, + "step": 14971 + }, + { + "epoch": 1.6441906435317373, + "grad_norm": 2.1279473304748535, + "learning_rate": 5e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7457299828529358, + "num_tokens": 387388046.0, + "step": 14972 + }, + { + "epoch": 1.644300461234351, + "grad_norm": 2.0824966430664062, + "learning_rate": 5e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7301923036575317, + "num_tokens": 387410220.0, + "step": 14973 + }, + { + "epoch": 1.6444102789369648, + "grad_norm": 2.176546812057495, + "learning_rate": 5e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7460073232650757, + "num_tokens": 387429173.0, + "step": 14974 + }, + { + "epoch": 1.6445200966395783, + "grad_norm": 1.8446953296661377, + "learning_rate": 5e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7358483076095581, + "num_tokens": 387458312.0, + "step": 14975 + }, + { + "epoch": 1.6446299143421919, + "grad_norm": 1.778296947479248, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7256230115890503, + "num_tokens": 387485218.0, + "step": 14976 + }, + { + "epoch": 1.6447397320448056, + "grad_norm": 1.664526343345642, + "learning_rate": 5e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7023108005523682, + "num_tokens": 387516710.0, + "step": 14977 + }, + { + "epoch": 1.6448495497474194, + "grad_norm": 2.02111554145813, + "learning_rate": 5e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7515509724617004, + "num_tokens": 387537366.0, + "step": 14978 + }, + { + "epoch": 1.644959367450033, + "grad_norm": 1.9216575622558594, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7163963913917542, + "num_tokens": 387563051.0, + "step": 14979 + }, + { + "epoch": 1.6450691851526464, + "grad_norm": 1.7891381978988647, + "learning_rate": 5e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7119125127792358, + "num_tokens": 387590040.0, + "step": 14980 + }, + { + "epoch": 1.6451790028552602, + "grad_norm": 1.495081901550293, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7173382639884949, + "num_tokens": 387624251.0, + "step": 14981 + }, + { + "epoch": 1.645288820557874, + "grad_norm": 1.675998330116272, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7227162718772888, + "num_tokens": 387652737.0, + "step": 14982 + }, + { + "epoch": 1.6453986382604877, + "grad_norm": 1.7814114093780518, + "learning_rate": 5e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7261121273040771, + "num_tokens": 387680887.0, + "step": 14983 + }, + { + "epoch": 1.6455084559631012, + "grad_norm": 1.5704445838928223, + "learning_rate": 5e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7331188321113586, + "num_tokens": 387716577.0, + "step": 14984 + }, + { + "epoch": 1.6456182736657148, + "grad_norm": 2.0127828121185303, + "learning_rate": 5e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7504367828369141, + "num_tokens": 387738103.0, + "step": 14985 + }, + { + "epoch": 1.6457280913683285, + "grad_norm": 1.7425411939620972, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7178386449813843, + "num_tokens": 387766559.0, + "step": 14986 + }, + { + "epoch": 1.6458379090709423, + "grad_norm": 1.8668028116226196, + "learning_rate": 5e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7393457889556885, + "num_tokens": 387790176.0, + "step": 14987 + }, + { + "epoch": 1.645947726773556, + "grad_norm": 1.7881981134414673, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7532137632369995, + "num_tokens": 387813772.0, + "step": 14988 + }, + { + "epoch": 1.6460575444761696, + "grad_norm": 1.963567852973938, + "learning_rate": 5e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7332020998001099, + "num_tokens": 387836904.0, + "step": 14989 + }, + { + "epoch": 1.6461673621787831, + "grad_norm": 1.9246149063110352, + "learning_rate": 5e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7229465246200562, + "num_tokens": 387863284.0, + "step": 14990 + }, + { + "epoch": 1.6462771798813969, + "grad_norm": 1.78090500831604, + "learning_rate": 5e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.733223021030426, + "num_tokens": 387888865.0, + "step": 14991 + }, + { + "epoch": 1.6463869975840106, + "grad_norm": 1.9912495613098145, + "learning_rate": 5e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.75448077917099, + "num_tokens": 387910597.0, + "step": 14992 + }, + { + "epoch": 1.6464968152866242, + "grad_norm": 1.8074462413787842, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7402709722518921, + "num_tokens": 387935733.0, + "step": 14993 + }, + { + "epoch": 1.6466066329892377, + "grad_norm": 1.8499196767807007, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7264333963394165, + "num_tokens": 387961520.0, + "step": 14994 + }, + { + "epoch": 1.6467164506918515, + "grad_norm": 1.834091067314148, + "learning_rate": 5e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7551783323287964, + "num_tokens": 387985364.0, + "step": 14995 + }, + { + "epoch": 1.6468262683944652, + "grad_norm": 1.9043079614639282, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7171188592910767, + "num_tokens": 388011150.0, + "step": 14996 + }, + { + "epoch": 1.646936086097079, + "grad_norm": 1.7793408632278442, + "learning_rate": 5e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7177435159683228, + "num_tokens": 388041528.0, + "step": 14997 + }, + { + "epoch": 1.6470459037996925, + "grad_norm": 1.7055037021636963, + "learning_rate": 5e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7374249696731567, + "num_tokens": 388070406.0, + "step": 14998 + }, + { + "epoch": 1.647155721502306, + "grad_norm": 1.6992567777633667, + "learning_rate": 5e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7281382083892822, + "num_tokens": 388099206.0, + "step": 14999 + }, + { + "epoch": 1.6472655392049198, + "grad_norm": 1.7285765409469604, + "learning_rate": 5e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7201329469680786, + "num_tokens": 388127256.0, + "step": 15000 + }, + { + "epoch": 1.6473753569075336, + "grad_norm": 1.7590047121047974, + "learning_rate": 5e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7447789907455444, + "num_tokens": 388154411.0, + "step": 15001 + }, + { + "epoch": 1.6474851746101473, + "grad_norm": 1.6212074756622314, + "learning_rate": 5e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.743035078048706, + "num_tokens": 388185494.0, + "step": 15002 + }, + { + "epoch": 1.6475949923127609, + "grad_norm": 1.7560099363327026, + "learning_rate": 5e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7236586213111877, + "num_tokens": 388214135.0, + "step": 15003 + }, + { + "epoch": 1.6477048100153744, + "grad_norm": 1.6204066276550293, + "learning_rate": 5e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7252947092056274, + "num_tokens": 388244424.0, + "step": 15004 + }, + { + "epoch": 1.6478146277179881, + "grad_norm": 2.0806682109832764, + "learning_rate": 5e-06, + "loss": 0.814, + "mean_token_accuracy": 0.7384399175643921, + "num_tokens": 388265117.0, + "step": 15005 + }, + { + "epoch": 1.647924445420602, + "grad_norm": 1.8710453510284424, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7196225523948669, + "num_tokens": 388288580.0, + "step": 15006 + }, + { + "epoch": 1.6480342631232154, + "grad_norm": 1.715725302696228, + "learning_rate": 5e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7417048215866089, + "num_tokens": 388316940.0, + "step": 15007 + }, + { + "epoch": 1.648144080825829, + "grad_norm": 1.9296907186508179, + "learning_rate": 5e-06, + "loss": 0.855, + "mean_token_accuracy": 0.732123851776123, + "num_tokens": 388339779.0, + "step": 15008 + }, + { + "epoch": 1.6482538985284427, + "grad_norm": 1.8936760425567627, + "learning_rate": 5e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.720582902431488, + "num_tokens": 388364678.0, + "step": 15009 + }, + { + "epoch": 1.6483637162310565, + "grad_norm": 1.9857035875320435, + "learning_rate": 5e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.714911937713623, + "num_tokens": 388388092.0, + "step": 15010 + }, + { + "epoch": 1.6484735339336702, + "grad_norm": 1.8276824951171875, + "learning_rate": 5e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7320320010185242, + "num_tokens": 388412895.0, + "step": 15011 + }, + { + "epoch": 1.6485833516362838, + "grad_norm": 2.034797430038452, + "learning_rate": 5e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7434134483337402, + "num_tokens": 388437190.0, + "step": 15012 + }, + { + "epoch": 1.6486931693388973, + "grad_norm": 1.7190895080566406, + "learning_rate": 5e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7290929555892944, + "num_tokens": 388467794.0, + "step": 15013 + }, + { + "epoch": 1.648802987041511, + "grad_norm": 1.8404542207717896, + "learning_rate": 5e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7141314744949341, + "num_tokens": 388493538.0, + "step": 15014 + }, + { + "epoch": 1.6489128047441248, + "grad_norm": 1.8844231367111206, + "learning_rate": 5e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7032469511032104, + "num_tokens": 388519835.0, + "step": 15015 + }, + { + "epoch": 1.6490226224467384, + "grad_norm": 1.735012412071228, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7233947515487671, + "num_tokens": 388547763.0, + "step": 15016 + }, + { + "epoch": 1.6491324401493521, + "grad_norm": 1.7758334875106812, + "learning_rate": 5e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7442392110824585, + "num_tokens": 388572460.0, + "step": 15017 + }, + { + "epoch": 1.6492422578519657, + "grad_norm": 1.840230941772461, + "learning_rate": 5e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7257989645004272, + "num_tokens": 388597188.0, + "step": 15018 + }, + { + "epoch": 1.6493520755545794, + "grad_norm": 1.7372804880142212, + "learning_rate": 5e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.749191403388977, + "num_tokens": 388625261.0, + "step": 15019 + }, + { + "epoch": 1.6494618932571932, + "grad_norm": 1.7493046522140503, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.730136513710022, + "num_tokens": 388650814.0, + "step": 15020 + }, + { + "epoch": 1.6495717109598067, + "grad_norm": 1.9437285661697388, + "learning_rate": 5e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.729759693145752, + "num_tokens": 388673746.0, + "step": 15021 + }, + { + "epoch": 1.6496815286624202, + "grad_norm": 1.9628231525421143, + "learning_rate": 5e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.6950734257698059, + "num_tokens": 388701275.0, + "step": 15022 + }, + { + "epoch": 1.649791346365034, + "grad_norm": 1.682274341583252, + "learning_rate": 5e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7176510691642761, + "num_tokens": 388732667.0, + "step": 15023 + }, + { + "epoch": 1.6499011640676478, + "grad_norm": 1.7558259963989258, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7335650324821472, + "num_tokens": 388759651.0, + "step": 15024 + }, + { + "epoch": 1.6500109817702615, + "grad_norm": 2.063650608062744, + "learning_rate": 5e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7326786518096924, + "num_tokens": 388782130.0, + "step": 15025 + }, + { + "epoch": 1.650120799472875, + "grad_norm": 1.842969298362732, + "learning_rate": 5e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.720366895198822, + "num_tokens": 388809073.0, + "step": 15026 + }, + { + "epoch": 1.6502306171754886, + "grad_norm": 1.62446928024292, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7235978841781616, + "num_tokens": 388841798.0, + "step": 15027 + }, + { + "epoch": 1.6503404348781023, + "grad_norm": 1.8750516176223755, + "learning_rate": 5e-06, + "loss": 0.691, + "mean_token_accuracy": 0.7728759050369263, + "num_tokens": 388863059.0, + "step": 15028 + }, + { + "epoch": 1.650450252580716, + "grad_norm": 1.7640533447265625, + "learning_rate": 5e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7198607325553894, + "num_tokens": 388890867.0, + "step": 15029 + }, + { + "epoch": 1.6505600702833296, + "grad_norm": 1.8577165603637695, + "learning_rate": 5e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7573592662811279, + "num_tokens": 388912931.0, + "step": 15030 + }, + { + "epoch": 1.6506698879859434, + "grad_norm": 1.848988652229309, + "learning_rate": 5e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7478703260421753, + "num_tokens": 388937510.0, + "step": 15031 + }, + { + "epoch": 1.650779705688557, + "grad_norm": 2.0721628665924072, + "learning_rate": 5e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7604480385780334, + "num_tokens": 388957507.0, + "step": 15032 + }, + { + "epoch": 1.6508895233911707, + "grad_norm": 2.071739435195923, + "learning_rate": 5e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7404263019561768, + "num_tokens": 388978777.0, + "step": 15033 + }, + { + "epoch": 1.6509993410937844, + "grad_norm": 2.020685911178589, + "learning_rate": 5e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7314100861549377, + "num_tokens": 389001654.0, + "step": 15034 + }, + { + "epoch": 1.651109158796398, + "grad_norm": 1.787304401397705, + "learning_rate": 5e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7583293914794922, + "num_tokens": 389025654.0, + "step": 15035 + }, + { + "epoch": 1.6512189764990115, + "grad_norm": 1.826378583908081, + "learning_rate": 5e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7308791875839233, + "num_tokens": 389051571.0, + "step": 15036 + }, + { + "epoch": 1.6513287942016253, + "grad_norm": 1.7468078136444092, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.713615894317627, + "num_tokens": 389080304.0, + "step": 15037 + }, + { + "epoch": 1.651438611904239, + "grad_norm": 1.7625885009765625, + "learning_rate": 5e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7550272941589355, + "num_tokens": 389105684.0, + "step": 15038 + }, + { + "epoch": 1.6515484296068528, + "grad_norm": 1.902160406112671, + "learning_rate": 5e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7240501046180725, + "num_tokens": 389129572.0, + "step": 15039 + }, + { + "epoch": 1.6516582473094663, + "grad_norm": 2.0362510681152344, + "learning_rate": 5e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7283962368965149, + "num_tokens": 389151945.0, + "step": 15040 + }, + { + "epoch": 1.6517680650120798, + "grad_norm": 1.7744050025939941, + "learning_rate": 5e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7132470607757568, + "num_tokens": 389179808.0, + "step": 15041 + }, + { + "epoch": 1.6518778827146936, + "grad_norm": 1.988985300064087, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7451936602592468, + "num_tokens": 389203878.0, + "step": 15042 + }, + { + "epoch": 1.6519877004173074, + "grad_norm": 1.8718187808990479, + "learning_rate": 5e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7545114159584045, + "num_tokens": 389229124.0, + "step": 15043 + }, + { + "epoch": 1.652097518119921, + "grad_norm": 1.9019949436187744, + "learning_rate": 5e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.712805449962616, + "num_tokens": 389253004.0, + "step": 15044 + }, + { + "epoch": 1.6522073358225344, + "grad_norm": 1.714077115058899, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7197436094284058, + "num_tokens": 389283168.0, + "step": 15045 + }, + { + "epoch": 1.6523171535251482, + "grad_norm": 1.861768364906311, + "learning_rate": 5e-06, + "loss": 0.846, + "mean_token_accuracy": 0.730219841003418, + "num_tokens": 389306022.0, + "step": 15046 + }, + { + "epoch": 1.652426971227762, + "grad_norm": 1.873411774635315, + "learning_rate": 5e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7355687618255615, + "num_tokens": 389330061.0, + "step": 15047 + }, + { + "epoch": 1.6525367889303757, + "grad_norm": 1.6910769939422607, + "learning_rate": 5e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7400994300842285, + "num_tokens": 389356222.0, + "step": 15048 + }, + { + "epoch": 1.6526466066329892, + "grad_norm": 2.0232737064361572, + "learning_rate": 5e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7347537875175476, + "num_tokens": 389377341.0, + "step": 15049 + }, + { + "epoch": 1.6527564243356028, + "grad_norm": 1.6350808143615723, + "learning_rate": 5e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.701337456703186, + "num_tokens": 389408298.0, + "step": 15050 + }, + { + "epoch": 1.6528662420382165, + "grad_norm": 1.8387154340744019, + "learning_rate": 5e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7218921184539795, + "num_tokens": 389433346.0, + "step": 15051 + }, + { + "epoch": 1.6529760597408303, + "grad_norm": 2.214486598968506, + "learning_rate": 5e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.7472822070121765, + "num_tokens": 389453451.0, + "step": 15052 + }, + { + "epoch": 1.653085877443444, + "grad_norm": 1.808891773223877, + "learning_rate": 5e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.730278491973877, + "num_tokens": 389479546.0, + "step": 15053 + }, + { + "epoch": 1.6531956951460576, + "grad_norm": 1.8128223419189453, + "learning_rate": 5e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7427307367324829, + "num_tokens": 389505513.0, + "step": 15054 + }, + { + "epoch": 1.653305512848671, + "grad_norm": 2.0075159072875977, + "learning_rate": 5e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7410562038421631, + "num_tokens": 389526279.0, + "step": 15055 + }, + { + "epoch": 1.6534153305512849, + "grad_norm": 1.7117257118225098, + "learning_rate": 5e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7510548233985901, + "num_tokens": 389552131.0, + "step": 15056 + }, + { + "epoch": 1.6535251482538986, + "grad_norm": 1.8389556407928467, + "learning_rate": 5e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7267417907714844, + "num_tokens": 389577302.0, + "step": 15057 + }, + { + "epoch": 1.6536349659565122, + "grad_norm": 2.0674734115600586, + "learning_rate": 5e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7476081848144531, + "num_tokens": 389599285.0, + "step": 15058 + }, + { + "epoch": 1.6537447836591257, + "grad_norm": 1.784148931503296, + "learning_rate": 5e-06, + "loss": 0.819, + "mean_token_accuracy": 0.73745197057724, + "num_tokens": 389626982.0, + "step": 15059 + }, + { + "epoch": 1.6538546013617395, + "grad_norm": 1.8257222175598145, + "learning_rate": 5e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.769402265548706, + "num_tokens": 389649734.0, + "step": 15060 + }, + { + "epoch": 1.6539644190643532, + "grad_norm": 1.7707685232162476, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.73350590467453, + "num_tokens": 389678388.0, + "step": 15061 + }, + { + "epoch": 1.654074236766967, + "grad_norm": 1.7144266366958618, + "learning_rate": 5e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7407485842704773, + "num_tokens": 389705914.0, + "step": 15062 + }, + { + "epoch": 1.6541840544695805, + "grad_norm": 1.8726584911346436, + "learning_rate": 5e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7407354116439819, + "num_tokens": 389732127.0, + "step": 15063 + }, + { + "epoch": 1.654293872172194, + "grad_norm": 1.8177610635757446, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.732785165309906, + "num_tokens": 389757442.0, + "step": 15064 + }, + { + "epoch": 1.6544036898748078, + "grad_norm": 1.8616033792495728, + "learning_rate": 5e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7103295922279358, + "num_tokens": 389783737.0, + "step": 15065 + }, + { + "epoch": 1.6545135075774215, + "grad_norm": 1.7693462371826172, + "learning_rate": 5e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7382873296737671, + "num_tokens": 389809550.0, + "step": 15066 + }, + { + "epoch": 1.6546233252800353, + "grad_norm": 1.9174939393997192, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7292340993881226, + "num_tokens": 389834695.0, + "step": 15067 + }, + { + "epoch": 1.6547331429826488, + "grad_norm": 2.0156056880950928, + "learning_rate": 5e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7416700124740601, + "num_tokens": 389856431.0, + "step": 15068 + }, + { + "epoch": 1.6548429606852624, + "grad_norm": 1.7940539121627808, + "learning_rate": 5e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7150603532791138, + "num_tokens": 389883100.0, + "step": 15069 + }, + { + "epoch": 1.6549527783878761, + "grad_norm": 1.7669545412063599, + "learning_rate": 5e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7368956804275513, + "num_tokens": 389908885.0, + "step": 15070 + }, + { + "epoch": 1.65506259609049, + "grad_norm": 1.8319143056869507, + "learning_rate": 5e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7526976466178894, + "num_tokens": 389934779.0, + "step": 15071 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 1.7948837280273438, + "learning_rate": 5e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7287954092025757, + "num_tokens": 389960803.0, + "step": 15072 + }, + { + "epoch": 1.655282231495717, + "grad_norm": 1.9492175579071045, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.724031388759613, + "num_tokens": 389984452.0, + "step": 15073 + }, + { + "epoch": 1.6553920491983307, + "grad_norm": 1.7953304052352905, + "learning_rate": 5e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7114043235778809, + "num_tokens": 390011299.0, + "step": 15074 + }, + { + "epoch": 1.6555018669009445, + "grad_norm": 1.82699716091156, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7666575908660889, + "num_tokens": 390033865.0, + "step": 15075 + }, + { + "epoch": 1.6556116846035582, + "grad_norm": 1.7141464948654175, + "learning_rate": 5e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7235751748085022, + "num_tokens": 390065700.0, + "step": 15076 + }, + { + "epoch": 1.6557215023061718, + "grad_norm": 1.8698242902755737, + "learning_rate": 5e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.738654613494873, + "num_tokens": 390090871.0, + "step": 15077 + }, + { + "epoch": 1.6558313200087853, + "grad_norm": 1.6830592155456543, + "learning_rate": 5e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7392159104347229, + "num_tokens": 390122345.0, + "step": 15078 + }, + { + "epoch": 1.655941137711399, + "grad_norm": 1.6998381614685059, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7369767427444458, + "num_tokens": 390150633.0, + "step": 15079 + }, + { + "epoch": 1.6560509554140128, + "grad_norm": 1.7712986469268799, + "learning_rate": 5e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7304608225822449, + "num_tokens": 390179488.0, + "step": 15080 + }, + { + "epoch": 1.6561607731166264, + "grad_norm": 1.9942129850387573, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.735761284828186, + "num_tokens": 390203338.0, + "step": 15081 + }, + { + "epoch": 1.65627059081924, + "grad_norm": 1.7007722854614258, + "learning_rate": 5e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7300989627838135, + "num_tokens": 390233584.0, + "step": 15082 + }, + { + "epoch": 1.6563804085218536, + "grad_norm": 2.2066245079040527, + "learning_rate": 5e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7376151084899902, + "num_tokens": 390252219.0, + "step": 15083 + }, + { + "epoch": 1.6564902262244674, + "grad_norm": 1.82936692237854, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7341246604919434, + "num_tokens": 390277436.0, + "step": 15084 + }, + { + "epoch": 1.6566000439270812, + "grad_norm": 1.7103136777877808, + "learning_rate": 5e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.7447867393493652, + "num_tokens": 390303107.0, + "step": 15085 + }, + { + "epoch": 1.6567098616296947, + "grad_norm": 1.7788888216018677, + "learning_rate": 5e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7499398589134216, + "num_tokens": 390328353.0, + "step": 15086 + }, + { + "epoch": 1.6568196793323082, + "grad_norm": 1.6817309856414795, + "learning_rate": 5e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7214425802230835, + "num_tokens": 390356802.0, + "step": 15087 + }, + { + "epoch": 1.656929497034922, + "grad_norm": 1.920698881149292, + "learning_rate": 5e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7251060605049133, + "num_tokens": 390381454.0, + "step": 15088 + }, + { + "epoch": 1.6570393147375357, + "grad_norm": 2.048517942428589, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7612640857696533, + "num_tokens": 390400123.0, + "step": 15089 + }, + { + "epoch": 1.6571491324401495, + "grad_norm": 1.9662479162216187, + "learning_rate": 5e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7283954620361328, + "num_tokens": 390421861.0, + "step": 15090 + }, + { + "epoch": 1.657258950142763, + "grad_norm": 1.8328568935394287, + "learning_rate": 5e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7422051429748535, + "num_tokens": 390447532.0, + "step": 15091 + }, + { + "epoch": 1.6573687678453766, + "grad_norm": 1.931564450263977, + "learning_rate": 5e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7402178645133972, + "num_tokens": 390472459.0, + "step": 15092 + }, + { + "epoch": 1.6574785855479903, + "grad_norm": 2.0444204807281494, + "learning_rate": 5e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7288556098937988, + "num_tokens": 390495660.0, + "step": 15093 + }, + { + "epoch": 1.657588403250604, + "grad_norm": 1.9893240928649902, + "learning_rate": 5e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7501546144485474, + "num_tokens": 390517255.0, + "step": 15094 + }, + { + "epoch": 1.6576982209532176, + "grad_norm": 1.932174801826477, + "learning_rate": 5e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7253308296203613, + "num_tokens": 390542285.0, + "step": 15095 + }, + { + "epoch": 1.6578080386558314, + "grad_norm": 1.8792064189910889, + "learning_rate": 5e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7127846479415894, + "num_tokens": 390569663.0, + "step": 15096 + }, + { + "epoch": 1.657917856358445, + "grad_norm": 1.7967363595962524, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7261245250701904, + "num_tokens": 390597577.0, + "step": 15097 + }, + { + "epoch": 1.6580276740610587, + "grad_norm": 2.096073627471924, + "learning_rate": 5e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7469752430915833, + "num_tokens": 390618193.0, + "step": 15098 + }, + { + "epoch": 1.6581374917636724, + "grad_norm": 1.9124184846878052, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7355355024337769, + "num_tokens": 390642469.0, + "step": 15099 + }, + { + "epoch": 1.658247309466286, + "grad_norm": 2.161264657974243, + "learning_rate": 5e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7429277300834656, + "num_tokens": 390661906.0, + "step": 15100 + }, + { + "epoch": 1.6583571271688995, + "grad_norm": 1.6879044771194458, + "learning_rate": 5e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7416869401931763, + "num_tokens": 390688921.0, + "step": 15101 + }, + { + "epoch": 1.6584669448715132, + "grad_norm": 1.4954860210418701, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7180089950561523, + "num_tokens": 390724749.0, + "step": 15102 + }, + { + "epoch": 1.658576762574127, + "grad_norm": 1.87306809425354, + "learning_rate": 5e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7300227880477905, + "num_tokens": 390750824.0, + "step": 15103 + }, + { + "epoch": 1.6586865802767408, + "grad_norm": 1.7273436784744263, + "learning_rate": 5e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.7605118155479431, + "num_tokens": 390777303.0, + "step": 15104 + }, + { + "epoch": 1.6587963979793543, + "grad_norm": 1.7211894989013672, + "learning_rate": 5e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7360751032829285, + "num_tokens": 390804753.0, + "step": 15105 + }, + { + "epoch": 1.6589062156819678, + "grad_norm": 1.8463596105575562, + "learning_rate": 5e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7069013714790344, + "num_tokens": 390831903.0, + "step": 15106 + }, + { + "epoch": 1.6590160333845816, + "grad_norm": 1.7044278383255005, + "learning_rate": 5e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7194505929946899, + "num_tokens": 390861215.0, + "step": 15107 + }, + { + "epoch": 1.6591258510871953, + "grad_norm": 1.5714937448501587, + "learning_rate": 5e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.6981610059738159, + "num_tokens": 390895240.0, + "step": 15108 + }, + { + "epoch": 1.6592356687898089, + "grad_norm": 1.8416173458099365, + "learning_rate": 5e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7654807567596436, + "num_tokens": 390918495.0, + "step": 15109 + }, + { + "epoch": 1.6593454864924224, + "grad_norm": 1.9832321405410767, + "learning_rate": 5e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7486984729766846, + "num_tokens": 390939675.0, + "step": 15110 + }, + { + "epoch": 1.6594553041950362, + "grad_norm": 1.6404975652694702, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7355753183364868, + "num_tokens": 390969810.0, + "step": 15111 + }, + { + "epoch": 1.65956512189765, + "grad_norm": 1.856736421585083, + "learning_rate": 5e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7333271503448486, + "num_tokens": 390995943.0, + "step": 15112 + }, + { + "epoch": 1.6596749396002637, + "grad_norm": 1.7293481826782227, + "learning_rate": 5e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7386584281921387, + "num_tokens": 391022715.0, + "step": 15113 + }, + { + "epoch": 1.6597847573028772, + "grad_norm": 1.9742451906204224, + "learning_rate": 5e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7398381233215332, + "num_tokens": 391046671.0, + "step": 15114 + }, + { + "epoch": 1.6598945750054908, + "grad_norm": 1.7397485971450806, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7208985090255737, + "num_tokens": 391077068.0, + "step": 15115 + }, + { + "epoch": 1.6600043927081045, + "grad_norm": 1.8339557647705078, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7345699071884155, + "num_tokens": 391100775.0, + "step": 15116 + }, + { + "epoch": 1.6601142104107183, + "grad_norm": 1.7391538619995117, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7354223728179932, + "num_tokens": 391130575.0, + "step": 15117 + }, + { + "epoch": 1.660224028113332, + "grad_norm": 2.0322189331054688, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7351474761962891, + "num_tokens": 391153815.0, + "step": 15118 + }, + { + "epoch": 1.6603338458159456, + "grad_norm": 1.9748234748840332, + "learning_rate": 5e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7640124559402466, + "num_tokens": 391173962.0, + "step": 15119 + }, + { + "epoch": 1.660443663518559, + "grad_norm": 2.015902280807495, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7207673788070679, + "num_tokens": 391197040.0, + "step": 15120 + }, + { + "epoch": 1.6605534812211729, + "grad_norm": 1.8125529289245605, + "learning_rate": 5e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7024540901184082, + "num_tokens": 391225162.0, + "step": 15121 + }, + { + "epoch": 1.6606632989237866, + "grad_norm": 2.018563985824585, + "learning_rate": 5e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.6942061185836792, + "num_tokens": 391246650.0, + "step": 15122 + }, + { + "epoch": 1.6607731166264001, + "grad_norm": 1.9690091609954834, + "learning_rate": 5e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7318824529647827, + "num_tokens": 391269354.0, + "step": 15123 + }, + { + "epoch": 1.6608829343290137, + "grad_norm": 1.8520011901855469, + "learning_rate": 5e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7270838022232056, + "num_tokens": 391295007.0, + "step": 15124 + }, + { + "epoch": 1.6609927520316274, + "grad_norm": 1.827729344367981, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7304545044898987, + "num_tokens": 391320809.0, + "step": 15125 + }, + { + "epoch": 1.6611025697342412, + "grad_norm": 1.634886622428894, + "learning_rate": 5e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7554166316986084, + "num_tokens": 391350637.0, + "step": 15126 + }, + { + "epoch": 1.661212387436855, + "grad_norm": 1.708013892173767, + "learning_rate": 5e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7356120944023132, + "num_tokens": 391376671.0, + "step": 15127 + }, + { + "epoch": 1.6613222051394685, + "grad_norm": 1.9753435850143433, + "learning_rate": 5e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7521153688430786, + "num_tokens": 391396140.0, + "step": 15128 + }, + { + "epoch": 1.661432022842082, + "grad_norm": 1.8042545318603516, + "learning_rate": 5e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7545455694198608, + "num_tokens": 391420810.0, + "step": 15129 + }, + { + "epoch": 1.6615418405446958, + "grad_norm": 1.8267731666564941, + "learning_rate": 5e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7402673959732056, + "num_tokens": 391444410.0, + "step": 15130 + }, + { + "epoch": 1.6616516582473095, + "grad_norm": 1.7855123281478882, + "learning_rate": 5e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7367088794708252, + "num_tokens": 391472883.0, + "step": 15131 + }, + { + "epoch": 1.661761475949923, + "grad_norm": 1.7922258377075195, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7156575322151184, + "num_tokens": 391500664.0, + "step": 15132 + }, + { + "epoch": 1.6618712936525368, + "grad_norm": 1.8394993543624878, + "learning_rate": 5e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7548892498016357, + "num_tokens": 391522805.0, + "step": 15133 + }, + { + "epoch": 1.6619811113551504, + "grad_norm": 2.153784990310669, + "learning_rate": 5e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7407586574554443, + "num_tokens": 391541811.0, + "step": 15134 + }, + { + "epoch": 1.6620909290577641, + "grad_norm": 1.7837830781936646, + "learning_rate": 5e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7359236478805542, + "num_tokens": 391569999.0, + "step": 15135 + }, + { + "epoch": 1.6622007467603779, + "grad_norm": 1.8403184413909912, + "learning_rate": 5e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7675480842590332, + "num_tokens": 391592510.0, + "step": 15136 + }, + { + "epoch": 1.6623105644629914, + "grad_norm": 1.8277145624160767, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7355442047119141, + "num_tokens": 391619585.0, + "step": 15137 + }, + { + "epoch": 1.662420382165605, + "grad_norm": 1.6947649717330933, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7214319705963135, + "num_tokens": 391649884.0, + "step": 15138 + }, + { + "epoch": 1.6625301998682187, + "grad_norm": 2.0288960933685303, + "learning_rate": 5e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7479271292686462, + "num_tokens": 391671269.0, + "step": 15139 + }, + { + "epoch": 1.6626400175708325, + "grad_norm": 1.7866332530975342, + "learning_rate": 5e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.752513587474823, + "num_tokens": 391696947.0, + "step": 15140 + }, + { + "epoch": 1.6627498352734462, + "grad_norm": 1.8773709535598755, + "learning_rate": 5e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7391732931137085, + "num_tokens": 391719956.0, + "step": 15141 + }, + { + "epoch": 1.6628596529760598, + "grad_norm": 1.7898390293121338, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.736376166343689, + "num_tokens": 391745742.0, + "step": 15142 + }, + { + "epoch": 1.6629694706786733, + "grad_norm": 1.8221795558929443, + "learning_rate": 5e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7161712646484375, + "num_tokens": 391771018.0, + "step": 15143 + }, + { + "epoch": 1.663079288381287, + "grad_norm": 1.885646939277649, + "learning_rate": 5e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.7419853210449219, + "num_tokens": 391794582.0, + "step": 15144 + }, + { + "epoch": 1.6631891060839008, + "grad_norm": 1.9238706827163696, + "learning_rate": 5e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7455267310142517, + "num_tokens": 391816691.0, + "step": 15145 + }, + { + "epoch": 1.6632989237865143, + "grad_norm": 1.8758662939071655, + "learning_rate": 5e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.731290876865387, + "num_tokens": 391839948.0, + "step": 15146 + }, + { + "epoch": 1.663408741489128, + "grad_norm": 1.7850418090820312, + "learning_rate": 5e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7345422506332397, + "num_tokens": 391865200.0, + "step": 15147 + }, + { + "epoch": 1.6635185591917416, + "grad_norm": 2.0950205326080322, + "learning_rate": 5e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7454005479812622, + "num_tokens": 391886406.0, + "step": 15148 + }, + { + "epoch": 1.6636283768943554, + "grad_norm": 1.7253341674804688, + "learning_rate": 5e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7374876737594604, + "num_tokens": 391916001.0, + "step": 15149 + }, + { + "epoch": 1.6637381945969691, + "grad_norm": 1.768193006515503, + "learning_rate": 5e-06, + "loss": 0.856, + "mean_token_accuracy": 0.731626033782959, + "num_tokens": 391942704.0, + "step": 15150 + }, + { + "epoch": 1.6638480122995827, + "grad_norm": 1.6415754556655884, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7353466153144836, + "num_tokens": 391972226.0, + "step": 15151 + }, + { + "epoch": 1.6639578300021962, + "grad_norm": 1.6765482425689697, + "learning_rate": 5e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7141173481941223, + "num_tokens": 392000596.0, + "step": 15152 + }, + { + "epoch": 1.66406764770481, + "grad_norm": 1.6244022846221924, + "learning_rate": 5e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7222716212272644, + "num_tokens": 392031046.0, + "step": 15153 + }, + { + "epoch": 1.6641774654074237, + "grad_norm": 1.983682632446289, + "learning_rate": 5e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7300249934196472, + "num_tokens": 392057751.0, + "step": 15154 + }, + { + "epoch": 1.6642872831100375, + "grad_norm": 1.8376610279083252, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7260787487030029, + "num_tokens": 392083167.0, + "step": 15155 + }, + { + "epoch": 1.664397100812651, + "grad_norm": 1.8094779253005981, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7124661207199097, + "num_tokens": 392110723.0, + "step": 15156 + }, + { + "epoch": 1.6645069185152646, + "grad_norm": 2.1126277446746826, + "learning_rate": 5e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7368580102920532, + "num_tokens": 392131148.0, + "step": 15157 + }, + { + "epoch": 1.6646167362178783, + "grad_norm": 1.8230040073394775, + "learning_rate": 5e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7159048318862915, + "num_tokens": 392161080.0, + "step": 15158 + }, + { + "epoch": 1.664726553920492, + "grad_norm": 1.8956348896026611, + "learning_rate": 5e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7470678687095642, + "num_tokens": 392183948.0, + "step": 15159 + }, + { + "epoch": 1.6648363716231056, + "grad_norm": 1.7998676300048828, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7353472709655762, + "num_tokens": 392208829.0, + "step": 15160 + }, + { + "epoch": 1.6649461893257194, + "grad_norm": 1.8034443855285645, + "learning_rate": 5e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7374695539474487, + "num_tokens": 392234221.0, + "step": 15161 + }, + { + "epoch": 1.665056007028333, + "grad_norm": 1.619095802307129, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7327747344970703, + "num_tokens": 392263904.0, + "step": 15162 + }, + { + "epoch": 1.6651658247309467, + "grad_norm": 1.6800298690795898, + "learning_rate": 5e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7519185543060303, + "num_tokens": 392290999.0, + "step": 15163 + }, + { + "epoch": 1.6652756424335604, + "grad_norm": 1.8755160570144653, + "learning_rate": 5e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7483407258987427, + "num_tokens": 392314800.0, + "step": 15164 + }, + { + "epoch": 1.665385460136174, + "grad_norm": 1.893474817276001, + "learning_rate": 5e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7267863750457764, + "num_tokens": 392341412.0, + "step": 15165 + }, + { + "epoch": 1.6654952778387875, + "grad_norm": 1.795357584953308, + "learning_rate": 5e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7669475674629211, + "num_tokens": 392365292.0, + "step": 15166 + }, + { + "epoch": 1.6656050955414012, + "grad_norm": 1.8447279930114746, + "learning_rate": 5e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7163378000259399, + "num_tokens": 392390006.0, + "step": 15167 + }, + { + "epoch": 1.665714913244015, + "grad_norm": 1.8624727725982666, + "learning_rate": 5e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7315800189971924, + "num_tokens": 392416100.0, + "step": 15168 + }, + { + "epoch": 1.6658247309466288, + "grad_norm": 1.9646332263946533, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.731500506401062, + "num_tokens": 392438523.0, + "step": 15169 + }, + { + "epoch": 1.6659345486492423, + "grad_norm": 1.835806131362915, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7434920072555542, + "num_tokens": 392462390.0, + "step": 15170 + }, + { + "epoch": 1.6660443663518558, + "grad_norm": 1.794020652770996, + "learning_rate": 5e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7368777990341187, + "num_tokens": 392490048.0, + "step": 15171 + }, + { + "epoch": 1.6661541840544696, + "grad_norm": 2.184030532836914, + "learning_rate": 5e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7542083263397217, + "num_tokens": 392508141.0, + "step": 15172 + }, + { + "epoch": 1.6662640017570833, + "grad_norm": 1.6612532138824463, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7142682075500488, + "num_tokens": 392538908.0, + "step": 15173 + }, + { + "epoch": 1.6663738194596969, + "grad_norm": 2.065718412399292, + "learning_rate": 5e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7267670035362244, + "num_tokens": 392559266.0, + "step": 15174 + }, + { + "epoch": 1.6664836371623104, + "grad_norm": 1.9195388555526733, + "learning_rate": 5e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7604719400405884, + "num_tokens": 392580237.0, + "step": 15175 + }, + { + "epoch": 1.6665934548649242, + "grad_norm": 1.8939223289489746, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7349656224250793, + "num_tokens": 392602188.0, + "step": 15176 + }, + { + "epoch": 1.666703272567538, + "grad_norm": 1.9323291778564453, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7648278474807739, + "num_tokens": 392623860.0, + "step": 15177 + }, + { + "epoch": 1.6668130902701517, + "grad_norm": 1.7800618410110474, + "learning_rate": 5e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7372958064079285, + "num_tokens": 392649711.0, + "step": 15178 + }, + { + "epoch": 1.6669229079727652, + "grad_norm": 1.716979742050171, + "learning_rate": 5e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7266507148742676, + "num_tokens": 392676348.0, + "step": 15179 + }, + { + "epoch": 1.6670327256753787, + "grad_norm": 1.853032112121582, + "learning_rate": 5e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7643778920173645, + "num_tokens": 392699354.0, + "step": 15180 + }, + { + "epoch": 1.6671425433779925, + "grad_norm": 1.7887487411499023, + "learning_rate": 5e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.718979001045227, + "num_tokens": 392726507.0, + "step": 15181 + }, + { + "epoch": 1.6672523610806063, + "grad_norm": 2.0724587440490723, + "learning_rate": 5e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7087981700897217, + "num_tokens": 392750368.0, + "step": 15182 + }, + { + "epoch": 1.66736217878322, + "grad_norm": 1.721283197402954, + "learning_rate": 5e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7120943069458008, + "num_tokens": 392779695.0, + "step": 15183 + }, + { + "epoch": 1.6674719964858336, + "grad_norm": 1.757208228111267, + "learning_rate": 5e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7560125589370728, + "num_tokens": 392806152.0, + "step": 15184 + }, + { + "epoch": 1.667581814188447, + "grad_norm": 1.9116179943084717, + "learning_rate": 5e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7212033271789551, + "num_tokens": 392832570.0, + "step": 15185 + }, + { + "epoch": 1.6676916318910608, + "grad_norm": 1.78531813621521, + "learning_rate": 5e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7508572340011597, + "num_tokens": 392857801.0, + "step": 15186 + }, + { + "epoch": 1.6678014495936746, + "grad_norm": 2.0820791721343994, + "learning_rate": 5e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7434331178665161, + "num_tokens": 392879407.0, + "step": 15187 + }, + { + "epoch": 1.6679112672962881, + "grad_norm": 1.6632000207901, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7252684831619263, + "num_tokens": 392910898.0, + "step": 15188 + }, + { + "epoch": 1.6680210849989017, + "grad_norm": 1.7635905742645264, + "learning_rate": 5e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7250071167945862, + "num_tokens": 392936744.0, + "step": 15189 + }, + { + "epoch": 1.6681309027015154, + "grad_norm": 1.7698436975479126, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7376763820648193, + "num_tokens": 392962326.0, + "step": 15190 + }, + { + "epoch": 1.6682407204041292, + "grad_norm": 1.7586137056350708, + "learning_rate": 5e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7050328850746155, + "num_tokens": 392992094.0, + "step": 15191 + }, + { + "epoch": 1.668350538106743, + "grad_norm": 1.7102664709091187, + "learning_rate": 5e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7391437292098999, + "num_tokens": 393016377.0, + "step": 15192 + }, + { + "epoch": 1.6684603558093565, + "grad_norm": 1.8009097576141357, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7296552658081055, + "num_tokens": 393044092.0, + "step": 15193 + }, + { + "epoch": 1.66857017351197, + "grad_norm": 1.9049978256225586, + "learning_rate": 5e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7036030888557434, + "num_tokens": 393068507.0, + "step": 15194 + }, + { + "epoch": 1.6686799912145838, + "grad_norm": 2.100343942642212, + "learning_rate": 5e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7507306337356567, + "num_tokens": 393090931.0, + "step": 15195 + }, + { + "epoch": 1.6687898089171975, + "grad_norm": 1.8077088594436646, + "learning_rate": 5e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.7459638118743896, + "num_tokens": 393117307.0, + "step": 15196 + }, + { + "epoch": 1.668899626619811, + "grad_norm": 1.8482635021209717, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7418006658554077, + "num_tokens": 393145145.0, + "step": 15197 + }, + { + "epoch": 1.6690094443224248, + "grad_norm": 1.9610447883605957, + "learning_rate": 5e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7212966680526733, + "num_tokens": 393170119.0, + "step": 15198 + }, + { + "epoch": 1.6691192620250384, + "grad_norm": 2.002764940261841, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7350127696990967, + "num_tokens": 393193430.0, + "step": 15199 + }, + { + "epoch": 1.669229079727652, + "grad_norm": 2.121567487716675, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7259308099746704, + "num_tokens": 393214213.0, + "step": 15200 + }, + { + "epoch": 1.6693388974302659, + "grad_norm": 1.6870571374893188, + "learning_rate": 5e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7148845791816711, + "num_tokens": 393245606.0, + "step": 15201 + }, + { + "epoch": 1.6694487151328794, + "grad_norm": 1.6777265071868896, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7310243844985962, + "num_tokens": 393274657.0, + "step": 15202 + }, + { + "epoch": 1.669558532835493, + "grad_norm": 1.64844810962677, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7349625825881958, + "num_tokens": 393302547.0, + "step": 15203 + }, + { + "epoch": 1.6696683505381067, + "grad_norm": 1.9774843454360962, + "learning_rate": 5e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.734276533126831, + "num_tokens": 393324613.0, + "step": 15204 + }, + { + "epoch": 1.6697781682407205, + "grad_norm": 1.9781901836395264, + "learning_rate": 5e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7429168224334717, + "num_tokens": 393346206.0, + "step": 15205 + }, + { + "epoch": 1.6698879859433342, + "grad_norm": 1.9853280782699585, + "learning_rate": 5e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.743074893951416, + "num_tokens": 393368560.0, + "step": 15206 + }, + { + "epoch": 1.6699978036459477, + "grad_norm": 1.748631477355957, + "learning_rate": 5e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7316564917564392, + "num_tokens": 393395100.0, + "step": 15207 + }, + { + "epoch": 1.6701076213485613, + "grad_norm": 1.8444656133651733, + "learning_rate": 5e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7324533462524414, + "num_tokens": 393421765.0, + "step": 15208 + }, + { + "epoch": 1.670217439051175, + "grad_norm": 1.9252427816390991, + "learning_rate": 5e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7288205623626709, + "num_tokens": 393447799.0, + "step": 15209 + }, + { + "epoch": 1.6703272567537888, + "grad_norm": 1.7922320365905762, + "learning_rate": 5e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7376699447631836, + "num_tokens": 393475225.0, + "step": 15210 + }, + { + "epoch": 1.6704370744564023, + "grad_norm": 1.8877623081207275, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7201124429702759, + "num_tokens": 393502813.0, + "step": 15211 + }, + { + "epoch": 1.670546892159016, + "grad_norm": 1.7624232769012451, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7408164739608765, + "num_tokens": 393531218.0, + "step": 15212 + }, + { + "epoch": 1.6706567098616296, + "grad_norm": 1.9451030492782593, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.720122754573822, + "num_tokens": 393555964.0, + "step": 15213 + }, + { + "epoch": 1.6707665275642434, + "grad_norm": 1.7579455375671387, + "learning_rate": 5e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.748975396156311, + "num_tokens": 393581655.0, + "step": 15214 + }, + { + "epoch": 1.6708763452668571, + "grad_norm": 1.9537500143051147, + "learning_rate": 5e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7517554759979248, + "num_tokens": 393603575.0, + "step": 15215 + }, + { + "epoch": 1.6709861629694707, + "grad_norm": 1.7873369455337524, + "learning_rate": 5e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7176541090011597, + "num_tokens": 393635523.0, + "step": 15216 + }, + { + "epoch": 1.6710959806720842, + "grad_norm": 1.7441953420639038, + "learning_rate": 5e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7214974761009216, + "num_tokens": 393665781.0, + "step": 15217 + }, + { + "epoch": 1.671205798374698, + "grad_norm": 1.8996762037277222, + "learning_rate": 5e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7470224499702454, + "num_tokens": 393689239.0, + "step": 15218 + }, + { + "epoch": 1.6713156160773117, + "grad_norm": 1.872674822807312, + "learning_rate": 5e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7267128825187683, + "num_tokens": 393715565.0, + "step": 15219 + }, + { + "epoch": 1.6714254337799255, + "grad_norm": 2.053497076034546, + "learning_rate": 5e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7474008798599243, + "num_tokens": 393735616.0, + "step": 15220 + }, + { + "epoch": 1.671535251482539, + "grad_norm": 1.944608211517334, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7443014979362488, + "num_tokens": 393757428.0, + "step": 15221 + }, + { + "epoch": 1.6716450691851525, + "grad_norm": 1.7787338495254517, + "learning_rate": 5e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7432668209075928, + "num_tokens": 393782663.0, + "step": 15222 + }, + { + "epoch": 1.6717548868877663, + "grad_norm": 2.131152391433716, + "learning_rate": 5e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7660049796104431, + "num_tokens": 393801674.0, + "step": 15223 + }, + { + "epoch": 1.67186470459038, + "grad_norm": 1.6443281173706055, + "learning_rate": 5e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.7581493854522705, + "num_tokens": 393832457.0, + "step": 15224 + }, + { + "epoch": 1.6719745222929936, + "grad_norm": 1.7708672285079956, + "learning_rate": 5e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7365626096725464, + "num_tokens": 393857879.0, + "step": 15225 + }, + { + "epoch": 1.6720843399956071, + "grad_norm": 1.7237731218338013, + "learning_rate": 5e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7475653886795044, + "num_tokens": 393885320.0, + "step": 15226 + }, + { + "epoch": 1.6721941576982209, + "grad_norm": 1.9487714767456055, + "learning_rate": 5e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7466214895248413, + "num_tokens": 393906687.0, + "step": 15227 + }, + { + "epoch": 1.6723039754008346, + "grad_norm": 1.7863372564315796, + "learning_rate": 5e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7292773723602295, + "num_tokens": 393933473.0, + "step": 15228 + }, + { + "epoch": 1.6724137931034484, + "grad_norm": 1.6981201171875, + "learning_rate": 5e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7171770334243774, + "num_tokens": 393962508.0, + "step": 15229 + }, + { + "epoch": 1.672523610806062, + "grad_norm": 1.8712058067321777, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7411988377571106, + "num_tokens": 393988456.0, + "step": 15230 + }, + { + "epoch": 1.6726334285086755, + "grad_norm": 1.6060901880264282, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7160606384277344, + "num_tokens": 394019409.0, + "step": 15231 + }, + { + "epoch": 1.6727432462112892, + "grad_norm": 1.8571498394012451, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7482366561889648, + "num_tokens": 394044337.0, + "step": 15232 + }, + { + "epoch": 1.672853063913903, + "grad_norm": 1.9107344150543213, + "learning_rate": 5e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7275952100753784, + "num_tokens": 394068656.0, + "step": 15233 + }, + { + "epoch": 1.6729628816165167, + "grad_norm": 1.6855435371398926, + "learning_rate": 5e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7273906469345093, + "num_tokens": 394099056.0, + "step": 15234 + }, + { + "epoch": 1.6730726993191303, + "grad_norm": 1.8110740184783936, + "learning_rate": 5e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7203290462493896, + "num_tokens": 394123517.0, + "step": 15235 + }, + { + "epoch": 1.6731825170217438, + "grad_norm": 1.9211996793746948, + "learning_rate": 5e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7366285920143127, + "num_tokens": 394148375.0, + "step": 15236 + }, + { + "epoch": 1.6732923347243576, + "grad_norm": 1.817726969718933, + "learning_rate": 5e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.717119574546814, + "num_tokens": 394174764.0, + "step": 15237 + }, + { + "epoch": 1.6734021524269713, + "grad_norm": 1.8495662212371826, + "learning_rate": 5e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7288854718208313, + "num_tokens": 394200274.0, + "step": 15238 + }, + { + "epoch": 1.6735119701295849, + "grad_norm": 1.8649883270263672, + "learning_rate": 5e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7364457845687866, + "num_tokens": 394226354.0, + "step": 15239 + }, + { + "epoch": 1.6736217878321984, + "grad_norm": 2.033081293106079, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7340417504310608, + "num_tokens": 394248138.0, + "step": 15240 + }, + { + "epoch": 1.6737316055348122, + "grad_norm": 2.0454416275024414, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7412058711051941, + "num_tokens": 394269089.0, + "step": 15241 + }, + { + "epoch": 1.673841423237426, + "grad_norm": 2.0720720291137695, + "learning_rate": 5e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7559575438499451, + "num_tokens": 394288265.0, + "step": 15242 + }, + { + "epoch": 1.6739512409400397, + "grad_norm": 1.7256420850753784, + "learning_rate": 5e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7322646379470825, + "num_tokens": 394317153.0, + "step": 15243 + }, + { + "epoch": 1.6740610586426532, + "grad_norm": 1.6061489582061768, + "learning_rate": 5e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7215107679367065, + "num_tokens": 394348423.0, + "step": 15244 + }, + { + "epoch": 1.6741708763452667, + "grad_norm": 1.5846202373504639, + "learning_rate": 5e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7430177927017212, + "num_tokens": 394377371.0, + "step": 15245 + }, + { + "epoch": 1.6742806940478805, + "grad_norm": 1.9484699964523315, + "learning_rate": 5e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.723496675491333, + "num_tokens": 394400179.0, + "step": 15246 + }, + { + "epoch": 1.6743905117504942, + "grad_norm": 1.6549491882324219, + "learning_rate": 5e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7288992404937744, + "num_tokens": 394429708.0, + "step": 15247 + }, + { + "epoch": 1.674500329453108, + "grad_norm": 1.7657262086868286, + "learning_rate": 5e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7356983423233032, + "num_tokens": 394458956.0, + "step": 15248 + }, + { + "epoch": 1.6746101471557215, + "grad_norm": 1.8325210809707642, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7384309768676758, + "num_tokens": 394485087.0, + "step": 15249 + }, + { + "epoch": 1.674719964858335, + "grad_norm": 1.948496699333191, + "learning_rate": 5e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7594964504241943, + "num_tokens": 394507459.0, + "step": 15250 + }, + { + "epoch": 1.6748297825609488, + "grad_norm": 1.765204668045044, + "learning_rate": 5e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7156262993812561, + "num_tokens": 394536106.0, + "step": 15251 + }, + { + "epoch": 1.6749396002635626, + "grad_norm": 1.936221718788147, + "learning_rate": 5e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7450883388519287, + "num_tokens": 394557350.0, + "step": 15252 + }, + { + "epoch": 1.6750494179661761, + "grad_norm": 1.7266992330551147, + "learning_rate": 5e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7307209372520447, + "num_tokens": 394584982.0, + "step": 15253 + }, + { + "epoch": 1.6751592356687897, + "grad_norm": 1.7018895149230957, + "learning_rate": 5e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7508593797683716, + "num_tokens": 394611628.0, + "step": 15254 + }, + { + "epoch": 1.6752690533714034, + "grad_norm": 2.0642480850219727, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7437704801559448, + "num_tokens": 394631150.0, + "step": 15255 + }, + { + "epoch": 1.6753788710740172, + "grad_norm": 1.8920750617980957, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7162111401557922, + "num_tokens": 394656057.0, + "step": 15256 + }, + { + "epoch": 1.675488688776631, + "grad_norm": 1.8027914762496948, + "learning_rate": 5e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7557787895202637, + "num_tokens": 394681245.0, + "step": 15257 + }, + { + "epoch": 1.6755985064792445, + "grad_norm": 1.9828345775604248, + "learning_rate": 5e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7200119495391846, + "num_tokens": 394703888.0, + "step": 15258 + }, + { + "epoch": 1.675708324181858, + "grad_norm": 1.8064829111099243, + "learning_rate": 5e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7514379024505615, + "num_tokens": 394730830.0, + "step": 15259 + }, + { + "epoch": 1.6758181418844718, + "grad_norm": 1.8626563549041748, + "learning_rate": 5e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.740905225276947, + "num_tokens": 394757007.0, + "step": 15260 + }, + { + "epoch": 1.6759279595870855, + "grad_norm": 1.7885171175003052, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7737082242965698, + "num_tokens": 394781498.0, + "step": 15261 + }, + { + "epoch": 1.676037777289699, + "grad_norm": 1.9558217525482178, + "learning_rate": 5e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.739683210849762, + "num_tokens": 394805412.0, + "step": 15262 + }, + { + "epoch": 1.6761475949923128, + "grad_norm": 1.7347755432128906, + "learning_rate": 5e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7399688959121704, + "num_tokens": 394834036.0, + "step": 15263 + }, + { + "epoch": 1.6762574126949263, + "grad_norm": 1.7683850526809692, + "learning_rate": 5e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7210628986358643, + "num_tokens": 394862885.0, + "step": 15264 + }, + { + "epoch": 1.67636723039754, + "grad_norm": 1.942691683769226, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.722007155418396, + "num_tokens": 394886710.0, + "step": 15265 + }, + { + "epoch": 1.6764770481001539, + "grad_norm": 2.1634833812713623, + "learning_rate": 5e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7717359066009521, + "num_tokens": 394904044.0, + "step": 15266 + }, + { + "epoch": 1.6765868658027674, + "grad_norm": 1.5804073810577393, + "learning_rate": 5e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7340619564056396, + "num_tokens": 394936234.0, + "step": 15267 + }, + { + "epoch": 1.676696683505381, + "grad_norm": 1.915637731552124, + "learning_rate": 5e-06, + "loss": 0.966, + "mean_token_accuracy": 0.6989449262619019, + "num_tokens": 394962720.0, + "step": 15268 + }, + { + "epoch": 1.6768065012079947, + "grad_norm": 1.5911697149276733, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7158712148666382, + "num_tokens": 394997271.0, + "step": 15269 + }, + { + "epoch": 1.6769163189106084, + "grad_norm": 1.8394684791564941, + "learning_rate": 5e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7404850721359253, + "num_tokens": 395020511.0, + "step": 15270 + }, + { + "epoch": 1.6770261366132222, + "grad_norm": 1.6707130670547485, + "learning_rate": 5e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7346243858337402, + "num_tokens": 395049283.0, + "step": 15271 + }, + { + "epoch": 1.6771359543158357, + "grad_norm": 1.9875702857971191, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7319197654724121, + "num_tokens": 395072261.0, + "step": 15272 + }, + { + "epoch": 1.6772457720184493, + "grad_norm": 1.8172521591186523, + "learning_rate": 5e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.735843300819397, + "num_tokens": 395096681.0, + "step": 15273 + }, + { + "epoch": 1.677355589721063, + "grad_norm": 1.9131286144256592, + "learning_rate": 5e-06, + "loss": 0.794, + "mean_token_accuracy": 0.7437829971313477, + "num_tokens": 395120623.0, + "step": 15274 + }, + { + "epoch": 1.6774654074236768, + "grad_norm": 1.818422555923462, + "learning_rate": 5e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7160897850990295, + "num_tokens": 395147899.0, + "step": 15275 + }, + { + "epoch": 1.6775752251262903, + "grad_norm": 1.8193246126174927, + "learning_rate": 5e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.6998536586761475, + "num_tokens": 395176081.0, + "step": 15276 + }, + { + "epoch": 1.677685042828904, + "grad_norm": 2.0005109310150146, + "learning_rate": 5e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7332860231399536, + "num_tokens": 395199992.0, + "step": 15277 + }, + { + "epoch": 1.6777948605315176, + "grad_norm": 1.7939461469650269, + "learning_rate": 5e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7287274599075317, + "num_tokens": 395226937.0, + "step": 15278 + }, + { + "epoch": 1.6779046782341314, + "grad_norm": 1.9973289966583252, + "learning_rate": 5e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7255231738090515, + "num_tokens": 395250406.0, + "step": 15279 + }, + { + "epoch": 1.6780144959367451, + "grad_norm": 1.770369291305542, + "learning_rate": 5e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.71271812915802, + "num_tokens": 395279893.0, + "step": 15280 + }, + { + "epoch": 1.6781243136393587, + "grad_norm": 2.016493320465088, + "learning_rate": 5e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7436903715133667, + "num_tokens": 395301666.0, + "step": 15281 + }, + { + "epoch": 1.6782341313419722, + "grad_norm": 1.688741683959961, + "learning_rate": 5e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7306761741638184, + "num_tokens": 395333338.0, + "step": 15282 + }, + { + "epoch": 1.678343949044586, + "grad_norm": 1.7306545972824097, + "learning_rate": 5e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7093812227249146, + "num_tokens": 395362007.0, + "step": 15283 + }, + { + "epoch": 1.6784537667471997, + "grad_norm": 1.637909173965454, + "learning_rate": 5e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7416777014732361, + "num_tokens": 395391606.0, + "step": 15284 + }, + { + "epoch": 1.6785635844498135, + "grad_norm": 1.7445354461669922, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.720641553401947, + "num_tokens": 395419180.0, + "step": 15285 + }, + { + "epoch": 1.678673402152427, + "grad_norm": 1.767871379852295, + "learning_rate": 5e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7353475093841553, + "num_tokens": 395447823.0, + "step": 15286 + }, + { + "epoch": 1.6787832198550405, + "grad_norm": 1.7510145902633667, + "learning_rate": 5e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.713155210018158, + "num_tokens": 395475950.0, + "step": 15287 + }, + { + "epoch": 1.6788930375576543, + "grad_norm": 1.7884438037872314, + "learning_rate": 5e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.726754367351532, + "num_tokens": 395502563.0, + "step": 15288 + }, + { + "epoch": 1.679002855260268, + "grad_norm": 1.892836570739746, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7221031188964844, + "num_tokens": 395526632.0, + "step": 15289 + }, + { + "epoch": 1.6791126729628816, + "grad_norm": 1.625242829322815, + "learning_rate": 5e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7414165735244751, + "num_tokens": 395557375.0, + "step": 15290 + }, + { + "epoch": 1.6792224906654951, + "grad_norm": 1.7295897006988525, + "learning_rate": 5e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7123051881790161, + "num_tokens": 395584068.0, + "step": 15291 + }, + { + "epoch": 1.6793323083681089, + "grad_norm": 1.8415719270706177, + "learning_rate": 5e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7354090809822083, + "num_tokens": 395609223.0, + "step": 15292 + }, + { + "epoch": 1.6794421260707226, + "grad_norm": 2.003124713897705, + "learning_rate": 5e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.728329598903656, + "num_tokens": 395631191.0, + "step": 15293 + }, + { + "epoch": 1.6795519437733364, + "grad_norm": 1.692191243171692, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7289512157440186, + "num_tokens": 395660402.0, + "step": 15294 + }, + { + "epoch": 1.67966176147595, + "grad_norm": 2.0038185119628906, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7269805073738098, + "num_tokens": 395681794.0, + "step": 15295 + }, + { + "epoch": 1.6797715791785635, + "grad_norm": 1.7230639457702637, + "learning_rate": 5e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7220784425735474, + "num_tokens": 395708722.0, + "step": 15296 + }, + { + "epoch": 1.6798813968811772, + "grad_norm": 2.004126787185669, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7664854526519775, + "num_tokens": 395729399.0, + "step": 15297 + }, + { + "epoch": 1.679991214583791, + "grad_norm": 1.7814558744430542, + "learning_rate": 5e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7118774056434631, + "num_tokens": 395758106.0, + "step": 15298 + }, + { + "epoch": 1.6801010322864047, + "grad_norm": 1.5986250638961792, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7269346714019775, + "num_tokens": 395788929.0, + "step": 15299 + }, + { + "epoch": 1.6802108499890183, + "grad_norm": 1.7961136102676392, + "learning_rate": 5e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7251102924346924, + "num_tokens": 395816047.0, + "step": 15300 + }, + { + "epoch": 1.6803206676916318, + "grad_norm": 1.8733808994293213, + "learning_rate": 5e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7360185384750366, + "num_tokens": 395840365.0, + "step": 15301 + }, + { + "epoch": 1.6804304853942456, + "grad_norm": 1.7655856609344482, + "learning_rate": 5e-06, + "loss": 0.7821, + "mean_token_accuracy": 0.7536758780479431, + "num_tokens": 395865370.0, + "step": 15302 + }, + { + "epoch": 1.6805403030968593, + "grad_norm": 1.725296974182129, + "learning_rate": 5e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7220414876937866, + "num_tokens": 395893129.0, + "step": 15303 + }, + { + "epoch": 1.6806501207994728, + "grad_norm": 2.08904767036438, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7289104461669922, + "num_tokens": 395914898.0, + "step": 15304 + }, + { + "epoch": 1.6807599385020864, + "grad_norm": 1.7381788492202759, + "learning_rate": 5e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7322636842727661, + "num_tokens": 395943755.0, + "step": 15305 + }, + { + "epoch": 1.6808697562047001, + "grad_norm": 1.6311441659927368, + "learning_rate": 5e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7276833653450012, + "num_tokens": 395973966.0, + "step": 15306 + }, + { + "epoch": 1.680979573907314, + "grad_norm": 1.8855860233306885, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7272450923919678, + "num_tokens": 395997889.0, + "step": 15307 + }, + { + "epoch": 1.6810893916099277, + "grad_norm": 1.6895734071731567, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7485170364379883, + "num_tokens": 396026802.0, + "step": 15308 + }, + { + "epoch": 1.6811992093125412, + "grad_norm": 2.0997607707977295, + "learning_rate": 5e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7370815277099609, + "num_tokens": 396048394.0, + "step": 15309 + }, + { + "epoch": 1.6813090270151547, + "grad_norm": 1.9511961936950684, + "learning_rate": 5e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7311732769012451, + "num_tokens": 396070748.0, + "step": 15310 + }, + { + "epoch": 1.6814188447177685, + "grad_norm": 1.561670184135437, + "learning_rate": 5e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7365001440048218, + "num_tokens": 396101670.0, + "step": 15311 + }, + { + "epoch": 1.6815286624203822, + "grad_norm": 1.8033416271209717, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7333061695098877, + "num_tokens": 396126606.0, + "step": 15312 + }, + { + "epoch": 1.681638480122996, + "grad_norm": 1.9477815628051758, + "learning_rate": 5e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7532042264938354, + "num_tokens": 396147845.0, + "step": 15313 + }, + { + "epoch": 1.6817482978256095, + "grad_norm": 1.651789665222168, + "learning_rate": 5e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7276352643966675, + "num_tokens": 396175543.0, + "step": 15314 + }, + { + "epoch": 1.681858115528223, + "grad_norm": 1.9686658382415771, + "learning_rate": 5e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7276803255081177, + "num_tokens": 396200369.0, + "step": 15315 + }, + { + "epoch": 1.6819679332308368, + "grad_norm": 1.7300881147384644, + "learning_rate": 5e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.720413088798523, + "num_tokens": 396229497.0, + "step": 15316 + }, + { + "epoch": 1.6820777509334506, + "grad_norm": 1.8218215703964233, + "learning_rate": 5e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7311312556266785, + "num_tokens": 396255049.0, + "step": 15317 + }, + { + "epoch": 1.6821875686360641, + "grad_norm": 1.7525599002838135, + "learning_rate": 5e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.735992431640625, + "num_tokens": 396283222.0, + "step": 15318 + }, + { + "epoch": 1.6822973863386776, + "grad_norm": 2.023162364959717, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7402791976928711, + "num_tokens": 396304011.0, + "step": 15319 + }, + { + "epoch": 1.6824072040412914, + "grad_norm": 1.9207181930541992, + "learning_rate": 5e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.726327657699585, + "num_tokens": 396328277.0, + "step": 15320 + }, + { + "epoch": 1.6825170217439052, + "grad_norm": 1.7625443935394287, + "learning_rate": 5e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7151147127151489, + "num_tokens": 396357283.0, + "step": 15321 + }, + { + "epoch": 1.682626839446519, + "grad_norm": 1.955167293548584, + "learning_rate": 5e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7144491672515869, + "num_tokens": 396381445.0, + "step": 15322 + }, + { + "epoch": 1.6827366571491325, + "grad_norm": 2.1034910678863525, + "learning_rate": 5e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7387929558753967, + "num_tokens": 396404401.0, + "step": 15323 + }, + { + "epoch": 1.682846474851746, + "grad_norm": 1.8672306537628174, + "learning_rate": 5e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7404754161834717, + "num_tokens": 396429753.0, + "step": 15324 + }, + { + "epoch": 1.6829562925543597, + "grad_norm": 1.697916865348816, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7209130525588989, + "num_tokens": 396456523.0, + "step": 15325 + }, + { + "epoch": 1.6830661102569735, + "grad_norm": 1.9081393480300903, + "learning_rate": 5e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7217017412185669, + "num_tokens": 396482085.0, + "step": 15326 + }, + { + "epoch": 1.683175927959587, + "grad_norm": 1.5982897281646729, + "learning_rate": 5e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.6980547904968262, + "num_tokens": 396514577.0, + "step": 15327 + }, + { + "epoch": 1.6832857456622008, + "grad_norm": 2.1858532428741455, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7201169729232788, + "num_tokens": 396534969.0, + "step": 15328 + }, + { + "epoch": 1.6833955633648143, + "grad_norm": 1.9162110090255737, + "learning_rate": 5e-06, + "loss": 0.7863, + "mean_token_accuracy": 0.7466881275177002, + "num_tokens": 396558393.0, + "step": 15329 + }, + { + "epoch": 1.683505381067428, + "grad_norm": 1.8019722700119019, + "learning_rate": 5e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7087864875793457, + "num_tokens": 396587197.0, + "step": 15330 + }, + { + "epoch": 1.6836151987700418, + "grad_norm": 1.550697922706604, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7268185615539551, + "num_tokens": 396617639.0, + "step": 15331 + }, + { + "epoch": 1.6837250164726554, + "grad_norm": 1.81160306930542, + "learning_rate": 5e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.73948073387146, + "num_tokens": 396644083.0, + "step": 15332 + }, + { + "epoch": 1.683834834175269, + "grad_norm": 1.8410496711730957, + "learning_rate": 5e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7159293293952942, + "num_tokens": 396668677.0, + "step": 15333 + }, + { + "epoch": 1.6839446518778827, + "grad_norm": 1.931453824043274, + "learning_rate": 5e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7280168533325195, + "num_tokens": 396692838.0, + "step": 15334 + }, + { + "epoch": 1.6840544695804964, + "grad_norm": 1.8210346698760986, + "learning_rate": 5e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.6981605887413025, + "num_tokens": 396720745.0, + "step": 15335 + }, + { + "epoch": 1.6841642872831102, + "grad_norm": 1.7605115175247192, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7196336984634399, + "num_tokens": 396749820.0, + "step": 15336 + }, + { + "epoch": 1.6842741049857237, + "grad_norm": 1.8424943685531616, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7532618641853333, + "num_tokens": 396774641.0, + "step": 15337 + }, + { + "epoch": 1.6843839226883373, + "grad_norm": 1.9306384325027466, + "learning_rate": 5e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7452942132949829, + "num_tokens": 396798814.0, + "step": 15338 + }, + { + "epoch": 1.684493740390951, + "grad_norm": 1.9353145360946655, + "learning_rate": 5e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7310687899589539, + "num_tokens": 396823820.0, + "step": 15339 + }, + { + "epoch": 1.6846035580935648, + "grad_norm": 1.7719653844833374, + "learning_rate": 5e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7146129608154297, + "num_tokens": 396851542.0, + "step": 15340 + }, + { + "epoch": 1.6847133757961783, + "grad_norm": 1.729517936706543, + "learning_rate": 5e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7098115682601929, + "num_tokens": 396880050.0, + "step": 15341 + }, + { + "epoch": 1.684823193498792, + "grad_norm": 1.9804303646087646, + "learning_rate": 5e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7376050353050232, + "num_tokens": 396903791.0, + "step": 15342 + }, + { + "epoch": 1.6849330112014056, + "grad_norm": 2.0191116333007812, + "learning_rate": 5e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7597456574440002, + "num_tokens": 396924109.0, + "step": 15343 + }, + { + "epoch": 1.6850428289040194, + "grad_norm": 1.985988974571228, + "learning_rate": 5e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7398439645767212, + "num_tokens": 396946682.0, + "step": 15344 + }, + { + "epoch": 1.685152646606633, + "grad_norm": 1.7527210712432861, + "learning_rate": 5e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.726012110710144, + "num_tokens": 396975653.0, + "step": 15345 + }, + { + "epoch": 1.6852624643092466, + "grad_norm": 1.8808190822601318, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7497819662094116, + "num_tokens": 396999801.0, + "step": 15346 + }, + { + "epoch": 1.6853722820118602, + "grad_norm": 2.0512197017669678, + "learning_rate": 5e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7125034332275391, + "num_tokens": 397021609.0, + "step": 15347 + }, + { + "epoch": 1.685482099714474, + "grad_norm": 1.8778337240219116, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7296812534332275, + "num_tokens": 397045638.0, + "step": 15348 + }, + { + "epoch": 1.6855919174170877, + "grad_norm": 1.7170361280441284, + "learning_rate": 5e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7085337042808533, + "num_tokens": 397075586.0, + "step": 15349 + }, + { + "epoch": 1.6857017351197014, + "grad_norm": 1.794898509979248, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7262803316116333, + "num_tokens": 397101322.0, + "step": 15350 + }, + { + "epoch": 1.685811552822315, + "grad_norm": 1.6966125965118408, + "learning_rate": 5e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7070284485816956, + "num_tokens": 397131889.0, + "step": 15351 + }, + { + "epoch": 1.6859213705249285, + "grad_norm": 1.597347617149353, + "learning_rate": 5e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6932982206344604, + "num_tokens": 397170439.0, + "step": 15352 + }, + { + "epoch": 1.6860311882275423, + "grad_norm": 1.860291600227356, + "learning_rate": 5e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7324365377426147, + "num_tokens": 397197064.0, + "step": 15353 + }, + { + "epoch": 1.686141005930156, + "grad_norm": 2.0867438316345215, + "learning_rate": 5e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.758891224861145, + "num_tokens": 397216844.0, + "step": 15354 + }, + { + "epoch": 1.6862508236327696, + "grad_norm": 1.9977922439575195, + "learning_rate": 5e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7200363874435425, + "num_tokens": 397240264.0, + "step": 15355 + }, + { + "epoch": 1.686360641335383, + "grad_norm": 1.8682841062545776, + "learning_rate": 5e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7118946313858032, + "num_tokens": 397268188.0, + "step": 15356 + }, + { + "epoch": 1.6864704590379969, + "grad_norm": 1.8133195638656616, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7250486612319946, + "num_tokens": 397296195.0, + "step": 15357 + }, + { + "epoch": 1.6865802767406106, + "grad_norm": 1.604912281036377, + "learning_rate": 5e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7407962083816528, + "num_tokens": 397329987.0, + "step": 15358 + }, + { + "epoch": 1.6866900944432244, + "grad_norm": 1.7887139320373535, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7261462807655334, + "num_tokens": 397354379.0, + "step": 15359 + }, + { + "epoch": 1.686799912145838, + "grad_norm": 1.8305437564849854, + "learning_rate": 5e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7305747270584106, + "num_tokens": 397380086.0, + "step": 15360 + }, + { + "epoch": 1.6869097298484514, + "grad_norm": 1.7692610025405884, + "learning_rate": 5e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7408737540245056, + "num_tokens": 397406041.0, + "step": 15361 + }, + { + "epoch": 1.6870195475510652, + "grad_norm": 1.8784488439559937, + "learning_rate": 5e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7396033406257629, + "num_tokens": 397428339.0, + "step": 15362 + }, + { + "epoch": 1.687129365253679, + "grad_norm": 2.0351386070251465, + "learning_rate": 5e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7380879521369934, + "num_tokens": 397449999.0, + "step": 15363 + }, + { + "epoch": 1.6872391829562927, + "grad_norm": 1.8942078351974487, + "learning_rate": 5e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7215433120727539, + "num_tokens": 397473993.0, + "step": 15364 + }, + { + "epoch": 1.6873490006589063, + "grad_norm": 1.9183896780014038, + "learning_rate": 5e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7540135383605957, + "num_tokens": 397496063.0, + "step": 15365 + }, + { + "epoch": 1.6874588183615198, + "grad_norm": 1.8377964496612549, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7188923358917236, + "num_tokens": 397523903.0, + "step": 15366 + }, + { + "epoch": 1.6875686360641335, + "grad_norm": 1.5985655784606934, + "learning_rate": 5e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7202552556991577, + "num_tokens": 397557015.0, + "step": 15367 + }, + { + "epoch": 1.6876784537667473, + "grad_norm": 1.7788313627243042, + "learning_rate": 5e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7465214729309082, + "num_tokens": 397581744.0, + "step": 15368 + }, + { + "epoch": 1.6877882714693608, + "grad_norm": 1.750060796737671, + "learning_rate": 5e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7342893481254578, + "num_tokens": 397609270.0, + "step": 15369 + }, + { + "epoch": 1.6878980891719744, + "grad_norm": 1.7061058282852173, + "learning_rate": 5e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7393091917037964, + "num_tokens": 397637844.0, + "step": 15370 + }, + { + "epoch": 1.6880079068745881, + "grad_norm": 1.7112979888916016, + "learning_rate": 5e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7318419218063354, + "num_tokens": 397665774.0, + "step": 15371 + }, + { + "epoch": 1.6881177245772019, + "grad_norm": 1.9400588274002075, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.726688027381897, + "num_tokens": 397687656.0, + "step": 15372 + }, + { + "epoch": 1.6882275422798156, + "grad_norm": 1.8001738786697388, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7428427934646606, + "num_tokens": 397711411.0, + "step": 15373 + }, + { + "epoch": 1.6883373599824292, + "grad_norm": 1.9814705848693848, + "learning_rate": 5e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.731059193611145, + "num_tokens": 397733325.0, + "step": 15374 + }, + { + "epoch": 1.6884471776850427, + "grad_norm": 1.8426482677459717, + "learning_rate": 5e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7425374984741211, + "num_tokens": 397757556.0, + "step": 15375 + }, + { + "epoch": 1.6885569953876565, + "grad_norm": 1.9228037595748901, + "learning_rate": 5e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.751809298992157, + "num_tokens": 397779559.0, + "step": 15376 + }, + { + "epoch": 1.6886668130902702, + "grad_norm": 1.8261445760726929, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7302849292755127, + "num_tokens": 397802265.0, + "step": 15377 + }, + { + "epoch": 1.6887766307928838, + "grad_norm": 1.7863280773162842, + "learning_rate": 5e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7183277606964111, + "num_tokens": 397828763.0, + "step": 15378 + }, + { + "epoch": 1.6888864484954975, + "grad_norm": 1.739777684211731, + "learning_rate": 5e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7241792678833008, + "num_tokens": 397855763.0, + "step": 15379 + }, + { + "epoch": 1.688996266198111, + "grad_norm": 2.1633148193359375, + "learning_rate": 5e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7391581535339355, + "num_tokens": 397876022.0, + "step": 15380 + }, + { + "epoch": 1.6891060839007248, + "grad_norm": 1.6459940671920776, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7164676785469055, + "num_tokens": 397906597.0, + "step": 15381 + }, + { + "epoch": 1.6892159016033386, + "grad_norm": 1.7689565420150757, + "learning_rate": 5e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7207446694374084, + "num_tokens": 397935531.0, + "step": 15382 + }, + { + "epoch": 1.689325719305952, + "grad_norm": 1.8927783966064453, + "learning_rate": 5e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7443373203277588, + "num_tokens": 397959712.0, + "step": 15383 + }, + { + "epoch": 1.6894355370085656, + "grad_norm": 1.8479866981506348, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7245669960975647, + "num_tokens": 397989199.0, + "step": 15384 + }, + { + "epoch": 1.6895453547111794, + "grad_norm": 1.6869865655899048, + "learning_rate": 5e-06, + "loss": 0.8081, + "mean_token_accuracy": 0.7396395206451416, + "num_tokens": 398017700.0, + "step": 15385 + }, + { + "epoch": 1.6896551724137931, + "grad_norm": 1.9084657430648804, + "learning_rate": 5e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7464978694915771, + "num_tokens": 398041614.0, + "step": 15386 + }, + { + "epoch": 1.689764990116407, + "grad_norm": 1.7994776964187622, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7232420444488525, + "num_tokens": 398066495.0, + "step": 15387 + }, + { + "epoch": 1.6898748078190204, + "grad_norm": 1.7649778127670288, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7465178370475769, + "num_tokens": 398091581.0, + "step": 15388 + }, + { + "epoch": 1.689984625521634, + "grad_norm": 1.8981221914291382, + "learning_rate": 5e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7230514883995056, + "num_tokens": 398117570.0, + "step": 15389 + }, + { + "epoch": 1.6900944432242477, + "grad_norm": 1.815616250038147, + "learning_rate": 5e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.72407066822052, + "num_tokens": 398143891.0, + "step": 15390 + }, + { + "epoch": 1.6902042609268615, + "grad_norm": 1.6841446161270142, + "learning_rate": 5e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7346935868263245, + "num_tokens": 398172041.0, + "step": 15391 + }, + { + "epoch": 1.690314078629475, + "grad_norm": 1.7963817119598389, + "learning_rate": 5e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.723846435546875, + "num_tokens": 398197925.0, + "step": 15392 + }, + { + "epoch": 1.6904238963320888, + "grad_norm": 2.08825945854187, + "learning_rate": 5e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7457075715065002, + "num_tokens": 398219133.0, + "step": 15393 + }, + { + "epoch": 1.6905337140347023, + "grad_norm": 1.9254502058029175, + "learning_rate": 5e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7360869646072388, + "num_tokens": 398242455.0, + "step": 15394 + }, + { + "epoch": 1.690643531737316, + "grad_norm": 1.9361834526062012, + "learning_rate": 5e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7129195928573608, + "num_tokens": 398268370.0, + "step": 15395 + }, + { + "epoch": 1.6907533494399298, + "grad_norm": 2.035770893096924, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.743961751461029, + "num_tokens": 398289033.0, + "step": 15396 + }, + { + "epoch": 1.6908631671425434, + "grad_norm": 1.7090476751327515, + "learning_rate": 5e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7172516584396362, + "num_tokens": 398317549.0, + "step": 15397 + }, + { + "epoch": 1.690972984845157, + "grad_norm": 1.9106953144073486, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7287890911102295, + "num_tokens": 398344669.0, + "step": 15398 + }, + { + "epoch": 1.6910828025477707, + "grad_norm": 1.8382487297058105, + "learning_rate": 5e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7167412638664246, + "num_tokens": 398367738.0, + "step": 15399 + }, + { + "epoch": 1.6911926202503844, + "grad_norm": 1.7372400760650635, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7285571098327637, + "num_tokens": 398397429.0, + "step": 15400 + }, + { + "epoch": 1.6913024379529982, + "grad_norm": 1.7337766885757446, + "learning_rate": 5e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7303370237350464, + "num_tokens": 398424437.0, + "step": 15401 + }, + { + "epoch": 1.6914122556556117, + "grad_norm": 1.7119195461273193, + "learning_rate": 5e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7277789115905762, + "num_tokens": 398450639.0, + "step": 15402 + }, + { + "epoch": 1.6915220733582252, + "grad_norm": 1.9502995014190674, + "learning_rate": 5e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7229111790657043, + "num_tokens": 398473986.0, + "step": 15403 + }, + { + "epoch": 1.691631891060839, + "grad_norm": 1.8187763690948486, + "learning_rate": 5e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7418406009674072, + "num_tokens": 398500576.0, + "step": 15404 + }, + { + "epoch": 1.6917417087634528, + "grad_norm": 1.8647773265838623, + "learning_rate": 5e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7226712107658386, + "num_tokens": 398529457.0, + "step": 15405 + }, + { + "epoch": 1.6918515264660663, + "grad_norm": 2.0057220458984375, + "learning_rate": 5e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7418574094772339, + "num_tokens": 398551705.0, + "step": 15406 + }, + { + "epoch": 1.6919613441686798, + "grad_norm": 1.6301288604736328, + "learning_rate": 5e-06, + "loss": 0.942, + "mean_token_accuracy": 0.710390031337738, + "num_tokens": 398583222.0, + "step": 15407 + }, + { + "epoch": 1.6920711618712936, + "grad_norm": 1.8707374334335327, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7223262786865234, + "num_tokens": 398608196.0, + "step": 15408 + }, + { + "epoch": 1.6921809795739073, + "grad_norm": 1.9851840734481812, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7192891836166382, + "num_tokens": 398632234.0, + "step": 15409 + }, + { + "epoch": 1.692290797276521, + "grad_norm": 1.9428763389587402, + "learning_rate": 5e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7360923886299133, + "num_tokens": 398654456.0, + "step": 15410 + }, + { + "epoch": 1.6924006149791346, + "grad_norm": 1.6800448894500732, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7343694567680359, + "num_tokens": 398684086.0, + "step": 15411 + }, + { + "epoch": 1.6925104326817482, + "grad_norm": 1.705346941947937, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7429258227348328, + "num_tokens": 398710441.0, + "step": 15412 + }, + { + "epoch": 1.692620250384362, + "grad_norm": 1.8389140367507935, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7391178607940674, + "num_tokens": 398735051.0, + "step": 15413 + }, + { + "epoch": 1.6927300680869757, + "grad_norm": 1.6802716255187988, + "learning_rate": 5e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7404446601867676, + "num_tokens": 398763540.0, + "step": 15414 + }, + { + "epoch": 1.6928398857895894, + "grad_norm": 1.7437869310379028, + "learning_rate": 5e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7188817262649536, + "num_tokens": 398792882.0, + "step": 15415 + }, + { + "epoch": 1.692949703492203, + "grad_norm": 1.8090734481811523, + "learning_rate": 5e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7259210348129272, + "num_tokens": 398817659.0, + "step": 15416 + }, + { + "epoch": 1.6930595211948165, + "grad_norm": 2.1302740573883057, + "learning_rate": 5e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.707258939743042, + "num_tokens": 398838152.0, + "step": 15417 + }, + { + "epoch": 1.6931693388974303, + "grad_norm": 1.866170883178711, + "learning_rate": 5e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7203319668769836, + "num_tokens": 398865034.0, + "step": 15418 + }, + { + "epoch": 1.693279156600044, + "grad_norm": 2.121690273284912, + "learning_rate": 5e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7478703260421753, + "num_tokens": 398885389.0, + "step": 15419 + }, + { + "epoch": 1.6933889743026576, + "grad_norm": 1.827607274055481, + "learning_rate": 5e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.712389349937439, + "num_tokens": 398912580.0, + "step": 15420 + }, + { + "epoch": 1.693498792005271, + "grad_norm": 1.7374922037124634, + "learning_rate": 5e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7187516689300537, + "num_tokens": 398940631.0, + "step": 15421 + }, + { + "epoch": 1.6936086097078848, + "grad_norm": 2.040947198867798, + "learning_rate": 5e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7427250146865845, + "num_tokens": 398962430.0, + "step": 15422 + }, + { + "epoch": 1.6937184274104986, + "grad_norm": 1.7506457567214966, + "learning_rate": 5e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7405041456222534, + "num_tokens": 398989907.0, + "step": 15423 + }, + { + "epoch": 1.6938282451131124, + "grad_norm": 1.6180241107940674, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7218614816665649, + "num_tokens": 399022199.0, + "step": 15424 + }, + { + "epoch": 1.693938062815726, + "grad_norm": 1.70087468624115, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7371976971626282, + "num_tokens": 399049873.0, + "step": 15425 + }, + { + "epoch": 1.6940478805183394, + "grad_norm": 1.7271053791046143, + "learning_rate": 5e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.74009770154953, + "num_tokens": 399075101.0, + "step": 15426 + }, + { + "epoch": 1.6941576982209532, + "grad_norm": 1.9678183794021606, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.740715742111206, + "num_tokens": 399099000.0, + "step": 15427 + }, + { + "epoch": 1.694267515923567, + "grad_norm": 1.7606658935546875, + "learning_rate": 5e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7081820964813232, + "num_tokens": 399128636.0, + "step": 15428 + }, + { + "epoch": 1.6943773336261807, + "grad_norm": 1.7537370920181274, + "learning_rate": 5e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7179083824157715, + "num_tokens": 399157899.0, + "step": 15429 + }, + { + "epoch": 1.6944871513287942, + "grad_norm": 1.8172317743301392, + "learning_rate": 5e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7228041887283325, + "num_tokens": 399186037.0, + "step": 15430 + }, + { + "epoch": 1.6945969690314078, + "grad_norm": 1.6709799766540527, + "learning_rate": 5e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7200779318809509, + "num_tokens": 399217379.0, + "step": 15431 + }, + { + "epoch": 1.6947067867340215, + "grad_norm": 1.6239360570907593, + "learning_rate": 5e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7127720713615417, + "num_tokens": 399249007.0, + "step": 15432 + }, + { + "epoch": 1.6948166044366353, + "grad_norm": 1.9552327394485474, + "learning_rate": 5e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7289209961891174, + "num_tokens": 399273088.0, + "step": 15433 + }, + { + "epoch": 1.6949264221392488, + "grad_norm": 1.843213438987732, + "learning_rate": 5e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7401766180992126, + "num_tokens": 399297199.0, + "step": 15434 + }, + { + "epoch": 1.6950362398418624, + "grad_norm": 2.036757707595825, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7374654412269592, + "num_tokens": 399319325.0, + "step": 15435 + }, + { + "epoch": 1.6951460575444761, + "grad_norm": 1.7009592056274414, + "learning_rate": 5e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7071982026100159, + "num_tokens": 399352910.0, + "step": 15436 + }, + { + "epoch": 1.6952558752470899, + "grad_norm": 1.7997781038284302, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7307201623916626, + "num_tokens": 399377592.0, + "step": 15437 + }, + { + "epoch": 1.6953656929497036, + "grad_norm": 1.854246973991394, + "learning_rate": 5e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.738375186920166, + "num_tokens": 399402479.0, + "step": 15438 + }, + { + "epoch": 1.6954755106523172, + "grad_norm": 1.8220385313034058, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7248734831809998, + "num_tokens": 399427933.0, + "step": 15439 + }, + { + "epoch": 1.6955853283549307, + "grad_norm": 1.6801594495773315, + "learning_rate": 5e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7262007594108582, + "num_tokens": 399459131.0, + "step": 15440 + }, + { + "epoch": 1.6956951460575445, + "grad_norm": 1.9896241426467896, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7312684655189514, + "num_tokens": 399481877.0, + "step": 15441 + }, + { + "epoch": 1.6958049637601582, + "grad_norm": 1.881276249885559, + "learning_rate": 5e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6903283596038818, + "num_tokens": 399511459.0, + "step": 15442 + }, + { + "epoch": 1.6959147814627717, + "grad_norm": 1.5940616130828857, + "learning_rate": 5e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7326433658599854, + "num_tokens": 399544294.0, + "step": 15443 + }, + { + "epoch": 1.6960245991653855, + "grad_norm": 2.062650442123413, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7413201332092285, + "num_tokens": 399565939.0, + "step": 15444 + }, + { + "epoch": 1.696134416867999, + "grad_norm": 1.8478542566299438, + "learning_rate": 5e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7170361280441284, + "num_tokens": 399593420.0, + "step": 15445 + }, + { + "epoch": 1.6962442345706128, + "grad_norm": 1.8569016456604004, + "learning_rate": 5e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7482655644416809, + "num_tokens": 399617274.0, + "step": 15446 + }, + { + "epoch": 1.6963540522732266, + "grad_norm": 1.8670024871826172, + "learning_rate": 5e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7236159443855286, + "num_tokens": 399642553.0, + "step": 15447 + }, + { + "epoch": 1.69646386997584, + "grad_norm": 1.9744571447372437, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7229684591293335, + "num_tokens": 399666302.0, + "step": 15448 + }, + { + "epoch": 1.6965736876784536, + "grad_norm": 1.7474979162216187, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.726287841796875, + "num_tokens": 399694917.0, + "step": 15449 + }, + { + "epoch": 1.6966835053810674, + "grad_norm": 1.962963581085205, + "learning_rate": 5e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7207269668579102, + "num_tokens": 399718853.0, + "step": 15450 + }, + { + "epoch": 1.6967933230836811, + "grad_norm": 2.002718687057495, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7322155237197876, + "num_tokens": 399739839.0, + "step": 15451 + }, + { + "epoch": 1.696903140786295, + "grad_norm": 1.6154474020004272, + "learning_rate": 5e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7169287204742432, + "num_tokens": 399772145.0, + "step": 15452 + }, + { + "epoch": 1.6970129584889084, + "grad_norm": 1.7157236337661743, + "learning_rate": 5e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7341324090957642, + "num_tokens": 399800771.0, + "step": 15453 + }, + { + "epoch": 1.697122776191522, + "grad_norm": 1.8247826099395752, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7258688807487488, + "num_tokens": 399828397.0, + "step": 15454 + }, + { + "epoch": 1.6972325938941357, + "grad_norm": 1.6312607526779175, + "learning_rate": 5e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7412768602371216, + "num_tokens": 399856705.0, + "step": 15455 + }, + { + "epoch": 1.6973424115967495, + "grad_norm": 1.8245289325714111, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7173217535018921, + "num_tokens": 399884493.0, + "step": 15456 + }, + { + "epoch": 1.697452229299363, + "grad_norm": 1.7290507555007935, + "learning_rate": 5e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7259889245033264, + "num_tokens": 399913787.0, + "step": 15457 + }, + { + "epoch": 1.6975620470019768, + "grad_norm": 1.81126868724823, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7580046057701111, + "num_tokens": 399937117.0, + "step": 15458 + }, + { + "epoch": 1.6976718647045903, + "grad_norm": 1.827399492263794, + "learning_rate": 5e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7307074069976807, + "num_tokens": 399962332.0, + "step": 15459 + }, + { + "epoch": 1.697781682407204, + "grad_norm": 1.7974951267242432, + "learning_rate": 5e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.7599143385887146, + "num_tokens": 399986520.0, + "step": 15460 + }, + { + "epoch": 1.6978915001098178, + "grad_norm": 2.1071536540985107, + "learning_rate": 5e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7257778644561768, + "num_tokens": 400007164.0, + "step": 15461 + }, + { + "epoch": 1.6980013178124314, + "grad_norm": 1.6282670497894287, + "learning_rate": 5e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7297585010528564, + "num_tokens": 400038430.0, + "step": 15462 + }, + { + "epoch": 1.698111135515045, + "grad_norm": 1.6490957736968994, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7404925227165222, + "num_tokens": 400068396.0, + "step": 15463 + }, + { + "epoch": 1.6982209532176586, + "grad_norm": 1.9884512424468994, + "learning_rate": 5e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7274743318557739, + "num_tokens": 400092061.0, + "step": 15464 + }, + { + "epoch": 1.6983307709202724, + "grad_norm": 1.7535343170166016, + "learning_rate": 5e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7345798015594482, + "num_tokens": 400118132.0, + "step": 15465 + }, + { + "epoch": 1.6984405886228862, + "grad_norm": 1.8608753681182861, + "learning_rate": 5e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7229252457618713, + "num_tokens": 400143950.0, + "step": 15466 + }, + { + "epoch": 1.6985504063254997, + "grad_norm": 1.7440986633300781, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7164779901504517, + "num_tokens": 400175236.0, + "step": 15467 + }, + { + "epoch": 1.6986602240281132, + "grad_norm": 1.8698984384536743, + "learning_rate": 5e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7159409523010254, + "num_tokens": 400201257.0, + "step": 15468 + }, + { + "epoch": 1.698770041730727, + "grad_norm": 1.7639292478561401, + "learning_rate": 5e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7248878479003906, + "num_tokens": 400231011.0, + "step": 15469 + }, + { + "epoch": 1.6988798594333407, + "grad_norm": 1.7912806272506714, + "learning_rate": 5e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7213384509086609, + "num_tokens": 400259068.0, + "step": 15470 + }, + { + "epoch": 1.6989896771359543, + "grad_norm": 1.7533453702926636, + "learning_rate": 5e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7291346788406372, + "num_tokens": 400286568.0, + "step": 15471 + }, + { + "epoch": 1.6990994948385678, + "grad_norm": 1.8488560914993286, + "learning_rate": 5e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7283756732940674, + "num_tokens": 400311986.0, + "step": 15472 + }, + { + "epoch": 1.6992093125411816, + "grad_norm": 1.7100659608840942, + "learning_rate": 5e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7333213090896606, + "num_tokens": 400338919.0, + "step": 15473 + }, + { + "epoch": 1.6993191302437953, + "grad_norm": 1.8974640369415283, + "learning_rate": 5e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7449972033500671, + "num_tokens": 400361957.0, + "step": 15474 + }, + { + "epoch": 1.699428947946409, + "grad_norm": 1.8281031847000122, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7301164865493774, + "num_tokens": 400387198.0, + "step": 15475 + }, + { + "epoch": 1.6995387656490226, + "grad_norm": 1.7599577903747559, + "learning_rate": 5e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7092649340629578, + "num_tokens": 400415686.0, + "step": 15476 + }, + { + "epoch": 1.6996485833516362, + "grad_norm": 1.8419896364212036, + "learning_rate": 5e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7389798164367676, + "num_tokens": 400442607.0, + "step": 15477 + }, + { + "epoch": 1.69975840105425, + "grad_norm": 1.7041572332382202, + "learning_rate": 5e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7316591739654541, + "num_tokens": 400471673.0, + "step": 15478 + }, + { + "epoch": 1.6998682187568637, + "grad_norm": 2.0723352432250977, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7295634746551514, + "num_tokens": 400491778.0, + "step": 15479 + }, + { + "epoch": 1.6999780364594774, + "grad_norm": 1.7874599695205688, + "learning_rate": 5e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7276790142059326, + "num_tokens": 400519346.0, + "step": 15480 + }, + { + "epoch": 1.700087854162091, + "grad_norm": 1.922710657119751, + "learning_rate": 5e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7575941681861877, + "num_tokens": 400541047.0, + "step": 15481 + }, + { + "epoch": 1.7001976718647045, + "grad_norm": 2.1080482006073, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7365901470184326, + "num_tokens": 400562056.0, + "step": 15482 + }, + { + "epoch": 1.7003074895673183, + "grad_norm": 1.721914291381836, + "learning_rate": 5e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7387226819992065, + "num_tokens": 400586726.0, + "step": 15483 + }, + { + "epoch": 1.700417307269932, + "grad_norm": 1.8090434074401855, + "learning_rate": 5e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.73311448097229, + "num_tokens": 400613338.0, + "step": 15484 + }, + { + "epoch": 1.7005271249725455, + "grad_norm": 2.145260810852051, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7325012683868408, + "num_tokens": 400633117.0, + "step": 15485 + }, + { + "epoch": 1.700636942675159, + "grad_norm": 1.776231050491333, + "learning_rate": 5e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.718731701374054, + "num_tokens": 400661522.0, + "step": 15486 + }, + { + "epoch": 1.7007467603777728, + "grad_norm": 1.7149590253829956, + "learning_rate": 5e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7334485650062561, + "num_tokens": 400690050.0, + "step": 15487 + }, + { + "epoch": 1.7008565780803866, + "grad_norm": 1.6210070848464966, + "learning_rate": 5e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7146978974342346, + "num_tokens": 400721666.0, + "step": 15488 + }, + { + "epoch": 1.7009663957830004, + "grad_norm": 1.8719193935394287, + "learning_rate": 5e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7209867835044861, + "num_tokens": 400746645.0, + "step": 15489 + }, + { + "epoch": 1.7010762134856139, + "grad_norm": 1.8535337448120117, + "learning_rate": 5e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.711693525314331, + "num_tokens": 400772540.0, + "step": 15490 + }, + { + "epoch": 1.7011860311882274, + "grad_norm": 1.864865779876709, + "learning_rate": 5e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7442365884780884, + "num_tokens": 400798217.0, + "step": 15491 + }, + { + "epoch": 1.7012958488908412, + "grad_norm": 1.77541184425354, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7409838438034058, + "num_tokens": 400824855.0, + "step": 15492 + }, + { + "epoch": 1.701405666593455, + "grad_norm": 1.9655941724777222, + "learning_rate": 5e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7499145269393921, + "num_tokens": 400847288.0, + "step": 15493 + }, + { + "epoch": 1.7015154842960687, + "grad_norm": 1.86173677444458, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7144546508789062, + "num_tokens": 400875082.0, + "step": 15494 + }, + { + "epoch": 1.7016253019986822, + "grad_norm": 1.7220515012741089, + "learning_rate": 5e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7016201615333557, + "num_tokens": 400905764.0, + "step": 15495 + }, + { + "epoch": 1.7017351197012958, + "grad_norm": 1.914889931678772, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7278515696525574, + "num_tokens": 400930685.0, + "step": 15496 + }, + { + "epoch": 1.7018449374039095, + "grad_norm": 1.6952394247055054, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7446829080581665, + "num_tokens": 400957902.0, + "step": 15497 + }, + { + "epoch": 1.7019547551065233, + "grad_norm": 1.9523696899414062, + "learning_rate": 5e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7301538586616516, + "num_tokens": 400981331.0, + "step": 15498 + }, + { + "epoch": 1.7020645728091368, + "grad_norm": 1.9550412893295288, + "learning_rate": 5e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7477266192436218, + "num_tokens": 401003310.0, + "step": 15499 + }, + { + "epoch": 1.7021743905117503, + "grad_norm": 1.823309302330017, + "learning_rate": 5e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.741911768913269, + "num_tokens": 401026921.0, + "step": 15500 + }, + { + "epoch": 1.702284208214364, + "grad_norm": 1.9345316886901855, + "learning_rate": 5e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7373572587966919, + "num_tokens": 401049680.0, + "step": 15501 + }, + { + "epoch": 1.7023940259169779, + "grad_norm": 1.9438351392745972, + "learning_rate": 5e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7260454893112183, + "num_tokens": 401075301.0, + "step": 15502 + }, + { + "epoch": 1.7025038436195916, + "grad_norm": 1.9299074411392212, + "learning_rate": 5e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.733834445476532, + "num_tokens": 401099241.0, + "step": 15503 + }, + { + "epoch": 1.7026136613222052, + "grad_norm": 1.7570964097976685, + "learning_rate": 5e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7278603315353394, + "num_tokens": 401126264.0, + "step": 15504 + }, + { + "epoch": 1.7027234790248187, + "grad_norm": 1.8931573629379272, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7224360704421997, + "num_tokens": 401152566.0, + "step": 15505 + }, + { + "epoch": 1.7028332967274324, + "grad_norm": 1.7948020696640015, + "learning_rate": 5e-06, + "loss": 0.849, + "mean_token_accuracy": 0.733296275138855, + "num_tokens": 401178635.0, + "step": 15506 + }, + { + "epoch": 1.7029431144300462, + "grad_norm": 1.8624211549758911, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7125031352043152, + "num_tokens": 401204438.0, + "step": 15507 + }, + { + "epoch": 1.7030529321326597, + "grad_norm": 2.1225924491882324, + "learning_rate": 5e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7673755884170532, + "num_tokens": 401224316.0, + "step": 15508 + }, + { + "epoch": 1.7031627498352735, + "grad_norm": 2.0043442249298096, + "learning_rate": 5e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7376325130462646, + "num_tokens": 401245337.0, + "step": 15509 + }, + { + "epoch": 1.703272567537887, + "grad_norm": 1.8348171710968018, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7124937772750854, + "num_tokens": 401273035.0, + "step": 15510 + }, + { + "epoch": 1.7033823852405008, + "grad_norm": 1.7572391033172607, + "learning_rate": 5e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7248237133026123, + "num_tokens": 401301837.0, + "step": 15511 + }, + { + "epoch": 1.7034922029431145, + "grad_norm": 2.002375364303589, + "learning_rate": 5e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7167391180992126, + "num_tokens": 401326520.0, + "step": 15512 + }, + { + "epoch": 1.703602020645728, + "grad_norm": 1.6608242988586426, + "learning_rate": 5e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.7488478422164917, + "num_tokens": 401355365.0, + "step": 15513 + }, + { + "epoch": 1.7037118383483416, + "grad_norm": 1.783282995223999, + "learning_rate": 5e-06, + "loss": 0.855, + "mean_token_accuracy": 0.733460009098053, + "num_tokens": 401381092.0, + "step": 15514 + }, + { + "epoch": 1.7038216560509554, + "grad_norm": 1.9659637212753296, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7342445254325867, + "num_tokens": 401404223.0, + "step": 15515 + }, + { + "epoch": 1.7039314737535691, + "grad_norm": 2.1167452335357666, + "learning_rate": 5e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7440893054008484, + "num_tokens": 401424835.0, + "step": 15516 + }, + { + "epoch": 1.7040412914561829, + "grad_norm": 1.9897382259368896, + "learning_rate": 5e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7377541065216064, + "num_tokens": 401447946.0, + "step": 15517 + }, + { + "epoch": 1.7041511091587964, + "grad_norm": 1.8934451341629028, + "learning_rate": 5e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7487420439720154, + "num_tokens": 401471898.0, + "step": 15518 + }, + { + "epoch": 1.70426092686141, + "grad_norm": 1.7187676429748535, + "learning_rate": 5e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7654802203178406, + "num_tokens": 401496697.0, + "step": 15519 + }, + { + "epoch": 1.7043707445640237, + "grad_norm": 1.8681094646453857, + "learning_rate": 5e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7285727262496948, + "num_tokens": 401521008.0, + "step": 15520 + }, + { + "epoch": 1.7044805622666375, + "grad_norm": 2.019408702850342, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7285972833633423, + "num_tokens": 401543394.0, + "step": 15521 + }, + { + "epoch": 1.704590379969251, + "grad_norm": 1.878218173980713, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7151570320129395, + "num_tokens": 401571742.0, + "step": 15522 + }, + { + "epoch": 1.7047001976718648, + "grad_norm": 1.6619635820388794, + "learning_rate": 5e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7247233390808105, + "num_tokens": 401602946.0, + "step": 15523 + }, + { + "epoch": 1.7048100153744783, + "grad_norm": 1.8632453680038452, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7193419933319092, + "num_tokens": 401628095.0, + "step": 15524 + }, + { + "epoch": 1.704919833077092, + "grad_norm": 1.8338614702224731, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7341444492340088, + "num_tokens": 401653268.0, + "step": 15525 + }, + { + "epoch": 1.7050296507797058, + "grad_norm": 1.8840973377227783, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7313047647476196, + "num_tokens": 401677427.0, + "step": 15526 + }, + { + "epoch": 1.7051394684823193, + "grad_norm": 1.7945879697799683, + "learning_rate": 5e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7291808128356934, + "num_tokens": 401702718.0, + "step": 15527 + }, + { + "epoch": 1.7052492861849329, + "grad_norm": 1.6459232568740845, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7127665281295776, + "num_tokens": 401733782.0, + "step": 15528 + }, + { + "epoch": 1.7053591038875466, + "grad_norm": 1.7692347764968872, + "learning_rate": 5e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7357239723205566, + "num_tokens": 401762227.0, + "step": 15529 + }, + { + "epoch": 1.7054689215901604, + "grad_norm": 2.0165867805480957, + "learning_rate": 5e-06, + "loss": 0.78, + "mean_token_accuracy": 0.7460787296295166, + "num_tokens": 401781836.0, + "step": 15530 + }, + { + "epoch": 1.7055787392927741, + "grad_norm": 1.8346285820007324, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.744735598564148, + "num_tokens": 401806082.0, + "step": 15531 + }, + { + "epoch": 1.7056885569953877, + "grad_norm": 1.8193106651306152, + "learning_rate": 5e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7178338170051575, + "num_tokens": 401832729.0, + "step": 15532 + }, + { + "epoch": 1.7057983746980012, + "grad_norm": 1.8278882503509521, + "learning_rate": 5e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7399111986160278, + "num_tokens": 401857116.0, + "step": 15533 + }, + { + "epoch": 1.705908192400615, + "grad_norm": 1.9579229354858398, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7301532030105591, + "num_tokens": 401882167.0, + "step": 15534 + }, + { + "epoch": 1.7060180101032287, + "grad_norm": 1.9053218364715576, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7272859811782837, + "num_tokens": 401906144.0, + "step": 15535 + }, + { + "epoch": 1.7061278278058423, + "grad_norm": 1.6721562147140503, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7415904998779297, + "num_tokens": 401935841.0, + "step": 15536 + }, + { + "epoch": 1.7062376455084558, + "grad_norm": 2.081411838531494, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7444807887077332, + "num_tokens": 401954987.0, + "step": 15537 + }, + { + "epoch": 1.7063474632110696, + "grad_norm": 1.860446810722351, + "learning_rate": 5e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7389249801635742, + "num_tokens": 401979243.0, + "step": 15538 + }, + { + "epoch": 1.7064572809136833, + "grad_norm": 1.8742446899414062, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.740321695804596, + "num_tokens": 402002988.0, + "step": 15539 + }, + { + "epoch": 1.706567098616297, + "grad_norm": 1.7692466974258423, + "learning_rate": 5e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.720305323600769, + "num_tokens": 402029436.0, + "step": 15540 + }, + { + "epoch": 1.7066769163189106, + "grad_norm": 1.7139190435409546, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7177962064743042, + "num_tokens": 402058437.0, + "step": 15541 + }, + { + "epoch": 1.7067867340215241, + "grad_norm": 1.7765400409698486, + "learning_rate": 5e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7151546478271484, + "num_tokens": 402086491.0, + "step": 15542 + }, + { + "epoch": 1.706896551724138, + "grad_norm": 2.1404306888580322, + "learning_rate": 5e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7517312169075012, + "num_tokens": 402106254.0, + "step": 15543 + }, + { + "epoch": 1.7070063694267517, + "grad_norm": 2.060403347015381, + "learning_rate": 5e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7420880794525146, + "num_tokens": 402127732.0, + "step": 15544 + }, + { + "epoch": 1.7071161871293654, + "grad_norm": 1.8073325157165527, + "learning_rate": 5e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.719286322593689, + "num_tokens": 402155653.0, + "step": 15545 + }, + { + "epoch": 1.707226004831979, + "grad_norm": 1.7921314239501953, + "learning_rate": 5e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7510428428649902, + "num_tokens": 402185338.0, + "step": 15546 + }, + { + "epoch": 1.7073358225345925, + "grad_norm": 2.088719367980957, + "learning_rate": 5e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7230120897293091, + "num_tokens": 402208144.0, + "step": 15547 + }, + { + "epoch": 1.7074456402372062, + "grad_norm": 1.7653584480285645, + "learning_rate": 5e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7343990802764893, + "num_tokens": 402235870.0, + "step": 15548 + }, + { + "epoch": 1.70755545793982, + "grad_norm": 1.7765724658966064, + "learning_rate": 5e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7478671073913574, + "num_tokens": 402259054.0, + "step": 15549 + }, + { + "epoch": 1.7076652756424335, + "grad_norm": 1.8189047574996948, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7224029302597046, + "num_tokens": 402284851.0, + "step": 15550 + }, + { + "epoch": 1.707775093345047, + "grad_norm": 1.7395265102386475, + "learning_rate": 5e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.743246853351593, + "num_tokens": 402311148.0, + "step": 15551 + }, + { + "epoch": 1.7078849110476608, + "grad_norm": 1.6819894313812256, + "learning_rate": 5e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.715366542339325, + "num_tokens": 402342002.0, + "step": 15552 + }, + { + "epoch": 1.7079947287502746, + "grad_norm": 2.1039037704467773, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7313729524612427, + "num_tokens": 402362334.0, + "step": 15553 + }, + { + "epoch": 1.7081045464528883, + "grad_norm": 2.205282688140869, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7339208126068115, + "num_tokens": 402381011.0, + "step": 15554 + }, + { + "epoch": 1.7082143641555019, + "grad_norm": 1.591919183731079, + "learning_rate": 5e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7350936532020569, + "num_tokens": 402412557.0, + "step": 15555 + }, + { + "epoch": 1.7083241818581154, + "grad_norm": 1.724012851715088, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7186427116394043, + "num_tokens": 402443000.0, + "step": 15556 + }, + { + "epoch": 1.7084339995607292, + "grad_norm": 1.672619104385376, + "learning_rate": 5e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7167017459869385, + "num_tokens": 402471402.0, + "step": 15557 + }, + { + "epoch": 1.708543817263343, + "grad_norm": 1.7348558902740479, + "learning_rate": 5e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7314496040344238, + "num_tokens": 402498260.0, + "step": 15558 + }, + { + "epoch": 1.7086536349659565, + "grad_norm": 1.824140191078186, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7150643467903137, + "num_tokens": 402525851.0, + "step": 15559 + }, + { + "epoch": 1.7087634526685702, + "grad_norm": 1.9753891229629517, + "learning_rate": 5e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7401865720748901, + "num_tokens": 402549042.0, + "step": 15560 + }, + { + "epoch": 1.7088732703711838, + "grad_norm": 2.0284080505371094, + "learning_rate": 5e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7393167018890381, + "num_tokens": 402570447.0, + "step": 15561 + }, + { + "epoch": 1.7089830880737975, + "grad_norm": 1.62384033203125, + "learning_rate": 5e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7180576324462891, + "num_tokens": 402601518.0, + "step": 15562 + }, + { + "epoch": 1.7090929057764113, + "grad_norm": 1.9951215982437134, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7332754731178284, + "num_tokens": 402624808.0, + "step": 15563 + }, + { + "epoch": 1.7092027234790248, + "grad_norm": 1.6687052249908447, + "learning_rate": 5e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7344403862953186, + "num_tokens": 402653219.0, + "step": 15564 + }, + { + "epoch": 1.7093125411816383, + "grad_norm": 1.7378169298171997, + "learning_rate": 5e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7032510042190552, + "num_tokens": 402682656.0, + "step": 15565 + }, + { + "epoch": 1.709422358884252, + "grad_norm": 1.9272547960281372, + "learning_rate": 5e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7333040237426758, + "num_tokens": 402706477.0, + "step": 15566 + }, + { + "epoch": 1.7095321765868658, + "grad_norm": 1.809051275253296, + "learning_rate": 5e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7364912629127502, + "num_tokens": 402734565.0, + "step": 15567 + }, + { + "epoch": 1.7096419942894796, + "grad_norm": 1.818973183631897, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.738558292388916, + "num_tokens": 402761587.0, + "step": 15568 + }, + { + "epoch": 1.7097518119920931, + "grad_norm": 1.7484805583953857, + "learning_rate": 5e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7070348262786865, + "num_tokens": 402787443.0, + "step": 15569 + }, + { + "epoch": 1.7098616296947067, + "grad_norm": 1.8011306524276733, + "learning_rate": 5e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7140734195709229, + "num_tokens": 402816257.0, + "step": 15570 + }, + { + "epoch": 1.7099714473973204, + "grad_norm": 1.8290565013885498, + "learning_rate": 5e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7344413995742798, + "num_tokens": 402841589.0, + "step": 15571 + }, + { + "epoch": 1.7100812650999342, + "grad_norm": 2.21256685256958, + "learning_rate": 5e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7568846940994263, + "num_tokens": 402860397.0, + "step": 15572 + }, + { + "epoch": 1.7101910828025477, + "grad_norm": 1.7899315357208252, + "learning_rate": 5e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7380799055099487, + "num_tokens": 402884952.0, + "step": 15573 + }, + { + "epoch": 1.7103009005051615, + "grad_norm": 1.5875593423843384, + "learning_rate": 5e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.711624026298523, + "num_tokens": 402918680.0, + "step": 15574 + }, + { + "epoch": 1.710410718207775, + "grad_norm": 1.719709873199463, + "learning_rate": 5e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7366428375244141, + "num_tokens": 402945228.0, + "step": 15575 + }, + { + "epoch": 1.7105205359103888, + "grad_norm": 2.0259344577789307, + "learning_rate": 5e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7389388084411621, + "num_tokens": 402966377.0, + "step": 15576 + }, + { + "epoch": 1.7106303536130025, + "grad_norm": 1.9113699197769165, + "learning_rate": 5e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7055677175521851, + "num_tokens": 402991698.0, + "step": 15577 + }, + { + "epoch": 1.710740171315616, + "grad_norm": 2.049982786178589, + "learning_rate": 5e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7337662577629089, + "num_tokens": 403014152.0, + "step": 15578 + }, + { + "epoch": 1.7108499890182296, + "grad_norm": 1.8079596757888794, + "learning_rate": 5e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7120013236999512, + "num_tokens": 403040000.0, + "step": 15579 + }, + { + "epoch": 1.7109598067208434, + "grad_norm": 1.8203909397125244, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.761900782585144, + "num_tokens": 403064288.0, + "step": 15580 + }, + { + "epoch": 1.7110696244234571, + "grad_norm": 1.8124240636825562, + "learning_rate": 5e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7154387831687927, + "num_tokens": 403092537.0, + "step": 15581 + }, + { + "epoch": 1.7111794421260709, + "grad_norm": 1.8860130310058594, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7454496622085571, + "num_tokens": 403114553.0, + "step": 15582 + }, + { + "epoch": 1.7112892598286844, + "grad_norm": 2.0182950496673584, + "learning_rate": 5e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7457444071769714, + "num_tokens": 403134367.0, + "step": 15583 + }, + { + "epoch": 1.711399077531298, + "grad_norm": 2.0868289470672607, + "learning_rate": 5e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7232892513275146, + "num_tokens": 403154167.0, + "step": 15584 + }, + { + "epoch": 1.7115088952339117, + "grad_norm": 1.824105978012085, + "learning_rate": 5e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.719887375831604, + "num_tokens": 403180916.0, + "step": 15585 + }, + { + "epoch": 1.7116187129365255, + "grad_norm": 1.9042167663574219, + "learning_rate": 5e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7483460903167725, + "num_tokens": 403204278.0, + "step": 15586 + }, + { + "epoch": 1.711728530639139, + "grad_norm": 1.8421485424041748, + "learning_rate": 5e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7223443984985352, + "num_tokens": 403229785.0, + "step": 15587 + }, + { + "epoch": 1.7118383483417525, + "grad_norm": 1.7814265489578247, + "learning_rate": 5e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7364983558654785, + "num_tokens": 403256113.0, + "step": 15588 + }, + { + "epoch": 1.7119481660443663, + "grad_norm": 1.7414696216583252, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.773187518119812, + "num_tokens": 403282294.0, + "step": 15589 + }, + { + "epoch": 1.71205798374698, + "grad_norm": 1.9098149538040161, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7459717988967896, + "num_tokens": 403304744.0, + "step": 15590 + }, + { + "epoch": 1.7121678014495938, + "grad_norm": 1.9181965589523315, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7297912836074829, + "num_tokens": 403327453.0, + "step": 15591 + }, + { + "epoch": 1.7122776191522073, + "grad_norm": 1.9331717491149902, + "learning_rate": 5e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7354015707969666, + "num_tokens": 403350440.0, + "step": 15592 + }, + { + "epoch": 1.7123874368548209, + "grad_norm": 1.9592949151992798, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7270220518112183, + "num_tokens": 403374039.0, + "step": 15593 + }, + { + "epoch": 1.7124972545574346, + "grad_norm": 1.700563907623291, + "learning_rate": 5e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7575066089630127, + "num_tokens": 403402420.0, + "step": 15594 + }, + { + "epoch": 1.7126070722600484, + "grad_norm": 1.710548758506775, + "learning_rate": 5e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.734375536441803, + "num_tokens": 403431873.0, + "step": 15595 + }, + { + "epoch": 1.7127168899626621, + "grad_norm": 1.9837747812271118, + "learning_rate": 5e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7364009022712708, + "num_tokens": 403453018.0, + "step": 15596 + }, + { + "epoch": 1.7128267076652757, + "grad_norm": 1.8261326551437378, + "learning_rate": 5e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7157519459724426, + "num_tokens": 403479927.0, + "step": 15597 + }, + { + "epoch": 1.7129365253678892, + "grad_norm": 1.9137983322143555, + "learning_rate": 5e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7447783946990967, + "num_tokens": 403504506.0, + "step": 15598 + }, + { + "epoch": 1.713046343070503, + "grad_norm": 1.9748646020889282, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7372257113456726, + "num_tokens": 403531222.0, + "step": 15599 + }, + { + "epoch": 1.7131561607731167, + "grad_norm": 1.8478070497512817, + "learning_rate": 5e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7420859336853027, + "num_tokens": 403555292.0, + "step": 15600 + }, + { + "epoch": 1.7132659784757303, + "grad_norm": 1.8845816850662231, + "learning_rate": 5e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7480867505073547, + "num_tokens": 403580320.0, + "step": 15601 + }, + { + "epoch": 1.7133757961783438, + "grad_norm": 1.8927875757217407, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7189427614212036, + "num_tokens": 403605369.0, + "step": 15602 + }, + { + "epoch": 1.7134856138809575, + "grad_norm": 1.735060691833496, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.745531439781189, + "num_tokens": 403632144.0, + "step": 15603 + }, + { + "epoch": 1.7135954315835713, + "grad_norm": 1.6987507343292236, + "learning_rate": 5e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7477996349334717, + "num_tokens": 403660645.0, + "step": 15604 + }, + { + "epoch": 1.713705249286185, + "grad_norm": 1.825135588645935, + "learning_rate": 5e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7385931015014648, + "num_tokens": 403687877.0, + "step": 15605 + }, + { + "epoch": 1.7138150669887986, + "grad_norm": 2.0119407176971436, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7300401926040649, + "num_tokens": 403708199.0, + "step": 15606 + }, + { + "epoch": 1.7139248846914121, + "grad_norm": 1.882293939590454, + "learning_rate": 5e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7332581877708435, + "num_tokens": 403733445.0, + "step": 15607 + }, + { + "epoch": 1.7140347023940259, + "grad_norm": 2.0769388675689697, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7457902431488037, + "num_tokens": 403754335.0, + "step": 15608 + }, + { + "epoch": 1.7141445200966396, + "grad_norm": 1.795047640800476, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7410712242126465, + "num_tokens": 403779991.0, + "step": 15609 + }, + { + "epoch": 1.7142543377992534, + "grad_norm": 1.8703669309616089, + "learning_rate": 5e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7368246912956238, + "num_tokens": 403805279.0, + "step": 15610 + }, + { + "epoch": 1.714364155501867, + "grad_norm": 1.871694803237915, + "learning_rate": 5e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7468655109405518, + "num_tokens": 403827691.0, + "step": 15611 + }, + { + "epoch": 1.7144739732044805, + "grad_norm": 1.7327184677124023, + "learning_rate": 5e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7183146476745605, + "num_tokens": 403857438.0, + "step": 15612 + }, + { + "epoch": 1.7145837909070942, + "grad_norm": 1.9932876825332642, + "learning_rate": 5e-06, + "loss": 0.7905, + "mean_token_accuracy": 0.7455440163612366, + "num_tokens": 403877045.0, + "step": 15613 + }, + { + "epoch": 1.714693608609708, + "grad_norm": 1.9827245473861694, + "learning_rate": 5e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7423147559165955, + "num_tokens": 403898545.0, + "step": 15614 + }, + { + "epoch": 1.7148034263123215, + "grad_norm": 1.9061864614486694, + "learning_rate": 5e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7495971918106079, + "num_tokens": 403919356.0, + "step": 15615 + }, + { + "epoch": 1.714913244014935, + "grad_norm": 1.7771215438842773, + "learning_rate": 5e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7249091267585754, + "num_tokens": 403947256.0, + "step": 15616 + }, + { + "epoch": 1.7150230617175488, + "grad_norm": 1.677232265472412, + "learning_rate": 5e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7178304195404053, + "num_tokens": 403976226.0, + "step": 15617 + }, + { + "epoch": 1.7151328794201626, + "grad_norm": 1.71206796169281, + "learning_rate": 5e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7232584953308105, + "num_tokens": 404005429.0, + "step": 15618 + }, + { + "epoch": 1.7152426971227763, + "grad_norm": 1.6980384588241577, + "learning_rate": 5e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.71910560131073, + "num_tokens": 404032477.0, + "step": 15619 + }, + { + "epoch": 1.7153525148253899, + "grad_norm": 1.8608200550079346, + "learning_rate": 5e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7157646417617798, + "num_tokens": 404057624.0, + "step": 15620 + }, + { + "epoch": 1.7154623325280034, + "grad_norm": 1.6477540731430054, + "learning_rate": 5e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.7351015210151672, + "num_tokens": 404084552.0, + "step": 15621 + }, + { + "epoch": 1.7155721502306172, + "grad_norm": 1.7987498044967651, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7341734170913696, + "num_tokens": 404110284.0, + "step": 15622 + }, + { + "epoch": 1.715681967933231, + "grad_norm": 1.5772366523742676, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.721031665802002, + "num_tokens": 404142309.0, + "step": 15623 + }, + { + "epoch": 1.7157917856358444, + "grad_norm": 1.8848768472671509, + "learning_rate": 5e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7592435479164124, + "num_tokens": 404165560.0, + "step": 15624 + }, + { + "epoch": 1.7159016033384582, + "grad_norm": 1.7335765361785889, + "learning_rate": 5e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7230044603347778, + "num_tokens": 404194486.0, + "step": 15625 + }, + { + "epoch": 1.7160114210410717, + "grad_norm": 1.8515561819076538, + "learning_rate": 5e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.725853443145752, + "num_tokens": 404220378.0, + "step": 15626 + }, + { + "epoch": 1.7161212387436855, + "grad_norm": 1.7005183696746826, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.715606153011322, + "num_tokens": 404248539.0, + "step": 15627 + }, + { + "epoch": 1.7162310564462993, + "grad_norm": 1.7622748613357544, + "learning_rate": 5e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7263048887252808, + "num_tokens": 404275539.0, + "step": 15628 + }, + { + "epoch": 1.7163408741489128, + "grad_norm": 1.662919282913208, + "learning_rate": 5e-06, + "loss": 0.794, + "mean_token_accuracy": 0.7455366849899292, + "num_tokens": 404303469.0, + "step": 15629 + }, + { + "epoch": 1.7164506918515263, + "grad_norm": 2.2466626167297363, + "learning_rate": 5e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7517795562744141, + "num_tokens": 404321281.0, + "step": 15630 + }, + { + "epoch": 1.71656050955414, + "grad_norm": 1.983551263809204, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.725466787815094, + "num_tokens": 404344449.0, + "step": 15631 + }, + { + "epoch": 1.7166703272567538, + "grad_norm": 1.7746278047561646, + "learning_rate": 5e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7295748591423035, + "num_tokens": 404371017.0, + "step": 15632 + }, + { + "epoch": 1.7167801449593676, + "grad_norm": 2.1223342418670654, + "learning_rate": 5e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7414517402648926, + "num_tokens": 404391537.0, + "step": 15633 + }, + { + "epoch": 1.7168899626619811, + "grad_norm": 1.790424108505249, + "learning_rate": 5e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7308975458145142, + "num_tokens": 404418354.0, + "step": 15634 + }, + { + "epoch": 1.7169997803645947, + "grad_norm": 1.6743563413619995, + "learning_rate": 5e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7398646473884583, + "num_tokens": 404447462.0, + "step": 15635 + }, + { + "epoch": 1.7171095980672084, + "grad_norm": 1.831363558769226, + "learning_rate": 5e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7343722581863403, + "num_tokens": 404472860.0, + "step": 15636 + }, + { + "epoch": 1.7172194157698222, + "grad_norm": 1.8453130722045898, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7349357008934021, + "num_tokens": 404499019.0, + "step": 15637 + }, + { + "epoch": 1.7173292334724357, + "grad_norm": 1.7030428647994995, + "learning_rate": 5e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.714577317237854, + "num_tokens": 404528504.0, + "step": 15638 + }, + { + "epoch": 1.7174390511750495, + "grad_norm": 1.8035134077072144, + "learning_rate": 5e-06, + "loss": 0.931, + "mean_token_accuracy": 0.6980198621749878, + "num_tokens": 404555580.0, + "step": 15639 + }, + { + "epoch": 1.717548868877663, + "grad_norm": 1.8807843923568726, + "learning_rate": 5e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7601221799850464, + "num_tokens": 404578189.0, + "step": 15640 + }, + { + "epoch": 1.7176586865802768, + "grad_norm": 1.8529748916625977, + "learning_rate": 5e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7380066514015198, + "num_tokens": 404602057.0, + "step": 15641 + }, + { + "epoch": 1.7177685042828905, + "grad_norm": 1.8054356575012207, + "learning_rate": 5e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7650294303894043, + "num_tokens": 404625125.0, + "step": 15642 + }, + { + "epoch": 1.717878321985504, + "grad_norm": 1.8526889085769653, + "learning_rate": 5e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.719813346862793, + "num_tokens": 404649030.0, + "step": 15643 + }, + { + "epoch": 1.7179881396881176, + "grad_norm": 2.2548673152923584, + "learning_rate": 5e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7407753467559814, + "num_tokens": 404667725.0, + "step": 15644 + }, + { + "epoch": 1.7180979573907313, + "grad_norm": 1.8904120922088623, + "learning_rate": 5e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7347956299781799, + "num_tokens": 404690221.0, + "step": 15645 + }, + { + "epoch": 1.718207775093345, + "grad_norm": 2.080406427383423, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7442132234573364, + "num_tokens": 404711480.0, + "step": 15646 + }, + { + "epoch": 1.7183175927959589, + "grad_norm": 1.7401258945465088, + "learning_rate": 5e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7089827060699463, + "num_tokens": 404739597.0, + "step": 15647 + }, + { + "epoch": 1.7184274104985724, + "grad_norm": 1.6377924680709839, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7286381721496582, + "num_tokens": 404770167.0, + "step": 15648 + }, + { + "epoch": 1.718537228201186, + "grad_norm": 1.6876192092895508, + "learning_rate": 5e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7414613962173462, + "num_tokens": 404800046.0, + "step": 15649 + }, + { + "epoch": 1.7186470459037997, + "grad_norm": 1.7319984436035156, + "learning_rate": 5e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7136217951774597, + "num_tokens": 404829643.0, + "step": 15650 + }, + { + "epoch": 1.7187568636064134, + "grad_norm": 1.8262741565704346, + "learning_rate": 5e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7464751601219177, + "num_tokens": 404852035.0, + "step": 15651 + }, + { + "epoch": 1.718866681309027, + "grad_norm": 1.6349365711212158, + "learning_rate": 5e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7184930443763733, + "num_tokens": 404883045.0, + "step": 15652 + }, + { + "epoch": 1.7189764990116405, + "grad_norm": 1.7109925746917725, + "learning_rate": 5e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7231096029281616, + "num_tokens": 404911243.0, + "step": 15653 + }, + { + "epoch": 1.7190863167142543, + "grad_norm": 1.8434284925460815, + "learning_rate": 5e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7246224880218506, + "num_tokens": 404936531.0, + "step": 15654 + }, + { + "epoch": 1.719196134416868, + "grad_norm": 1.7152531147003174, + "learning_rate": 5e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7463984489440918, + "num_tokens": 404963408.0, + "step": 15655 + }, + { + "epoch": 1.7193059521194818, + "grad_norm": 1.876249074935913, + "learning_rate": 5e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7396560907363892, + "num_tokens": 404988085.0, + "step": 15656 + }, + { + "epoch": 1.7194157698220953, + "grad_norm": 1.7413872480392456, + "learning_rate": 5e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7105256915092468, + "num_tokens": 405018249.0, + "step": 15657 + }, + { + "epoch": 1.7195255875247089, + "grad_norm": 1.9360864162445068, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7472262382507324, + "num_tokens": 405039723.0, + "step": 15658 + }, + { + "epoch": 1.7196354052273226, + "grad_norm": 1.9965344667434692, + "learning_rate": 5e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7215844392776489, + "num_tokens": 405062852.0, + "step": 15659 + }, + { + "epoch": 1.7197452229299364, + "grad_norm": 1.7820460796356201, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.733125627040863, + "num_tokens": 405090345.0, + "step": 15660 + }, + { + "epoch": 1.7198550406325501, + "grad_norm": 1.6237139701843262, + "learning_rate": 5e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7118933796882629, + "num_tokens": 405119853.0, + "step": 15661 + }, + { + "epoch": 1.7199648583351637, + "grad_norm": 1.6858705282211304, + "learning_rate": 5e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7356584072113037, + "num_tokens": 405149630.0, + "step": 15662 + }, + { + "epoch": 1.7200746760377772, + "grad_norm": 2.0278470516204834, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7450008392333984, + "num_tokens": 405171330.0, + "step": 15663 + }, + { + "epoch": 1.720184493740391, + "grad_norm": 1.6761776208877563, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7320965528488159, + "num_tokens": 405200447.0, + "step": 15664 + }, + { + "epoch": 1.7202943114430047, + "grad_norm": 2.0339126586914062, + "learning_rate": 5e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7475996017456055, + "num_tokens": 405222885.0, + "step": 15665 + }, + { + "epoch": 1.7204041291456182, + "grad_norm": 1.777238368988037, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7290334701538086, + "num_tokens": 405249000.0, + "step": 15666 + }, + { + "epoch": 1.7205139468482318, + "grad_norm": 1.7756823301315308, + "learning_rate": 5e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7456799745559692, + "num_tokens": 405276042.0, + "step": 15667 + }, + { + "epoch": 1.7206237645508455, + "grad_norm": 1.7750372886657715, + "learning_rate": 5e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7386120557785034, + "num_tokens": 405300311.0, + "step": 15668 + }, + { + "epoch": 1.7207335822534593, + "grad_norm": 2.068035840988159, + "learning_rate": 5e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7442411780357361, + "num_tokens": 405320690.0, + "step": 15669 + }, + { + "epoch": 1.720843399956073, + "grad_norm": 1.8629785776138306, + "learning_rate": 5e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7442089319229126, + "num_tokens": 405344890.0, + "step": 15670 + }, + { + "epoch": 1.7209532176586866, + "grad_norm": 1.878868579864502, + "learning_rate": 5e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7380372285842896, + "num_tokens": 405370211.0, + "step": 15671 + }, + { + "epoch": 1.7210630353613001, + "grad_norm": 1.7728041410446167, + "learning_rate": 5e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7693665027618408, + "num_tokens": 405394029.0, + "step": 15672 + }, + { + "epoch": 1.7211728530639139, + "grad_norm": 1.7264546155929565, + "learning_rate": 5e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7512394189834595, + "num_tokens": 405421601.0, + "step": 15673 + }, + { + "epoch": 1.7212826707665276, + "grad_norm": 1.8864567279815674, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7348710298538208, + "num_tokens": 405445571.0, + "step": 15674 + }, + { + "epoch": 1.7213924884691414, + "grad_norm": 1.6755670309066772, + "learning_rate": 5e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7101444005966187, + "num_tokens": 405473928.0, + "step": 15675 + }, + { + "epoch": 1.721502306171755, + "grad_norm": 1.730810523033142, + "learning_rate": 5e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7197759747505188, + "num_tokens": 405500549.0, + "step": 15676 + }, + { + "epoch": 1.7216121238743685, + "grad_norm": 1.676793098449707, + "learning_rate": 5e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.746618390083313, + "num_tokens": 405527459.0, + "step": 15677 + }, + { + "epoch": 1.7217219415769822, + "grad_norm": 1.9473005533218384, + "learning_rate": 5e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.724498987197876, + "num_tokens": 405550833.0, + "step": 15678 + }, + { + "epoch": 1.721831759279596, + "grad_norm": 1.779934048652649, + "learning_rate": 5e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.740466833114624, + "num_tokens": 405576862.0, + "step": 15679 + }, + { + "epoch": 1.7219415769822095, + "grad_norm": 1.8573518991470337, + "learning_rate": 5e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7445633411407471, + "num_tokens": 405599979.0, + "step": 15680 + }, + { + "epoch": 1.722051394684823, + "grad_norm": 1.6923134326934814, + "learning_rate": 5e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7255276441574097, + "num_tokens": 405629665.0, + "step": 15681 + }, + { + "epoch": 1.7221612123874368, + "grad_norm": 1.7720669507980347, + "learning_rate": 5e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7148054838180542, + "num_tokens": 405657114.0, + "step": 15682 + }, + { + "epoch": 1.7222710300900506, + "grad_norm": 1.9103753566741943, + "learning_rate": 5e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7165244817733765, + "num_tokens": 405679619.0, + "step": 15683 + }, + { + "epoch": 1.7223808477926643, + "grad_norm": 1.5202250480651855, + "learning_rate": 5e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7540173530578613, + "num_tokens": 405714846.0, + "step": 15684 + }, + { + "epoch": 1.7224906654952779, + "grad_norm": 1.7857601642608643, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7172250747680664, + "num_tokens": 405742216.0, + "step": 15685 + }, + { + "epoch": 1.7226004831978914, + "grad_norm": 1.8419896364212036, + "learning_rate": 5e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7186033725738525, + "num_tokens": 405769493.0, + "step": 15686 + }, + { + "epoch": 1.7227103009005051, + "grad_norm": 1.8093552589416504, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7517126202583313, + "num_tokens": 405792787.0, + "step": 15687 + }, + { + "epoch": 1.722820118603119, + "grad_norm": 1.6283931732177734, + "learning_rate": 5e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7582814693450928, + "num_tokens": 405820030.0, + "step": 15688 + }, + { + "epoch": 1.7229299363057324, + "grad_norm": 1.6699632406234741, + "learning_rate": 5e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7110142707824707, + "num_tokens": 405851239.0, + "step": 15689 + }, + { + "epoch": 1.7230397540083462, + "grad_norm": 1.7504884004592896, + "learning_rate": 5e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.6970947980880737, + "num_tokens": 405879792.0, + "step": 15690 + }, + { + "epoch": 1.7231495717109597, + "grad_norm": 1.6838412284851074, + "learning_rate": 5e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7419642210006714, + "num_tokens": 405908111.0, + "step": 15691 + }, + { + "epoch": 1.7232593894135735, + "grad_norm": 1.7523953914642334, + "learning_rate": 5e-06, + "loss": 0.825, + "mean_token_accuracy": 0.736924409866333, + "num_tokens": 405933561.0, + "step": 15692 + }, + { + "epoch": 1.7233692071161872, + "grad_norm": 1.8253443241119385, + "learning_rate": 5e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7109619379043579, + "num_tokens": 405959328.0, + "step": 15693 + }, + { + "epoch": 1.7234790248188008, + "grad_norm": 1.9462485313415527, + "learning_rate": 5e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7405514121055603, + "num_tokens": 405980712.0, + "step": 15694 + }, + { + "epoch": 1.7235888425214143, + "grad_norm": 2.065786361694336, + "learning_rate": 5e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7310173511505127, + "num_tokens": 406002555.0, + "step": 15695 + }, + { + "epoch": 1.723698660224028, + "grad_norm": 1.677439570426941, + "learning_rate": 5e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7124899625778198, + "num_tokens": 406036440.0, + "step": 15696 + }, + { + "epoch": 1.7238084779266418, + "grad_norm": 1.670261025428772, + "learning_rate": 5e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7437383532524109, + "num_tokens": 406065725.0, + "step": 15697 + }, + { + "epoch": 1.7239182956292556, + "grad_norm": 1.6017961502075195, + "learning_rate": 5e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7263562679290771, + "num_tokens": 406100743.0, + "step": 15698 + }, + { + "epoch": 1.7240281133318691, + "grad_norm": 1.5810832977294922, + "learning_rate": 5e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7393578290939331, + "num_tokens": 406132407.0, + "step": 15699 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 1.6277796030044556, + "learning_rate": 5e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7170326709747314, + "num_tokens": 406164080.0, + "step": 15700 + }, + { + "epoch": 1.7242477487370964, + "grad_norm": 1.725356936454773, + "learning_rate": 5e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7240831851959229, + "num_tokens": 406191002.0, + "step": 15701 + }, + { + "epoch": 1.7243575664397102, + "grad_norm": 1.8428775072097778, + "learning_rate": 5e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.721474289894104, + "num_tokens": 406218324.0, + "step": 15702 + }, + { + "epoch": 1.7244673841423237, + "grad_norm": 1.5480149984359741, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7287896275520325, + "num_tokens": 406252312.0, + "step": 15703 + }, + { + "epoch": 1.7245772018449375, + "grad_norm": 1.6446654796600342, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7223018407821655, + "num_tokens": 406284142.0, + "step": 15704 + }, + { + "epoch": 1.724687019547551, + "grad_norm": 1.8069180250167847, + "learning_rate": 5e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7392411231994629, + "num_tokens": 406308383.0, + "step": 15705 + }, + { + "epoch": 1.7247968372501647, + "grad_norm": 1.8653699159622192, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7327471375465393, + "num_tokens": 406334550.0, + "step": 15706 + }, + { + "epoch": 1.7249066549527785, + "grad_norm": 1.924182415008545, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7307374477386475, + "num_tokens": 406358278.0, + "step": 15707 + }, + { + "epoch": 1.725016472655392, + "grad_norm": 1.8566781282424927, + "learning_rate": 5e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7203906774520874, + "num_tokens": 406387408.0, + "step": 15708 + }, + { + "epoch": 1.7251262903580056, + "grad_norm": 1.863527774810791, + "learning_rate": 5e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7165662050247192, + "num_tokens": 406413206.0, + "step": 15709 + }, + { + "epoch": 1.7252361080606193, + "grad_norm": 2.0845947265625, + "learning_rate": 5e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7366689443588257, + "num_tokens": 406432301.0, + "step": 15710 + }, + { + "epoch": 1.725345925763233, + "grad_norm": 1.7780075073242188, + "learning_rate": 5e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7330384254455566, + "num_tokens": 406459157.0, + "step": 15711 + }, + { + "epoch": 1.7254557434658468, + "grad_norm": 1.6981885433197021, + "learning_rate": 5e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.726686418056488, + "num_tokens": 406487636.0, + "step": 15712 + }, + { + "epoch": 1.7255655611684604, + "grad_norm": 1.789159893989563, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7306580543518066, + "num_tokens": 406513675.0, + "step": 15713 + }, + { + "epoch": 1.725675378871074, + "grad_norm": 1.669632911682129, + "learning_rate": 5e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7521653175354004, + "num_tokens": 406541390.0, + "step": 15714 + }, + { + "epoch": 1.7257851965736877, + "grad_norm": 1.8125628232955933, + "learning_rate": 5e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7099231481552124, + "num_tokens": 406567096.0, + "step": 15715 + }, + { + "epoch": 1.7258950142763014, + "grad_norm": 1.8356446027755737, + "learning_rate": 5e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7246730327606201, + "num_tokens": 406593736.0, + "step": 15716 + }, + { + "epoch": 1.726004831978915, + "grad_norm": 1.8696407079696655, + "learning_rate": 5e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7552025318145752, + "num_tokens": 406616866.0, + "step": 15717 + }, + { + "epoch": 1.7261146496815285, + "grad_norm": 1.861207127571106, + "learning_rate": 5e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7128109931945801, + "num_tokens": 406642959.0, + "step": 15718 + }, + { + "epoch": 1.7262244673841423, + "grad_norm": 1.8067249059677124, + "learning_rate": 5e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.7341890335083008, + "num_tokens": 406667138.0, + "step": 15719 + }, + { + "epoch": 1.726334285086756, + "grad_norm": 1.7264032363891602, + "learning_rate": 5e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7411913871765137, + "num_tokens": 406693452.0, + "step": 15720 + }, + { + "epoch": 1.7264441027893698, + "grad_norm": 1.8877511024475098, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.723904550075531, + "num_tokens": 406719089.0, + "step": 15721 + }, + { + "epoch": 1.7265539204919833, + "grad_norm": 1.990526795387268, + "learning_rate": 5e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7407981157302856, + "num_tokens": 406739848.0, + "step": 15722 + }, + { + "epoch": 1.7266637381945968, + "grad_norm": 1.874761700630188, + "learning_rate": 5e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7074379920959473, + "num_tokens": 406765050.0, + "step": 15723 + }, + { + "epoch": 1.7267735558972106, + "grad_norm": 1.965293049812317, + "learning_rate": 5e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.722834050655365, + "num_tokens": 406787588.0, + "step": 15724 + }, + { + "epoch": 1.7268833735998244, + "grad_norm": 1.8910914659500122, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7452384829521179, + "num_tokens": 406810909.0, + "step": 15725 + }, + { + "epoch": 1.7269931913024381, + "grad_norm": 1.7490066289901733, + "learning_rate": 5e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7373461127281189, + "num_tokens": 406835057.0, + "step": 15726 + }, + { + "epoch": 1.7271030090050516, + "grad_norm": 1.7499572038650513, + "learning_rate": 5e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7133093476295471, + "num_tokens": 406862655.0, + "step": 15727 + }, + { + "epoch": 1.7272128267076652, + "grad_norm": 1.936600923538208, + "learning_rate": 5e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.733080267906189, + "num_tokens": 406887206.0, + "step": 15728 + }, + { + "epoch": 1.727322644410279, + "grad_norm": 1.694891095161438, + "learning_rate": 5e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7355706691741943, + "num_tokens": 406913768.0, + "step": 15729 + }, + { + "epoch": 1.7274324621128927, + "grad_norm": 1.766461730003357, + "learning_rate": 5e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7427612543106079, + "num_tokens": 406940012.0, + "step": 15730 + }, + { + "epoch": 1.7275422798155062, + "grad_norm": 1.803969383239746, + "learning_rate": 5e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.753471851348877, + "num_tokens": 406963230.0, + "step": 15731 + }, + { + "epoch": 1.7276520975181198, + "grad_norm": 1.8337996006011963, + "learning_rate": 5e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7419881820678711, + "num_tokens": 406989610.0, + "step": 15732 + }, + { + "epoch": 1.7277619152207335, + "grad_norm": 1.8197238445281982, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7282256484031677, + "num_tokens": 407015554.0, + "step": 15733 + }, + { + "epoch": 1.7278717329233473, + "grad_norm": 1.6444401741027832, + "learning_rate": 5e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7303919792175293, + "num_tokens": 407045861.0, + "step": 15734 + }, + { + "epoch": 1.727981550625961, + "grad_norm": 1.749441385269165, + "learning_rate": 5e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7225451469421387, + "num_tokens": 407074088.0, + "step": 15735 + }, + { + "epoch": 1.7280913683285746, + "grad_norm": 1.7926867008209229, + "learning_rate": 5e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7143237590789795, + "num_tokens": 407099385.0, + "step": 15736 + }, + { + "epoch": 1.728201186031188, + "grad_norm": 1.6968634128570557, + "learning_rate": 5e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7513585090637207, + "num_tokens": 407128965.0, + "step": 15737 + }, + { + "epoch": 1.7283110037338019, + "grad_norm": 2.072608232498169, + "learning_rate": 5e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7358103394508362, + "num_tokens": 407149479.0, + "step": 15738 + }, + { + "epoch": 1.7284208214364156, + "grad_norm": 1.6730594635009766, + "learning_rate": 5e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7016066312789917, + "num_tokens": 407181255.0, + "step": 15739 + }, + { + "epoch": 1.7285306391390292, + "grad_norm": 1.8858948945999146, + "learning_rate": 5e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7249306440353394, + "num_tokens": 407206333.0, + "step": 15740 + }, + { + "epoch": 1.728640456841643, + "grad_norm": 1.9488496780395508, + "learning_rate": 5e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7494402527809143, + "num_tokens": 407229339.0, + "step": 15741 + }, + { + "epoch": 1.7287502745442564, + "grad_norm": 1.885881781578064, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7191777229309082, + "num_tokens": 407256100.0, + "step": 15742 + }, + { + "epoch": 1.7288600922468702, + "grad_norm": 1.8290467262268066, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7332520484924316, + "num_tokens": 407280835.0, + "step": 15743 + }, + { + "epoch": 1.728969909949484, + "grad_norm": 1.666839838027954, + "learning_rate": 5e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7282938361167908, + "num_tokens": 407311167.0, + "step": 15744 + }, + { + "epoch": 1.7290797276520975, + "grad_norm": 1.8100401163101196, + "learning_rate": 5e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.6980725526809692, + "num_tokens": 407340740.0, + "step": 15745 + }, + { + "epoch": 1.729189545354711, + "grad_norm": 1.649552583694458, + "learning_rate": 5e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7242068648338318, + "num_tokens": 407373632.0, + "step": 15746 + }, + { + "epoch": 1.7292993630573248, + "grad_norm": 1.7296106815338135, + "learning_rate": 5e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7190567851066589, + "num_tokens": 407401251.0, + "step": 15747 + }, + { + "epoch": 1.7294091807599385, + "grad_norm": 1.7573630809783936, + "learning_rate": 5e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7387971878051758, + "num_tokens": 407429379.0, + "step": 15748 + }, + { + "epoch": 1.7295189984625523, + "grad_norm": 1.8886677026748657, + "learning_rate": 5e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.738917350769043, + "num_tokens": 407454977.0, + "step": 15749 + }, + { + "epoch": 1.7296288161651658, + "grad_norm": 1.7376198768615723, + "learning_rate": 5e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7132771015167236, + "num_tokens": 407481221.0, + "step": 15750 + }, + { + "epoch": 1.7297386338677794, + "grad_norm": 1.7221680879592896, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7267980575561523, + "num_tokens": 407511089.0, + "step": 15751 + }, + { + "epoch": 1.7298484515703931, + "grad_norm": 1.7650792598724365, + "learning_rate": 5e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7299192547798157, + "num_tokens": 407538582.0, + "step": 15752 + }, + { + "epoch": 1.7299582692730069, + "grad_norm": 1.846369743347168, + "learning_rate": 5e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7338175773620605, + "num_tokens": 407562511.0, + "step": 15753 + }, + { + "epoch": 1.7300680869756204, + "grad_norm": 1.9833546876907349, + "learning_rate": 5e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7690219879150391, + "num_tokens": 407583333.0, + "step": 15754 + }, + { + "epoch": 1.7301779046782342, + "grad_norm": 1.7583799362182617, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7323094606399536, + "num_tokens": 407610061.0, + "step": 15755 + }, + { + "epoch": 1.7302877223808477, + "grad_norm": 1.877051830291748, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7565213441848755, + "num_tokens": 407634228.0, + "step": 15756 + }, + { + "epoch": 1.7303975400834615, + "grad_norm": 1.8254927396774292, + "learning_rate": 5e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7162489295005798, + "num_tokens": 407659114.0, + "step": 15757 + }, + { + "epoch": 1.7305073577860752, + "grad_norm": 1.9736770391464233, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7314757704734802, + "num_tokens": 407683097.0, + "step": 15758 + }, + { + "epoch": 1.7306171754886888, + "grad_norm": 1.933053731918335, + "learning_rate": 5e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.750950813293457, + "num_tokens": 407705646.0, + "step": 15759 + }, + { + "epoch": 1.7307269931913023, + "grad_norm": 2.043896436691284, + "learning_rate": 5e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7320249080657959, + "num_tokens": 407726455.0, + "step": 15760 + }, + { + "epoch": 1.730836810893916, + "grad_norm": 1.9206852912902832, + "learning_rate": 5e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7255991101264954, + "num_tokens": 407749898.0, + "step": 15761 + }, + { + "epoch": 1.7309466285965298, + "grad_norm": 2.128910779953003, + "learning_rate": 5e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7221503257751465, + "num_tokens": 407772374.0, + "step": 15762 + }, + { + "epoch": 1.7310564462991436, + "grad_norm": 1.5915327072143555, + "learning_rate": 5e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7129657864570618, + "num_tokens": 407807149.0, + "step": 15763 + }, + { + "epoch": 1.731166264001757, + "grad_norm": 1.7122567892074585, + "learning_rate": 5e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7230602502822876, + "num_tokens": 407838370.0, + "step": 15764 + }, + { + "epoch": 1.7312760817043706, + "grad_norm": 1.7464176416397095, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7427395582199097, + "num_tokens": 407864021.0, + "step": 15765 + }, + { + "epoch": 1.7313858994069844, + "grad_norm": 1.8930903673171997, + "learning_rate": 5e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7502736449241638, + "num_tokens": 407888033.0, + "step": 15766 + }, + { + "epoch": 1.7314957171095982, + "grad_norm": 1.8174761533737183, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7354285717010498, + "num_tokens": 407911650.0, + "step": 15767 + }, + { + "epoch": 1.7316055348122117, + "grad_norm": 1.8004266023635864, + "learning_rate": 5e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7306820154190063, + "num_tokens": 407940128.0, + "step": 15768 + }, + { + "epoch": 1.7317153525148252, + "grad_norm": 2.0432446002960205, + "learning_rate": 5e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7362508177757263, + "num_tokens": 407960829.0, + "step": 15769 + }, + { + "epoch": 1.731825170217439, + "grad_norm": 1.6919490098953247, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7158671617507935, + "num_tokens": 407989734.0, + "step": 15770 + }, + { + "epoch": 1.7319349879200527, + "grad_norm": 1.7498377561569214, + "learning_rate": 5e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7175620794296265, + "num_tokens": 408017585.0, + "step": 15771 + }, + { + "epoch": 1.7320448056226665, + "grad_norm": 1.9289612770080566, + "learning_rate": 5e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7139050960540771, + "num_tokens": 408040848.0, + "step": 15772 + }, + { + "epoch": 1.73215462332528, + "grad_norm": 1.8532228469848633, + "learning_rate": 5e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7109819650650024, + "num_tokens": 408067418.0, + "step": 15773 + }, + { + "epoch": 1.7322644410278936, + "grad_norm": 1.9444373846054077, + "learning_rate": 5e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.723796010017395, + "num_tokens": 408089901.0, + "step": 15774 + }, + { + "epoch": 1.7323742587305073, + "grad_norm": 1.8887392282485962, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7630179524421692, + "num_tokens": 408112691.0, + "step": 15775 + }, + { + "epoch": 1.732484076433121, + "grad_norm": 1.9004724025726318, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7730040550231934, + "num_tokens": 408135456.0, + "step": 15776 + }, + { + "epoch": 1.7325938941357348, + "grad_norm": 1.6630018949508667, + "learning_rate": 5e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7402610778808594, + "num_tokens": 408163318.0, + "step": 15777 + }, + { + "epoch": 1.7327037118383484, + "grad_norm": 1.8421868085861206, + "learning_rate": 5e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7347110509872437, + "num_tokens": 408188960.0, + "step": 15778 + }, + { + "epoch": 1.732813529540962, + "grad_norm": 2.1878981590270996, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.745354175567627, + "num_tokens": 408207791.0, + "step": 15779 + }, + { + "epoch": 1.7329233472435757, + "grad_norm": 1.7727336883544922, + "learning_rate": 5e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7349777817726135, + "num_tokens": 408233840.0, + "step": 15780 + }, + { + "epoch": 1.7330331649461894, + "grad_norm": 1.9555766582489014, + "learning_rate": 5e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7145776152610779, + "num_tokens": 408258304.0, + "step": 15781 + }, + { + "epoch": 1.733142982648803, + "grad_norm": 1.8767017126083374, + "learning_rate": 5e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.737360954284668, + "num_tokens": 408281348.0, + "step": 15782 + }, + { + "epoch": 1.7332528003514165, + "grad_norm": 2.034456491470337, + "learning_rate": 5e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7348085045814514, + "num_tokens": 408302417.0, + "step": 15783 + }, + { + "epoch": 1.7333626180540302, + "grad_norm": 1.8198487758636475, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7366383075714111, + "num_tokens": 408328212.0, + "step": 15784 + }, + { + "epoch": 1.733472435756644, + "grad_norm": 1.6161139011383057, + "learning_rate": 5e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7426602840423584, + "num_tokens": 408357361.0, + "step": 15785 + }, + { + "epoch": 1.7335822534592578, + "grad_norm": 1.951500415802002, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7228124141693115, + "num_tokens": 408381043.0, + "step": 15786 + }, + { + "epoch": 1.7336920711618713, + "grad_norm": 1.7989262342453003, + "learning_rate": 5e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7210668325424194, + "num_tokens": 408408805.0, + "step": 15787 + }, + { + "epoch": 1.7338018888644848, + "grad_norm": 2.0585010051727295, + "learning_rate": 5e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7455613613128662, + "num_tokens": 408429011.0, + "step": 15788 + }, + { + "epoch": 1.7339117065670986, + "grad_norm": 1.8609771728515625, + "learning_rate": 5e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.747079610824585, + "num_tokens": 408452612.0, + "step": 15789 + }, + { + "epoch": 1.7340215242697123, + "grad_norm": 1.9973640441894531, + "learning_rate": 5e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7121593952178955, + "num_tokens": 408479909.0, + "step": 15790 + }, + { + "epoch": 1.734131341972326, + "grad_norm": 1.8843995332717896, + "learning_rate": 5e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7436084151268005, + "num_tokens": 408502042.0, + "step": 15791 + }, + { + "epoch": 1.7342411596749396, + "grad_norm": 1.7829680442810059, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7403960227966309, + "num_tokens": 408525834.0, + "step": 15792 + }, + { + "epoch": 1.7343509773775532, + "grad_norm": 1.8101192712783813, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.763530433177948, + "num_tokens": 408548824.0, + "step": 15793 + }, + { + "epoch": 1.734460795080167, + "grad_norm": 1.91633939743042, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7417608499526978, + "num_tokens": 408570013.0, + "step": 15794 + }, + { + "epoch": 1.7345706127827807, + "grad_norm": 1.9595540761947632, + "learning_rate": 5e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7347880005836487, + "num_tokens": 408593412.0, + "step": 15795 + }, + { + "epoch": 1.7346804304853942, + "grad_norm": 2.322880983352661, + "learning_rate": 5e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7502354383468628, + "num_tokens": 408609806.0, + "step": 15796 + }, + { + "epoch": 1.7347902481880078, + "grad_norm": 1.9782077074050903, + "learning_rate": 5e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7400202751159668, + "num_tokens": 408630102.0, + "step": 15797 + }, + { + "epoch": 1.7349000658906215, + "grad_norm": 1.9974486827850342, + "learning_rate": 5e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7769068479537964, + "num_tokens": 408650990.0, + "step": 15798 + }, + { + "epoch": 1.7350098835932353, + "grad_norm": 1.7002689838409424, + "learning_rate": 5e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7437414526939392, + "num_tokens": 408680329.0, + "step": 15799 + }, + { + "epoch": 1.735119701295849, + "grad_norm": 1.844464659690857, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7246191501617432, + "num_tokens": 408706789.0, + "step": 15800 + }, + { + "epoch": 1.7352295189984626, + "grad_norm": 1.9599775075912476, + "learning_rate": 5e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7328130006790161, + "num_tokens": 408729492.0, + "step": 15801 + }, + { + "epoch": 1.735339336701076, + "grad_norm": 1.9125909805297852, + "learning_rate": 5e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7420018911361694, + "num_tokens": 408754722.0, + "step": 15802 + }, + { + "epoch": 1.7354491544036899, + "grad_norm": 1.801345705986023, + "learning_rate": 5e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7273058891296387, + "num_tokens": 408782183.0, + "step": 15803 + }, + { + "epoch": 1.7355589721063036, + "grad_norm": 1.609631896018982, + "learning_rate": 5e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7162763476371765, + "num_tokens": 408812214.0, + "step": 15804 + }, + { + "epoch": 1.7356687898089171, + "grad_norm": 1.9418911933898926, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7550582885742188, + "num_tokens": 408831574.0, + "step": 15805 + }, + { + "epoch": 1.735778607511531, + "grad_norm": 1.7228364944458008, + "learning_rate": 5e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7451733350753784, + "num_tokens": 408858242.0, + "step": 15806 + }, + { + "epoch": 1.7358884252141444, + "grad_norm": 1.660963535308838, + "learning_rate": 5e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7104266881942749, + "num_tokens": 408888552.0, + "step": 15807 + }, + { + "epoch": 1.7359982429167582, + "grad_norm": 1.7742420434951782, + "learning_rate": 5e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7161291241645813, + "num_tokens": 408915466.0, + "step": 15808 + }, + { + "epoch": 1.736108060619372, + "grad_norm": 1.6563775539398193, + "learning_rate": 5e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7237202525138855, + "num_tokens": 408944300.0, + "step": 15809 + }, + { + "epoch": 1.7362178783219855, + "grad_norm": 1.6522159576416016, + "learning_rate": 5e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7293325662612915, + "num_tokens": 408973636.0, + "step": 15810 + }, + { + "epoch": 1.736327696024599, + "grad_norm": 1.8741600513458252, + "learning_rate": 5e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7137331962585449, + "num_tokens": 408999529.0, + "step": 15811 + }, + { + "epoch": 1.7364375137272128, + "grad_norm": 1.5696672201156616, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7154201865196228, + "num_tokens": 409034836.0, + "step": 15812 + }, + { + "epoch": 1.7365473314298265, + "grad_norm": 1.753653883934021, + "learning_rate": 5e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7422080039978027, + "num_tokens": 409060779.0, + "step": 15813 + }, + { + "epoch": 1.7366571491324403, + "grad_norm": 1.698168158531189, + "learning_rate": 5e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7405188083648682, + "num_tokens": 409087513.0, + "step": 15814 + }, + { + "epoch": 1.7367669668350538, + "grad_norm": 1.8880438804626465, + "learning_rate": 5e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7392552495002747, + "num_tokens": 409109935.0, + "step": 15815 + }, + { + "epoch": 1.7368767845376674, + "grad_norm": 1.892667531967163, + "learning_rate": 5e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7598997354507446, + "num_tokens": 409132819.0, + "step": 15816 + }, + { + "epoch": 1.7369866022402811, + "grad_norm": 1.8457143306732178, + "learning_rate": 5e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7456648349761963, + "num_tokens": 409159604.0, + "step": 15817 + }, + { + "epoch": 1.7370964199428949, + "grad_norm": 2.038041114807129, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7357460260391235, + "num_tokens": 409182487.0, + "step": 15818 + }, + { + "epoch": 1.7372062376455084, + "grad_norm": 1.761228084564209, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.732176661491394, + "num_tokens": 409211099.0, + "step": 15819 + }, + { + "epoch": 1.7373160553481222, + "grad_norm": 1.9523545503616333, + "learning_rate": 5e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7268901467323303, + "num_tokens": 409240358.0, + "step": 15820 + }, + { + "epoch": 1.7374258730507357, + "grad_norm": 1.8128859996795654, + "learning_rate": 5e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.728801965713501, + "num_tokens": 409266389.0, + "step": 15821 + }, + { + "epoch": 1.7375356907533495, + "grad_norm": 1.7059965133666992, + "learning_rate": 5e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.743322491645813, + "num_tokens": 409294006.0, + "step": 15822 + }, + { + "epoch": 1.7376455084559632, + "grad_norm": 1.7746080160140991, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.758704662322998, + "num_tokens": 409317915.0, + "step": 15823 + }, + { + "epoch": 1.7377553261585768, + "grad_norm": 1.8707287311553955, + "learning_rate": 5e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.729677140712738, + "num_tokens": 409344548.0, + "step": 15824 + }, + { + "epoch": 1.7378651438611903, + "grad_norm": 1.818954586982727, + "learning_rate": 5e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7216975688934326, + "num_tokens": 409372228.0, + "step": 15825 + }, + { + "epoch": 1.737974961563804, + "grad_norm": 1.6173059940338135, + "learning_rate": 5e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7361905574798584, + "num_tokens": 409402276.0, + "step": 15826 + }, + { + "epoch": 1.7380847792664178, + "grad_norm": 1.8293721675872803, + "learning_rate": 5e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7238868474960327, + "num_tokens": 409426835.0, + "step": 15827 + }, + { + "epoch": 1.7381945969690316, + "grad_norm": 1.6210432052612305, + "learning_rate": 5e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7174892425537109, + "num_tokens": 409457645.0, + "step": 15828 + }, + { + "epoch": 1.738304414671645, + "grad_norm": 1.6130897998809814, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7341004610061646, + "num_tokens": 409488295.0, + "step": 15829 + }, + { + "epoch": 1.7384142323742586, + "grad_norm": 1.7951979637145996, + "learning_rate": 5e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7357054948806763, + "num_tokens": 409514004.0, + "step": 15830 + }, + { + "epoch": 1.7385240500768724, + "grad_norm": 1.7107901573181152, + "learning_rate": 5e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7395124435424805, + "num_tokens": 409543001.0, + "step": 15831 + }, + { + "epoch": 1.7386338677794861, + "grad_norm": 1.8080425262451172, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.744939923286438, + "num_tokens": 409569549.0, + "step": 15832 + }, + { + "epoch": 1.7387436854820997, + "grad_norm": 2.1938629150390625, + "learning_rate": 5e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7378180027008057, + "num_tokens": 409588264.0, + "step": 15833 + }, + { + "epoch": 1.7388535031847132, + "grad_norm": 1.914258599281311, + "learning_rate": 5e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7282527685165405, + "num_tokens": 409613206.0, + "step": 15834 + }, + { + "epoch": 1.738963320887327, + "grad_norm": 1.562153697013855, + "learning_rate": 5e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7075419425964355, + "num_tokens": 409649833.0, + "step": 15835 + }, + { + "epoch": 1.7390731385899407, + "grad_norm": 1.8846828937530518, + "learning_rate": 5e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7164509892463684, + "num_tokens": 409676723.0, + "step": 15836 + }, + { + "epoch": 1.7391829562925545, + "grad_norm": 1.7842156887054443, + "learning_rate": 5e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.732783317565918, + "num_tokens": 409703067.0, + "step": 15837 + }, + { + "epoch": 1.739292773995168, + "grad_norm": 1.9587364196777344, + "learning_rate": 5e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.716755747795105, + "num_tokens": 409727363.0, + "step": 15838 + }, + { + "epoch": 1.7394025916977816, + "grad_norm": 1.9204744100570679, + "learning_rate": 5e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.76453697681427, + "num_tokens": 409750416.0, + "step": 15839 + }, + { + "epoch": 1.7395124094003953, + "grad_norm": 1.688480019569397, + "learning_rate": 5e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7352997064590454, + "num_tokens": 409779674.0, + "step": 15840 + }, + { + "epoch": 1.739622227103009, + "grad_norm": 1.8389513492584229, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7375635504722595, + "num_tokens": 409805227.0, + "step": 15841 + }, + { + "epoch": 1.7397320448056228, + "grad_norm": 1.87787926197052, + "learning_rate": 5e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7363118529319763, + "num_tokens": 409831292.0, + "step": 15842 + }, + { + "epoch": 1.7398418625082364, + "grad_norm": 2.057342767715454, + "learning_rate": 5e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7269291877746582, + "num_tokens": 409853451.0, + "step": 15843 + }, + { + "epoch": 1.73995168021085, + "grad_norm": 1.8903281688690186, + "learning_rate": 5e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7301056385040283, + "num_tokens": 409877551.0, + "step": 15844 + }, + { + "epoch": 1.7400614979134637, + "grad_norm": 1.766120195388794, + "learning_rate": 5e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7222115993499756, + "num_tokens": 409906365.0, + "step": 15845 + }, + { + "epoch": 1.7401713156160774, + "grad_norm": 1.6643890142440796, + "learning_rate": 5e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.734696626663208, + "num_tokens": 409935179.0, + "step": 15846 + }, + { + "epoch": 1.740281133318691, + "grad_norm": 2.154874563217163, + "learning_rate": 5e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7499604821205139, + "num_tokens": 409953672.0, + "step": 15847 + }, + { + "epoch": 1.7403909510213045, + "grad_norm": 2.0439300537109375, + "learning_rate": 5e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7340145707130432, + "num_tokens": 409975864.0, + "step": 15848 + }, + { + "epoch": 1.7405007687239182, + "grad_norm": 1.6783393621444702, + "learning_rate": 5e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.7491350173950195, + "num_tokens": 410004554.0, + "step": 15849 + }, + { + "epoch": 1.740610586426532, + "grad_norm": 2.0846431255340576, + "learning_rate": 5e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7203953266143799, + "num_tokens": 410026715.0, + "step": 15850 + }, + { + "epoch": 1.7407204041291457, + "grad_norm": 1.8539766073226929, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7312594652175903, + "num_tokens": 410050644.0, + "step": 15851 + }, + { + "epoch": 1.7408302218317593, + "grad_norm": 1.76973557472229, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7391979694366455, + "num_tokens": 410077356.0, + "step": 15852 + }, + { + "epoch": 1.7409400395343728, + "grad_norm": 1.9575587511062622, + "learning_rate": 5e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.739311933517456, + "num_tokens": 410099337.0, + "step": 15853 + }, + { + "epoch": 1.7410498572369866, + "grad_norm": 1.7768152952194214, + "learning_rate": 5e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7267552614212036, + "num_tokens": 410126334.0, + "step": 15854 + }, + { + "epoch": 1.7411596749396003, + "grad_norm": 1.8217054605484009, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7437913417816162, + "num_tokens": 410150056.0, + "step": 15855 + }, + { + "epoch": 1.741269492642214, + "grad_norm": 1.615382194519043, + "learning_rate": 5e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7345269322395325, + "num_tokens": 410178339.0, + "step": 15856 + }, + { + "epoch": 1.7413793103448276, + "grad_norm": 1.6870425939559937, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7258208990097046, + "num_tokens": 410207774.0, + "step": 15857 + }, + { + "epoch": 1.7414891280474412, + "grad_norm": 1.9064488410949707, + "learning_rate": 5e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7143399715423584, + "num_tokens": 410231827.0, + "step": 15858 + }, + { + "epoch": 1.741598945750055, + "grad_norm": 1.8135826587677002, + "learning_rate": 5e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.711226761341095, + "num_tokens": 410258851.0, + "step": 15859 + }, + { + "epoch": 1.7417087634526687, + "grad_norm": 2.2255280017852783, + "learning_rate": 5e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7746905088424683, + "num_tokens": 410276333.0, + "step": 15860 + }, + { + "epoch": 1.7418185811552822, + "grad_norm": 1.6778864860534668, + "learning_rate": 5e-06, + "loss": 0.963, + "mean_token_accuracy": 0.6986322402954102, + "num_tokens": 410307419.0, + "step": 15861 + }, + { + "epoch": 1.7419283988578957, + "grad_norm": 2.0026893615722656, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7501733303070068, + "num_tokens": 410328127.0, + "step": 15862 + }, + { + "epoch": 1.7420382165605095, + "grad_norm": 2.0205891132354736, + "learning_rate": 5e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.725260853767395, + "num_tokens": 410351256.0, + "step": 15863 + }, + { + "epoch": 1.7421480342631233, + "grad_norm": 1.6240456104278564, + "learning_rate": 5e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7278327345848083, + "num_tokens": 410382616.0, + "step": 15864 + }, + { + "epoch": 1.742257851965737, + "grad_norm": 1.6515220403671265, + "learning_rate": 5e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7187291383743286, + "num_tokens": 410415550.0, + "step": 15865 + }, + { + "epoch": 1.7423676696683505, + "grad_norm": 1.9227951765060425, + "learning_rate": 5e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7434031963348389, + "num_tokens": 410438372.0, + "step": 15866 + }, + { + "epoch": 1.742477487370964, + "grad_norm": 1.8331356048583984, + "learning_rate": 5e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7360717058181763, + "num_tokens": 410464503.0, + "step": 15867 + }, + { + "epoch": 1.7425873050735778, + "grad_norm": 1.8047895431518555, + "learning_rate": 5e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7331615686416626, + "num_tokens": 410490660.0, + "step": 15868 + }, + { + "epoch": 1.7426971227761916, + "grad_norm": 1.926882028579712, + "learning_rate": 5e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7317605018615723, + "num_tokens": 410512164.0, + "step": 15869 + }, + { + "epoch": 1.7428069404788051, + "grad_norm": 1.8611774444580078, + "learning_rate": 5e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7136333584785461, + "num_tokens": 410538865.0, + "step": 15870 + }, + { + "epoch": 1.742916758181419, + "grad_norm": 1.760178804397583, + "learning_rate": 5e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7509934902191162, + "num_tokens": 410566222.0, + "step": 15871 + }, + { + "epoch": 1.7430265758840324, + "grad_norm": 1.7566150426864624, + "learning_rate": 5e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7196559906005859, + "num_tokens": 410593462.0, + "step": 15872 + }, + { + "epoch": 1.7431363935866462, + "grad_norm": 1.8473360538482666, + "learning_rate": 5e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.7446973323822021, + "num_tokens": 410618192.0, + "step": 15873 + }, + { + "epoch": 1.74324621128926, + "grad_norm": 1.8816553354263306, + "learning_rate": 5e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7295913696289062, + "num_tokens": 410641457.0, + "step": 15874 + }, + { + "epoch": 1.7433560289918735, + "grad_norm": 1.7581757307052612, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.736002504825592, + "num_tokens": 410670911.0, + "step": 15875 + }, + { + "epoch": 1.743465846694487, + "grad_norm": 1.5737005472183228, + "learning_rate": 5e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.733369767665863, + "num_tokens": 410704598.0, + "step": 15876 + }, + { + "epoch": 1.7435756643971008, + "grad_norm": 2.110470771789551, + "learning_rate": 5e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7136172652244568, + "num_tokens": 410727728.0, + "step": 15877 + }, + { + "epoch": 1.7436854820997145, + "grad_norm": 1.9021718502044678, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7389854192733765, + "num_tokens": 410753449.0, + "step": 15878 + }, + { + "epoch": 1.7437952998023283, + "grad_norm": 1.8206640481948853, + "learning_rate": 5e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7476266026496887, + "num_tokens": 410776691.0, + "step": 15879 + }, + { + "epoch": 1.7439051175049418, + "grad_norm": 2.036376476287842, + "learning_rate": 5e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.746073305606842, + "num_tokens": 410798683.0, + "step": 15880 + }, + { + "epoch": 1.7440149352075554, + "grad_norm": 1.7231709957122803, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7335156798362732, + "num_tokens": 410826579.0, + "step": 15881 + }, + { + "epoch": 1.744124752910169, + "grad_norm": 1.7626484632492065, + "learning_rate": 5e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7498293519020081, + "num_tokens": 410850706.0, + "step": 15882 + }, + { + "epoch": 1.7442345706127829, + "grad_norm": 2.031285524368286, + "learning_rate": 5e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7451800107955933, + "num_tokens": 410870396.0, + "step": 15883 + }, + { + "epoch": 1.7443443883153964, + "grad_norm": 1.743275761604309, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7644146680831909, + "num_tokens": 410896072.0, + "step": 15884 + }, + { + "epoch": 1.7444542060180102, + "grad_norm": 1.6227915287017822, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7389929890632629, + "num_tokens": 410925776.0, + "step": 15885 + }, + { + "epoch": 1.7445640237206237, + "grad_norm": 1.6942156553268433, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7696205973625183, + "num_tokens": 410950820.0, + "step": 15886 + }, + { + "epoch": 1.7446738414232374, + "grad_norm": 1.76121985912323, + "learning_rate": 5e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7238149642944336, + "num_tokens": 410980959.0, + "step": 15887 + }, + { + "epoch": 1.7447836591258512, + "grad_norm": 1.9537707567214966, + "learning_rate": 5e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7508169412612915, + "num_tokens": 411000691.0, + "step": 15888 + }, + { + "epoch": 1.7448934768284647, + "grad_norm": 1.738227128982544, + "learning_rate": 5e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7160151600837708, + "num_tokens": 411029118.0, + "step": 15889 + }, + { + "epoch": 1.7450032945310783, + "grad_norm": 1.9223930835723877, + "learning_rate": 5e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7279775142669678, + "num_tokens": 411053835.0, + "step": 15890 + }, + { + "epoch": 1.745113112233692, + "grad_norm": 1.687894582748413, + "learning_rate": 5e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7366793155670166, + "num_tokens": 411083317.0, + "step": 15891 + }, + { + "epoch": 1.7452229299363058, + "grad_norm": 1.716281771659851, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7328859567642212, + "num_tokens": 411110787.0, + "step": 15892 + }, + { + "epoch": 1.7453327476389195, + "grad_norm": 1.8583229780197144, + "learning_rate": 5e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7237354516983032, + "num_tokens": 411138429.0, + "step": 15893 + }, + { + "epoch": 1.745442565341533, + "grad_norm": 1.7971571683883667, + "learning_rate": 5e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7445207834243774, + "num_tokens": 411163866.0, + "step": 15894 + }, + { + "epoch": 1.7455523830441466, + "grad_norm": 1.7530046701431274, + "learning_rate": 5e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7245041131973267, + "num_tokens": 411191562.0, + "step": 15895 + }, + { + "epoch": 1.7456622007467604, + "grad_norm": 1.9018800258636475, + "learning_rate": 5e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7360936403274536, + "num_tokens": 411214229.0, + "step": 15896 + }, + { + "epoch": 1.7457720184493741, + "grad_norm": 1.8111340999603271, + "learning_rate": 5e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7142454981803894, + "num_tokens": 411239689.0, + "step": 15897 + }, + { + "epoch": 1.7458818361519877, + "grad_norm": 1.6920855045318604, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7443820238113403, + "num_tokens": 411269375.0, + "step": 15898 + }, + { + "epoch": 1.7459916538546012, + "grad_norm": 1.7938112020492554, + "learning_rate": 5e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7330498695373535, + "num_tokens": 411294492.0, + "step": 15899 + }, + { + "epoch": 1.746101471557215, + "grad_norm": 1.8336360454559326, + "learning_rate": 5e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7068382501602173, + "num_tokens": 411323372.0, + "step": 15900 + }, + { + "epoch": 1.7462112892598287, + "grad_norm": 1.7047251462936401, + "learning_rate": 5e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7143714427947998, + "num_tokens": 411351748.0, + "step": 15901 + }, + { + "epoch": 1.7463211069624425, + "grad_norm": 1.6621077060699463, + "learning_rate": 5e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7226505279541016, + "num_tokens": 411379186.0, + "step": 15902 + }, + { + "epoch": 1.746430924665056, + "grad_norm": 1.9026011228561401, + "learning_rate": 5e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.750951886177063, + "num_tokens": 411403800.0, + "step": 15903 + }, + { + "epoch": 1.7465407423676695, + "grad_norm": 1.6756489276885986, + "learning_rate": 5e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7160373330116272, + "num_tokens": 411434102.0, + "step": 15904 + }, + { + "epoch": 1.7466505600702833, + "grad_norm": 1.7249823808670044, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7322777509689331, + "num_tokens": 411463731.0, + "step": 15905 + }, + { + "epoch": 1.746760377772897, + "grad_norm": 1.6551101207733154, + "learning_rate": 5e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7242485880851746, + "num_tokens": 411495036.0, + "step": 15906 + }, + { + "epoch": 1.7468701954755108, + "grad_norm": 1.842500925064087, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7370206117630005, + "num_tokens": 411519402.0, + "step": 15907 + }, + { + "epoch": 1.7469800131781243, + "grad_norm": 1.8438785076141357, + "learning_rate": 5e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7347376942634583, + "num_tokens": 411545740.0, + "step": 15908 + }, + { + "epoch": 1.7470898308807379, + "grad_norm": 1.8431966304779053, + "learning_rate": 5e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7377623915672302, + "num_tokens": 411573431.0, + "step": 15909 + }, + { + "epoch": 1.7471996485833516, + "grad_norm": 1.7031497955322266, + "learning_rate": 5e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7030137181282043, + "num_tokens": 411608199.0, + "step": 15910 + }, + { + "epoch": 1.7473094662859654, + "grad_norm": 1.758929967880249, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7236998677253723, + "num_tokens": 411636975.0, + "step": 15911 + }, + { + "epoch": 1.747419283988579, + "grad_norm": 1.7447665929794312, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7375827431678772, + "num_tokens": 411664135.0, + "step": 15912 + }, + { + "epoch": 1.7475291016911925, + "grad_norm": 1.9230592250823975, + "learning_rate": 5e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7392431497573853, + "num_tokens": 411687397.0, + "step": 15913 + }, + { + "epoch": 1.7476389193938062, + "grad_norm": 1.6910640001296997, + "learning_rate": 5e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7057843804359436, + "num_tokens": 411718180.0, + "step": 15914 + }, + { + "epoch": 1.74774873709642, + "grad_norm": 1.8839366436004639, + "learning_rate": 5e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7190485000610352, + "num_tokens": 411745229.0, + "step": 15915 + }, + { + "epoch": 1.7478585547990337, + "grad_norm": 1.7335702180862427, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.731116771697998, + "num_tokens": 411771339.0, + "step": 15916 + }, + { + "epoch": 1.7479683725016473, + "grad_norm": 1.6484543085098267, + "learning_rate": 5e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7106744050979614, + "num_tokens": 411799604.0, + "step": 15917 + }, + { + "epoch": 1.7480781902042608, + "grad_norm": 1.5439057350158691, + "learning_rate": 5e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7310718297958374, + "num_tokens": 411833824.0, + "step": 15918 + }, + { + "epoch": 1.7481880079068746, + "grad_norm": 1.9209694862365723, + "learning_rate": 5e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.735282301902771, + "num_tokens": 411857394.0, + "step": 15919 + }, + { + "epoch": 1.7482978256094883, + "grad_norm": 1.921543002128601, + "learning_rate": 5e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7330676317214966, + "num_tokens": 411881046.0, + "step": 15920 + }, + { + "epoch": 1.7484076433121019, + "grad_norm": 2.0023558139801025, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.769180178642273, + "num_tokens": 411902119.0, + "step": 15921 + }, + { + "epoch": 1.7485174610147156, + "grad_norm": 1.8448165655136108, + "learning_rate": 5e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7052983045578003, + "num_tokens": 411927904.0, + "step": 15922 + }, + { + "epoch": 1.7486272787173291, + "grad_norm": 1.6036946773529053, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7390227317810059, + "num_tokens": 411957848.0, + "step": 15923 + }, + { + "epoch": 1.748737096419943, + "grad_norm": 1.7410614490509033, + "learning_rate": 5e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7467560768127441, + "num_tokens": 411984402.0, + "step": 15924 + }, + { + "epoch": 1.7488469141225567, + "grad_norm": 1.8232463598251343, + "learning_rate": 5e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.716500997543335, + "num_tokens": 412013754.0, + "step": 15925 + }, + { + "epoch": 1.7489567318251702, + "grad_norm": 1.8837491273880005, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7615447640419006, + "num_tokens": 412036007.0, + "step": 15926 + }, + { + "epoch": 1.7490665495277837, + "grad_norm": 1.8619846105575562, + "learning_rate": 5e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7429323792457581, + "num_tokens": 412060310.0, + "step": 15927 + }, + { + "epoch": 1.7491763672303975, + "grad_norm": 2.038386821746826, + "learning_rate": 5e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7330564260482788, + "num_tokens": 412081560.0, + "step": 15928 + }, + { + "epoch": 1.7492861849330112, + "grad_norm": 2.2521812915802, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7330042719841003, + "num_tokens": 412099718.0, + "step": 15929 + }, + { + "epoch": 1.749396002635625, + "grad_norm": 1.8277331590652466, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7344439029693604, + "num_tokens": 412123901.0, + "step": 15930 + }, + { + "epoch": 1.7495058203382385, + "grad_norm": 1.726908802986145, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7143501043319702, + "num_tokens": 412153220.0, + "step": 15931 + }, + { + "epoch": 1.749615638040852, + "grad_norm": 1.7649619579315186, + "learning_rate": 5e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7420200705528259, + "num_tokens": 412179917.0, + "step": 15932 + }, + { + "epoch": 1.7497254557434658, + "grad_norm": 1.9931845664978027, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7375358939170837, + "num_tokens": 412203046.0, + "step": 15933 + }, + { + "epoch": 1.7498352734460796, + "grad_norm": 1.6729387044906616, + "learning_rate": 5e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.724864661693573, + "num_tokens": 412231946.0, + "step": 15934 + }, + { + "epoch": 1.7499450911486931, + "grad_norm": 1.7217615842819214, + "learning_rate": 5e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7235262393951416, + "num_tokens": 412260014.0, + "step": 15935 + }, + { + "epoch": 1.7500549088513069, + "grad_norm": 1.8040797710418701, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.726578414440155, + "num_tokens": 412286391.0, + "step": 15936 + }, + { + "epoch": 1.7501647265539204, + "grad_norm": 1.83554208278656, + "learning_rate": 5e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7335689663887024, + "num_tokens": 412310666.0, + "step": 15937 + }, + { + "epoch": 1.7502745442565342, + "grad_norm": 1.6961421966552734, + "learning_rate": 5e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.744269847869873, + "num_tokens": 412339916.0, + "step": 15938 + }, + { + "epoch": 1.750384361959148, + "grad_norm": 1.7774943113327026, + "learning_rate": 5e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.707526445388794, + "num_tokens": 412367583.0, + "step": 15939 + }, + { + "epoch": 1.7504941796617615, + "grad_norm": 1.8289563655853271, + "learning_rate": 5e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7139825820922852, + "num_tokens": 412398342.0, + "step": 15940 + }, + { + "epoch": 1.750603997364375, + "grad_norm": 1.806693196296692, + "learning_rate": 5e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7126680612564087, + "num_tokens": 412426524.0, + "step": 15941 + }, + { + "epoch": 1.7507138150669888, + "grad_norm": 1.9728922843933105, + "learning_rate": 5e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7249379754066467, + "num_tokens": 412449395.0, + "step": 15942 + }, + { + "epoch": 1.7508236327696025, + "grad_norm": 1.770660400390625, + "learning_rate": 5e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7322179079055786, + "num_tokens": 412476421.0, + "step": 15943 + }, + { + "epoch": 1.7509334504722163, + "grad_norm": 1.542603611946106, + "learning_rate": 5e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7276317477226257, + "num_tokens": 412510866.0, + "step": 15944 + }, + { + "epoch": 1.7510432681748298, + "grad_norm": 1.6829122304916382, + "learning_rate": 5e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.741591215133667, + "num_tokens": 412541573.0, + "step": 15945 + }, + { + "epoch": 1.7511530858774433, + "grad_norm": 2.060093641281128, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7567085027694702, + "num_tokens": 412561001.0, + "step": 15946 + }, + { + "epoch": 1.751262903580057, + "grad_norm": 1.9137063026428223, + "learning_rate": 5e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7246606349945068, + "num_tokens": 412584844.0, + "step": 15947 + }, + { + "epoch": 1.7513727212826709, + "grad_norm": 2.0020389556884766, + "learning_rate": 5e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7360789179801941, + "num_tokens": 412608494.0, + "step": 15948 + }, + { + "epoch": 1.7514825389852844, + "grad_norm": 1.9338167905807495, + "learning_rate": 5e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7247340083122253, + "num_tokens": 412631430.0, + "step": 15949 + }, + { + "epoch": 1.7515923566878981, + "grad_norm": 1.6929192543029785, + "learning_rate": 5e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7169969081878662, + "num_tokens": 412658463.0, + "step": 15950 + }, + { + "epoch": 1.7517021743905117, + "grad_norm": 1.4705719947814941, + "learning_rate": 5e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.698050856590271, + "num_tokens": 412698959.0, + "step": 15951 + }, + { + "epoch": 1.7518119920931254, + "grad_norm": 1.5753835439682007, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7364478707313538, + "num_tokens": 412729330.0, + "step": 15952 + }, + { + "epoch": 1.7519218097957392, + "grad_norm": 1.685261845588684, + "learning_rate": 5e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7128140926361084, + "num_tokens": 412760870.0, + "step": 15953 + }, + { + "epoch": 1.7520316274983527, + "grad_norm": 1.8946186304092407, + "learning_rate": 5e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7153379917144775, + "num_tokens": 412784773.0, + "step": 15954 + }, + { + "epoch": 1.7521414452009663, + "grad_norm": 1.7642111778259277, + "learning_rate": 5e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7362260222434998, + "num_tokens": 412814968.0, + "step": 15955 + }, + { + "epoch": 1.75225126290358, + "grad_norm": 1.802155613899231, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7503499984741211, + "num_tokens": 412838046.0, + "step": 15956 + }, + { + "epoch": 1.7523610806061938, + "grad_norm": 1.8642778396606445, + "learning_rate": 5e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7411402463912964, + "num_tokens": 412864052.0, + "step": 15957 + }, + { + "epoch": 1.7524708983088075, + "grad_norm": 2.0193750858306885, + "learning_rate": 5e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7497977614402771, + "num_tokens": 412882875.0, + "step": 15958 + }, + { + "epoch": 1.752580716011421, + "grad_norm": 1.802077054977417, + "learning_rate": 5e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7492818832397461, + "num_tokens": 412905727.0, + "step": 15959 + }, + { + "epoch": 1.7526905337140346, + "grad_norm": 1.8707799911499023, + "learning_rate": 5e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7442256212234497, + "num_tokens": 412929368.0, + "step": 15960 + }, + { + "epoch": 1.7528003514166484, + "grad_norm": 1.6790053844451904, + "learning_rate": 5e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.7498949766159058, + "num_tokens": 412955560.0, + "step": 15961 + }, + { + "epoch": 1.7529101691192621, + "grad_norm": 1.8042092323303223, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7339454293251038, + "num_tokens": 412981249.0, + "step": 15962 + }, + { + "epoch": 1.7530199868218757, + "grad_norm": 1.7809152603149414, + "learning_rate": 5e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7529541254043579, + "num_tokens": 413005854.0, + "step": 15963 + }, + { + "epoch": 1.7531298045244892, + "grad_norm": 1.769524335861206, + "learning_rate": 5e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7270241379737854, + "num_tokens": 413032569.0, + "step": 15964 + }, + { + "epoch": 1.753239622227103, + "grad_norm": 1.7065069675445557, + "learning_rate": 5e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7126580476760864, + "num_tokens": 413061842.0, + "step": 15965 + }, + { + "epoch": 1.7533494399297167, + "grad_norm": 1.8382084369659424, + "learning_rate": 5e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.6990451812744141, + "num_tokens": 413087069.0, + "step": 15966 + }, + { + "epoch": 1.7534592576323305, + "grad_norm": 1.8228262662887573, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7343733310699463, + "num_tokens": 413111109.0, + "step": 15967 + }, + { + "epoch": 1.753569075334944, + "grad_norm": 1.8910799026489258, + "learning_rate": 5e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7375284433364868, + "num_tokens": 413135078.0, + "step": 15968 + }, + { + "epoch": 1.7536788930375575, + "grad_norm": 1.9184895753860474, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7615053653717041, + "num_tokens": 413158696.0, + "step": 15969 + }, + { + "epoch": 1.7537887107401713, + "grad_norm": 1.6436264514923096, + "learning_rate": 5e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7262064218521118, + "num_tokens": 413188653.0, + "step": 15970 + }, + { + "epoch": 1.753898528442785, + "grad_norm": 1.7537667751312256, + "learning_rate": 5e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7208173274993896, + "num_tokens": 413218621.0, + "step": 15971 + }, + { + "epoch": 1.7540083461453988, + "grad_norm": 1.7955679893493652, + "learning_rate": 5e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7280596494674683, + "num_tokens": 413246961.0, + "step": 15972 + }, + { + "epoch": 1.7541181638480123, + "grad_norm": 1.7726686000823975, + "learning_rate": 5e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7186429500579834, + "num_tokens": 413273481.0, + "step": 15973 + }, + { + "epoch": 1.7542279815506259, + "grad_norm": 1.7651559114456177, + "learning_rate": 5e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7250159382820129, + "num_tokens": 413300211.0, + "step": 15974 + }, + { + "epoch": 1.7543377992532396, + "grad_norm": 1.6784013509750366, + "learning_rate": 5e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7175867557525635, + "num_tokens": 413332547.0, + "step": 15975 + }, + { + "epoch": 1.7544476169558534, + "grad_norm": 2.249133825302124, + "learning_rate": 5e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7522847652435303, + "num_tokens": 413348993.0, + "step": 15976 + }, + { + "epoch": 1.754557434658467, + "grad_norm": 1.7550612688064575, + "learning_rate": 5e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7404752373695374, + "num_tokens": 413377890.0, + "step": 15977 + }, + { + "epoch": 1.7546672523610805, + "grad_norm": 1.7914085388183594, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7188215255737305, + "num_tokens": 413405060.0, + "step": 15978 + }, + { + "epoch": 1.7547770700636942, + "grad_norm": 1.9339200258255005, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7286192178726196, + "num_tokens": 413428417.0, + "step": 15979 + }, + { + "epoch": 1.754886887766308, + "grad_norm": 1.7324841022491455, + "learning_rate": 5e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7140489816665649, + "num_tokens": 413459168.0, + "step": 15980 + }, + { + "epoch": 1.7549967054689217, + "grad_norm": 1.8574590682983398, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.729584276676178, + "num_tokens": 413482635.0, + "step": 15981 + }, + { + "epoch": 1.7551065231715353, + "grad_norm": 1.8152014017105103, + "learning_rate": 5e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7177123427391052, + "num_tokens": 413508933.0, + "step": 15982 + }, + { + "epoch": 1.7552163408741488, + "grad_norm": 1.6897506713867188, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7373868227005005, + "num_tokens": 413535634.0, + "step": 15983 + }, + { + "epoch": 1.7553261585767626, + "grad_norm": 1.8729358911514282, + "learning_rate": 5e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7410539984703064, + "num_tokens": 413559790.0, + "step": 15984 + }, + { + "epoch": 1.7554359762793763, + "grad_norm": 1.7301175594329834, + "learning_rate": 5e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7216501832008362, + "num_tokens": 413587337.0, + "step": 15985 + }, + { + "epoch": 1.7555457939819898, + "grad_norm": 1.8027623891830444, + "learning_rate": 5e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7228981852531433, + "num_tokens": 413612911.0, + "step": 15986 + }, + { + "epoch": 1.7556556116846036, + "grad_norm": 2.0052847862243652, + "learning_rate": 5e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7347557544708252, + "num_tokens": 413636422.0, + "step": 15987 + }, + { + "epoch": 1.7557654293872171, + "grad_norm": 1.6344558000564575, + "learning_rate": 5e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7317346334457397, + "num_tokens": 413667180.0, + "step": 15988 + }, + { + "epoch": 1.755875247089831, + "grad_norm": 1.7270705699920654, + "learning_rate": 5e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.720816969871521, + "num_tokens": 413694823.0, + "step": 15989 + }, + { + "epoch": 1.7559850647924446, + "grad_norm": 1.761030912399292, + "learning_rate": 5e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.728632926940918, + "num_tokens": 413721504.0, + "step": 15990 + }, + { + "epoch": 1.7560948824950582, + "grad_norm": 1.8737363815307617, + "learning_rate": 5e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.712888240814209, + "num_tokens": 413746642.0, + "step": 15991 + }, + { + "epoch": 1.7562047001976717, + "grad_norm": 1.8161084651947021, + "learning_rate": 5e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7200486660003662, + "num_tokens": 413771782.0, + "step": 15992 + }, + { + "epoch": 1.7563145179002855, + "grad_norm": 1.717544674873352, + "learning_rate": 5e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7532918453216553, + "num_tokens": 413799674.0, + "step": 15993 + }, + { + "epoch": 1.7564243356028992, + "grad_norm": 2.1143438816070557, + "learning_rate": 5e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7211154699325562, + "num_tokens": 413821766.0, + "step": 15994 + }, + { + "epoch": 1.756534153305513, + "grad_norm": 2.0676169395446777, + "learning_rate": 5e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7274640202522278, + "num_tokens": 413842286.0, + "step": 15995 + }, + { + "epoch": 1.7566439710081265, + "grad_norm": 1.643510341644287, + "learning_rate": 5e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7548946142196655, + "num_tokens": 413872029.0, + "step": 15996 + }, + { + "epoch": 1.75675378871074, + "grad_norm": 1.9985333681106567, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7370462417602539, + "num_tokens": 413892603.0, + "step": 15997 + }, + { + "epoch": 1.7568636064133538, + "grad_norm": 1.678633689880371, + "learning_rate": 5e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7280356884002686, + "num_tokens": 413922551.0, + "step": 15998 + }, + { + "epoch": 1.7569734241159676, + "grad_norm": 1.736423373222351, + "learning_rate": 5e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7229213118553162, + "num_tokens": 413949649.0, + "step": 15999 + }, + { + "epoch": 1.757083241818581, + "grad_norm": 1.8048211336135864, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7527116537094116, + "num_tokens": 413972558.0, + "step": 16000 + }, + { + "epoch": 1.7571930595211949, + "grad_norm": 1.7478927373886108, + "learning_rate": 5e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7393784523010254, + "num_tokens": 413999512.0, + "step": 16001 + }, + { + "epoch": 1.7573028772238084, + "grad_norm": 2.0061991214752197, + "learning_rate": 5e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7478699684143066, + "num_tokens": 414020053.0, + "step": 16002 + }, + { + "epoch": 1.7574126949264222, + "grad_norm": 1.723862648010254, + "learning_rate": 5e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7064045071601868, + "num_tokens": 414047935.0, + "step": 16003 + }, + { + "epoch": 1.757522512629036, + "grad_norm": 1.7610909938812256, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7246662378311157, + "num_tokens": 414073523.0, + "step": 16004 + }, + { + "epoch": 1.7576323303316495, + "grad_norm": 1.543362021446228, + "learning_rate": 5e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7064347267150879, + "num_tokens": 414108870.0, + "step": 16005 + }, + { + "epoch": 1.757742148034263, + "grad_norm": 1.6722466945648193, + "learning_rate": 5e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7307683229446411, + "num_tokens": 414139834.0, + "step": 16006 + }, + { + "epoch": 1.7578519657368767, + "grad_norm": 1.8896533250808716, + "learning_rate": 5e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7524581551551819, + "num_tokens": 414160627.0, + "step": 16007 + }, + { + "epoch": 1.7579617834394905, + "grad_norm": 1.7153788805007935, + "learning_rate": 5e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7153027057647705, + "num_tokens": 414187784.0, + "step": 16008 + }, + { + "epoch": 1.7580716011421043, + "grad_norm": 1.7535042762756348, + "learning_rate": 5e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7192795872688293, + "num_tokens": 414214163.0, + "step": 16009 + }, + { + "epoch": 1.7581814188447178, + "grad_norm": 1.9363514184951782, + "learning_rate": 5e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7242181897163391, + "num_tokens": 414237111.0, + "step": 16010 + }, + { + "epoch": 1.7582912365473313, + "grad_norm": 1.6968907117843628, + "learning_rate": 5e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7323096990585327, + "num_tokens": 414266847.0, + "step": 16011 + }, + { + "epoch": 1.758401054249945, + "grad_norm": 1.6379714012145996, + "learning_rate": 5e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7268220782279968, + "num_tokens": 414298605.0, + "step": 16012 + }, + { + "epoch": 1.7585108719525588, + "grad_norm": 1.9835665225982666, + "learning_rate": 5e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7509941458702087, + "num_tokens": 414319136.0, + "step": 16013 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 1.7519538402557373, + "learning_rate": 5e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7265965342521667, + "num_tokens": 414345953.0, + "step": 16014 + }, + { + "epoch": 1.758730507357786, + "grad_norm": 1.9356733560562134, + "learning_rate": 5e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7373343110084534, + "num_tokens": 414369238.0, + "step": 16015 + }, + { + "epoch": 1.7588403250603997, + "grad_norm": 1.6965012550354004, + "learning_rate": 5e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7172936201095581, + "num_tokens": 414399735.0, + "step": 16016 + }, + { + "epoch": 1.7589501427630134, + "grad_norm": 1.7369636297225952, + "learning_rate": 5e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7406162023544312, + "num_tokens": 414426708.0, + "step": 16017 + }, + { + "epoch": 1.7590599604656272, + "grad_norm": 2.015594720840454, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7565127611160278, + "num_tokens": 414447534.0, + "step": 16018 + }, + { + "epoch": 1.7591697781682407, + "grad_norm": 1.9844874143600464, + "learning_rate": 5e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7384534478187561, + "num_tokens": 414469888.0, + "step": 16019 + }, + { + "epoch": 1.7592795958708543, + "grad_norm": 1.935511589050293, + "learning_rate": 5e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7482969760894775, + "num_tokens": 414490929.0, + "step": 16020 + }, + { + "epoch": 1.759389413573468, + "grad_norm": 1.753069519996643, + "learning_rate": 5e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7469408512115479, + "num_tokens": 414514489.0, + "step": 16021 + }, + { + "epoch": 1.7594992312760818, + "grad_norm": 1.7012327909469604, + "learning_rate": 5e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7414758205413818, + "num_tokens": 414542193.0, + "step": 16022 + }, + { + "epoch": 1.7596090489786955, + "grad_norm": 1.8077239990234375, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7505540251731873, + "num_tokens": 414566055.0, + "step": 16023 + }, + { + "epoch": 1.759718866681309, + "grad_norm": 1.8637945652008057, + "learning_rate": 5e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7269134521484375, + "num_tokens": 414592208.0, + "step": 16024 + }, + { + "epoch": 1.7598286843839226, + "grad_norm": 1.9109506607055664, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7267025709152222, + "num_tokens": 414616190.0, + "step": 16025 + }, + { + "epoch": 1.7599385020865363, + "grad_norm": 1.803346037864685, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7512832880020142, + "num_tokens": 414642915.0, + "step": 16026 + }, + { + "epoch": 1.76004831978915, + "grad_norm": 1.745255708694458, + "learning_rate": 5e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7310380339622498, + "num_tokens": 414671219.0, + "step": 16027 + }, + { + "epoch": 1.7601581374917636, + "grad_norm": 1.7568539381027222, + "learning_rate": 5e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7260971069335938, + "num_tokens": 414699547.0, + "step": 16028 + }, + { + "epoch": 1.7602679551943772, + "grad_norm": 1.975124716758728, + "learning_rate": 5e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7307270169258118, + "num_tokens": 414722821.0, + "step": 16029 + }, + { + "epoch": 1.760377772896991, + "grad_norm": 1.8692368268966675, + "learning_rate": 5e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7254437804222107, + "num_tokens": 414747436.0, + "step": 16030 + }, + { + "epoch": 1.7604875905996047, + "grad_norm": 1.583885669708252, + "learning_rate": 5e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7275011539459229, + "num_tokens": 414777291.0, + "step": 16031 + }, + { + "epoch": 1.7605974083022184, + "grad_norm": 1.6868929862976074, + "learning_rate": 5e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.731061577796936, + "num_tokens": 414806526.0, + "step": 16032 + }, + { + "epoch": 1.760707226004832, + "grad_norm": 1.9189329147338867, + "learning_rate": 5e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7454947233200073, + "num_tokens": 414829337.0, + "step": 16033 + }, + { + "epoch": 1.7608170437074455, + "grad_norm": 1.9904992580413818, + "learning_rate": 5e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.7388015985488892, + "num_tokens": 414852026.0, + "step": 16034 + }, + { + "epoch": 1.7609268614100593, + "grad_norm": 2.001457452774048, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7264920473098755, + "num_tokens": 414878589.0, + "step": 16035 + }, + { + "epoch": 1.761036679112673, + "grad_norm": 1.9088728427886963, + "learning_rate": 5e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7169100046157837, + "num_tokens": 414902501.0, + "step": 16036 + }, + { + "epoch": 1.7611464968152868, + "grad_norm": 1.861456036567688, + "learning_rate": 5e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7360961437225342, + "num_tokens": 414928696.0, + "step": 16037 + }, + { + "epoch": 1.7612563145179003, + "grad_norm": 1.7749826908111572, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7168579697608948, + "num_tokens": 414955074.0, + "step": 16038 + }, + { + "epoch": 1.7613661322205139, + "grad_norm": 2.092921495437622, + "learning_rate": 5e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.756433367729187, + "num_tokens": 414973591.0, + "step": 16039 + }, + { + "epoch": 1.7614759499231276, + "grad_norm": 2.0585975646972656, + "learning_rate": 5e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.762057363986969, + "num_tokens": 414992577.0, + "step": 16040 + }, + { + "epoch": 1.7615857676257414, + "grad_norm": 1.8912944793701172, + "learning_rate": 5e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7266721725463867, + "num_tokens": 415016008.0, + "step": 16041 + }, + { + "epoch": 1.761695585328355, + "grad_norm": 1.7133023738861084, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7218909859657288, + "num_tokens": 415044253.0, + "step": 16042 + }, + { + "epoch": 1.7618054030309684, + "grad_norm": 1.6270711421966553, + "learning_rate": 5e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7275846004486084, + "num_tokens": 415073529.0, + "step": 16043 + }, + { + "epoch": 1.7619152207335822, + "grad_norm": 1.8820676803588867, + "learning_rate": 5e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7335375547409058, + "num_tokens": 415098063.0, + "step": 16044 + }, + { + "epoch": 1.762025038436196, + "grad_norm": 1.9840073585510254, + "learning_rate": 5e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7433568239212036, + "num_tokens": 415118950.0, + "step": 16045 + }, + { + "epoch": 1.7621348561388097, + "grad_norm": 1.74869704246521, + "learning_rate": 5e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7569562196731567, + "num_tokens": 415142623.0, + "step": 16046 + }, + { + "epoch": 1.7622446738414232, + "grad_norm": 2.0104238986968994, + "learning_rate": 5e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7211339473724365, + "num_tokens": 415163961.0, + "step": 16047 + }, + { + "epoch": 1.7623544915440368, + "grad_norm": 1.6841155290603638, + "learning_rate": 5e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7257789373397827, + "num_tokens": 415191996.0, + "step": 16048 + }, + { + "epoch": 1.7624643092466505, + "grad_norm": 1.7099617719650269, + "learning_rate": 5e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7272759675979614, + "num_tokens": 415221379.0, + "step": 16049 + }, + { + "epoch": 1.7625741269492643, + "grad_norm": 1.6143629550933838, + "learning_rate": 5e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7253391146659851, + "num_tokens": 415253494.0, + "step": 16050 + }, + { + "epoch": 1.7626839446518778, + "grad_norm": 1.858825445175171, + "learning_rate": 5e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7473248839378357, + "num_tokens": 415277793.0, + "step": 16051 + }, + { + "epoch": 1.7627937623544916, + "grad_norm": 1.73601233959198, + "learning_rate": 5e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7236121892929077, + "num_tokens": 415305088.0, + "step": 16052 + }, + { + "epoch": 1.7629035800571051, + "grad_norm": 1.80157470703125, + "learning_rate": 5e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.713984489440918, + "num_tokens": 415330667.0, + "step": 16053 + }, + { + "epoch": 1.7630133977597189, + "grad_norm": 1.882060170173645, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7328432202339172, + "num_tokens": 415355190.0, + "step": 16054 + }, + { + "epoch": 1.7631232154623326, + "grad_norm": 1.8913325071334839, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7583662867546082, + "num_tokens": 415377228.0, + "step": 16055 + }, + { + "epoch": 1.7632330331649462, + "grad_norm": 2.088794231414795, + "learning_rate": 5e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7361600399017334, + "num_tokens": 415399324.0, + "step": 16056 + }, + { + "epoch": 1.7633428508675597, + "grad_norm": 1.7498140335083008, + "learning_rate": 5e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7128166556358337, + "num_tokens": 415427527.0, + "step": 16057 + }, + { + "epoch": 1.7634526685701735, + "grad_norm": 1.8619598150253296, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7322369813919067, + "num_tokens": 415451200.0, + "step": 16058 + }, + { + "epoch": 1.7635624862727872, + "grad_norm": 1.9334017038345337, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7253296375274658, + "num_tokens": 415474659.0, + "step": 16059 + }, + { + "epoch": 1.763672303975401, + "grad_norm": 1.9660451412200928, + "learning_rate": 5e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.712518036365509, + "num_tokens": 415500399.0, + "step": 16060 + }, + { + "epoch": 1.7637821216780145, + "grad_norm": 2.0662906169891357, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7588775753974915, + "num_tokens": 415520991.0, + "step": 16061 + }, + { + "epoch": 1.763891939380628, + "grad_norm": 1.7477608919143677, + "learning_rate": 5e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7131024599075317, + "num_tokens": 415551486.0, + "step": 16062 + }, + { + "epoch": 1.7640017570832418, + "grad_norm": 1.831034779548645, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7281506061553955, + "num_tokens": 415577365.0, + "step": 16063 + }, + { + "epoch": 1.7641115747858556, + "grad_norm": 1.7782198190689087, + "learning_rate": 5e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7248101234436035, + "num_tokens": 415603485.0, + "step": 16064 + }, + { + "epoch": 1.764221392488469, + "grad_norm": 1.930306077003479, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7623881101608276, + "num_tokens": 415625690.0, + "step": 16065 + }, + { + "epoch": 1.7643312101910829, + "grad_norm": 1.921409010887146, + "learning_rate": 5e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.735450267791748, + "num_tokens": 415650401.0, + "step": 16066 + }, + { + "epoch": 1.7644410278936964, + "grad_norm": 1.8318687677383423, + "learning_rate": 5e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7268809080123901, + "num_tokens": 415676506.0, + "step": 16067 + }, + { + "epoch": 1.7645508455963101, + "grad_norm": 1.7603248357772827, + "learning_rate": 5e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7162582874298096, + "num_tokens": 415703518.0, + "step": 16068 + }, + { + "epoch": 1.764660663298924, + "grad_norm": 1.7809432744979858, + "learning_rate": 5e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7211527824401855, + "num_tokens": 415733931.0, + "step": 16069 + }, + { + "epoch": 1.7647704810015374, + "grad_norm": 1.8107291460037231, + "learning_rate": 5e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7236529588699341, + "num_tokens": 415758992.0, + "step": 16070 + }, + { + "epoch": 1.764880298704151, + "grad_norm": 1.8242297172546387, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7496187686920166, + "num_tokens": 415782961.0, + "step": 16071 + }, + { + "epoch": 1.7649901164067647, + "grad_norm": 1.8112202882766724, + "learning_rate": 5e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7459651231765747, + "num_tokens": 415807532.0, + "step": 16072 + }, + { + "epoch": 1.7650999341093785, + "grad_norm": 1.7698431015014648, + "learning_rate": 5e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7183789610862732, + "num_tokens": 415835226.0, + "step": 16073 + }, + { + "epoch": 1.7652097518119922, + "grad_norm": 1.9829877614974976, + "learning_rate": 5e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7397540807723999, + "num_tokens": 415857716.0, + "step": 16074 + }, + { + "epoch": 1.7653195695146058, + "grad_norm": 1.708522081375122, + "learning_rate": 5e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.711206316947937, + "num_tokens": 415884594.0, + "step": 16075 + }, + { + "epoch": 1.7654293872172193, + "grad_norm": 1.8513591289520264, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7580533027648926, + "num_tokens": 415909555.0, + "step": 16076 + }, + { + "epoch": 1.765539204919833, + "grad_norm": 2.0145061016082764, + "learning_rate": 5e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7152162790298462, + "num_tokens": 415933335.0, + "step": 16077 + }, + { + "epoch": 1.7656490226224468, + "grad_norm": 1.9732900857925415, + "learning_rate": 5e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7242840528488159, + "num_tokens": 415956174.0, + "step": 16078 + }, + { + "epoch": 1.7657588403250604, + "grad_norm": 1.8523904085159302, + "learning_rate": 5e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7069307565689087, + "num_tokens": 415983666.0, + "step": 16079 + }, + { + "epoch": 1.765868658027674, + "grad_norm": 1.9386297464370728, + "learning_rate": 5e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7382709980010986, + "num_tokens": 416006638.0, + "step": 16080 + }, + { + "epoch": 1.7659784757302877, + "grad_norm": 1.8802087306976318, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7256293296813965, + "num_tokens": 416032630.0, + "step": 16081 + }, + { + "epoch": 1.7660882934329014, + "grad_norm": 2.112536668777466, + "learning_rate": 5e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7496929168701172, + "num_tokens": 416050973.0, + "step": 16082 + }, + { + "epoch": 1.7661981111355152, + "grad_norm": 1.8742305040359497, + "learning_rate": 5e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7311937808990479, + "num_tokens": 416075912.0, + "step": 16083 + }, + { + "epoch": 1.7663079288381287, + "grad_norm": 1.6924757957458496, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7327434420585632, + "num_tokens": 416103560.0, + "step": 16084 + }, + { + "epoch": 1.7664177465407422, + "grad_norm": 1.8390573263168335, + "learning_rate": 5e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7365249395370483, + "num_tokens": 416126640.0, + "step": 16085 + }, + { + "epoch": 1.766527564243356, + "grad_norm": 2.046126127243042, + "learning_rate": 5e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7435927987098694, + "num_tokens": 416147063.0, + "step": 16086 + }, + { + "epoch": 1.7666373819459698, + "grad_norm": 1.6857898235321045, + "learning_rate": 5e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7178842425346375, + "num_tokens": 416175279.0, + "step": 16087 + }, + { + "epoch": 1.7667471996485835, + "grad_norm": 1.7574812173843384, + "learning_rate": 5e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7110790014266968, + "num_tokens": 416200412.0, + "step": 16088 + }, + { + "epoch": 1.766857017351197, + "grad_norm": 1.7991291284561157, + "learning_rate": 5e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7442366480827332, + "num_tokens": 416226296.0, + "step": 16089 + }, + { + "epoch": 1.7669668350538106, + "grad_norm": 1.6692363023757935, + "learning_rate": 5e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7073900699615479, + "num_tokens": 416257836.0, + "step": 16090 + }, + { + "epoch": 1.7670766527564243, + "grad_norm": 1.7854046821594238, + "learning_rate": 5e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.757931113243103, + "num_tokens": 416284301.0, + "step": 16091 + }, + { + "epoch": 1.767186470459038, + "grad_norm": 1.8483340740203857, + "learning_rate": 5e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7254602909088135, + "num_tokens": 416309205.0, + "step": 16092 + }, + { + "epoch": 1.7672962881616516, + "grad_norm": 1.8622283935546875, + "learning_rate": 5e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.72365403175354, + "num_tokens": 416335413.0, + "step": 16093 + }, + { + "epoch": 1.7674061058642652, + "grad_norm": 1.755397915840149, + "learning_rate": 5e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7469557523727417, + "num_tokens": 416360738.0, + "step": 16094 + }, + { + "epoch": 1.767515923566879, + "grad_norm": 1.6896198987960815, + "learning_rate": 5e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7243900895118713, + "num_tokens": 416390925.0, + "step": 16095 + }, + { + "epoch": 1.7676257412694927, + "grad_norm": 1.9286680221557617, + "learning_rate": 5e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7366243004798889, + "num_tokens": 416414319.0, + "step": 16096 + }, + { + "epoch": 1.7677355589721064, + "grad_norm": 1.725014567375183, + "learning_rate": 5e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7063723802566528, + "num_tokens": 416443126.0, + "step": 16097 + }, + { + "epoch": 1.76784537667472, + "grad_norm": 2.2036502361297607, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7430984377861023, + "num_tokens": 416461745.0, + "step": 16098 + }, + { + "epoch": 1.7679551943773335, + "grad_norm": 1.7724932432174683, + "learning_rate": 5e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7576347589492798, + "num_tokens": 416486062.0, + "step": 16099 + }, + { + "epoch": 1.7680650120799473, + "grad_norm": 1.8770136833190918, + "learning_rate": 5e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7124930620193481, + "num_tokens": 416510711.0, + "step": 16100 + }, + { + "epoch": 1.768174829782561, + "grad_norm": 1.9036319255828857, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7223227620124817, + "num_tokens": 416534999.0, + "step": 16101 + }, + { + "epoch": 1.7682846474851748, + "grad_norm": 1.5897737741470337, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7306927442550659, + "num_tokens": 416566247.0, + "step": 16102 + }, + { + "epoch": 1.7683944651877883, + "grad_norm": 2.1687822341918945, + "learning_rate": 5e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.737651526927948, + "num_tokens": 416585401.0, + "step": 16103 + }, + { + "epoch": 1.7685042828904018, + "grad_norm": 1.8145958185195923, + "learning_rate": 5e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7142045497894287, + "num_tokens": 416612848.0, + "step": 16104 + }, + { + "epoch": 1.7686141005930156, + "grad_norm": 1.8587422370910645, + "learning_rate": 5e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7404506206512451, + "num_tokens": 416637814.0, + "step": 16105 + }, + { + "epoch": 1.7687239182956294, + "grad_norm": 1.7629469633102417, + "learning_rate": 5e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7403444647789001, + "num_tokens": 416663278.0, + "step": 16106 + }, + { + "epoch": 1.768833735998243, + "grad_norm": 1.6310769319534302, + "learning_rate": 5e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7214841842651367, + "num_tokens": 416691624.0, + "step": 16107 + }, + { + "epoch": 1.7689435537008564, + "grad_norm": 1.78732168674469, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7391687035560608, + "num_tokens": 416715842.0, + "step": 16108 + }, + { + "epoch": 1.7690533714034702, + "grad_norm": 2.0238518714904785, + "learning_rate": 5e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7321823239326477, + "num_tokens": 416737900.0, + "step": 16109 + }, + { + "epoch": 1.769163189106084, + "grad_norm": 1.9129749536514282, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7407310009002686, + "num_tokens": 416759984.0, + "step": 16110 + }, + { + "epoch": 1.7692730068086977, + "grad_norm": 1.9157553911209106, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7339056730270386, + "num_tokens": 416784157.0, + "step": 16111 + }, + { + "epoch": 1.7693828245113112, + "grad_norm": 1.8883339166641235, + "learning_rate": 5e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7254034876823425, + "num_tokens": 416806055.0, + "step": 16112 + }, + { + "epoch": 1.7694926422139248, + "grad_norm": 1.9893962144851685, + "learning_rate": 5e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7484899163246155, + "num_tokens": 416827334.0, + "step": 16113 + }, + { + "epoch": 1.7696024599165385, + "grad_norm": 1.6142187118530273, + "learning_rate": 5e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7258947491645813, + "num_tokens": 416860978.0, + "step": 16114 + }, + { + "epoch": 1.7697122776191523, + "grad_norm": 1.979264259338379, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7388699054718018, + "num_tokens": 416884050.0, + "step": 16115 + }, + { + "epoch": 1.7698220953217658, + "grad_norm": 1.6635419130325317, + "learning_rate": 5e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7311660051345825, + "num_tokens": 416913678.0, + "step": 16116 + }, + { + "epoch": 1.7699319130243796, + "grad_norm": 1.8619866371154785, + "learning_rate": 5e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7069029211997986, + "num_tokens": 416940190.0, + "step": 16117 + }, + { + "epoch": 1.7700417307269931, + "grad_norm": 1.6176739931106567, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7453827261924744, + "num_tokens": 416969756.0, + "step": 16118 + }, + { + "epoch": 1.7701515484296069, + "grad_norm": 1.699155569076538, + "learning_rate": 5e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.6990571618080139, + "num_tokens": 416999500.0, + "step": 16119 + }, + { + "epoch": 1.7702613661322206, + "grad_norm": 1.9370574951171875, + "learning_rate": 5e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.723758339881897, + "num_tokens": 417023978.0, + "step": 16120 + }, + { + "epoch": 1.7703711838348342, + "grad_norm": 1.5981416702270508, + "learning_rate": 5e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7252969741821289, + "num_tokens": 417055098.0, + "step": 16121 + }, + { + "epoch": 1.7704810015374477, + "grad_norm": 1.7709606885910034, + "learning_rate": 5e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7418029308319092, + "num_tokens": 417082632.0, + "step": 16122 + }, + { + "epoch": 1.7705908192400615, + "grad_norm": 1.9154843091964722, + "learning_rate": 5e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7339338660240173, + "num_tokens": 417105516.0, + "step": 16123 + }, + { + "epoch": 1.7707006369426752, + "grad_norm": 1.9221528768539429, + "learning_rate": 5e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7397722005844116, + "num_tokens": 417127718.0, + "step": 16124 + }, + { + "epoch": 1.770810454645289, + "grad_norm": 1.9249591827392578, + "learning_rate": 5e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7361441850662231, + "num_tokens": 417151305.0, + "step": 16125 + }, + { + "epoch": 1.7709202723479025, + "grad_norm": 1.63590669631958, + "learning_rate": 5e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7176762819290161, + "num_tokens": 417180939.0, + "step": 16126 + }, + { + "epoch": 1.771030090050516, + "grad_norm": 1.6911698579788208, + "learning_rate": 5e-06, + "loss": 0.899, + "mean_token_accuracy": 0.717529296875, + "num_tokens": 417210778.0, + "step": 16127 + }, + { + "epoch": 1.7711399077531298, + "grad_norm": 1.8833383321762085, + "learning_rate": 5e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7418385744094849, + "num_tokens": 417235165.0, + "step": 16128 + }, + { + "epoch": 1.7712497254557436, + "grad_norm": 1.7656930685043335, + "learning_rate": 5e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7109501361846924, + "num_tokens": 417263740.0, + "step": 16129 + }, + { + "epoch": 1.771359543158357, + "grad_norm": 1.774685025215149, + "learning_rate": 5e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7431632280349731, + "num_tokens": 417288117.0, + "step": 16130 + }, + { + "epoch": 1.7714693608609708, + "grad_norm": 1.9841949939727783, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7502115368843079, + "num_tokens": 417308933.0, + "step": 16131 + }, + { + "epoch": 1.7715791785635844, + "grad_norm": 1.7088230848312378, + "learning_rate": 5e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7168552875518799, + "num_tokens": 417338250.0, + "step": 16132 + }, + { + "epoch": 1.7716889962661981, + "grad_norm": 1.7858928442001343, + "learning_rate": 5e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7214736342430115, + "num_tokens": 417367800.0, + "step": 16133 + }, + { + "epoch": 1.771798813968812, + "grad_norm": 1.6391953229904175, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7269207239151001, + "num_tokens": 417402473.0, + "step": 16134 + }, + { + "epoch": 1.7719086316714254, + "grad_norm": 1.6402671337127686, + "learning_rate": 5e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.70732581615448, + "num_tokens": 417431795.0, + "step": 16135 + }, + { + "epoch": 1.772018449374039, + "grad_norm": 1.7810574769973755, + "learning_rate": 5e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7555915117263794, + "num_tokens": 417457318.0, + "step": 16136 + }, + { + "epoch": 1.7721282670766527, + "grad_norm": 1.881703495979309, + "learning_rate": 5e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7315778136253357, + "num_tokens": 417480919.0, + "step": 16137 + }, + { + "epoch": 1.7722380847792665, + "grad_norm": 1.851366639137268, + "learning_rate": 5e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7206341624259949, + "num_tokens": 417507244.0, + "step": 16138 + }, + { + "epoch": 1.7723479024818802, + "grad_norm": 1.8430665731430054, + "learning_rate": 5e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7197229266166687, + "num_tokens": 417534103.0, + "step": 16139 + }, + { + "epoch": 1.7724577201844938, + "grad_norm": 1.8154903650283813, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7510215640068054, + "num_tokens": 417558303.0, + "step": 16140 + }, + { + "epoch": 1.7725675378871073, + "grad_norm": 1.640634536743164, + "learning_rate": 5e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7247126698493958, + "num_tokens": 417589957.0, + "step": 16141 + }, + { + "epoch": 1.772677355589721, + "grad_norm": 2.0846168994903564, + "learning_rate": 5e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7233642339706421, + "num_tokens": 417613177.0, + "step": 16142 + }, + { + "epoch": 1.7727871732923348, + "grad_norm": 1.8354524374008179, + "learning_rate": 5e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7130746841430664, + "num_tokens": 417637654.0, + "step": 16143 + }, + { + "epoch": 1.7728969909949484, + "grad_norm": 1.7784078121185303, + "learning_rate": 5e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7243889570236206, + "num_tokens": 417665324.0, + "step": 16144 + }, + { + "epoch": 1.7730068086975619, + "grad_norm": 1.6334573030471802, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7426120638847351, + "num_tokens": 417695680.0, + "step": 16145 + }, + { + "epoch": 1.7731166264001756, + "grad_norm": 1.9368211030960083, + "learning_rate": 5e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7421342730522156, + "num_tokens": 417716666.0, + "step": 16146 + }, + { + "epoch": 1.7732264441027894, + "grad_norm": 1.7577975988388062, + "learning_rate": 5e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.721250593662262, + "num_tokens": 417745168.0, + "step": 16147 + }, + { + "epoch": 1.7733362618054032, + "grad_norm": 1.8417471647262573, + "learning_rate": 5e-06, + "loss": 0.7944, + "mean_token_accuracy": 0.7445248365402222, + "num_tokens": 417768909.0, + "step": 16148 + }, + { + "epoch": 1.7734460795080167, + "grad_norm": 1.8202570676803589, + "learning_rate": 5e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7389183044433594, + "num_tokens": 417794281.0, + "step": 16149 + }, + { + "epoch": 1.7735558972106302, + "grad_norm": 1.688563585281372, + "learning_rate": 5e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7285058498382568, + "num_tokens": 417821563.0, + "step": 16150 + }, + { + "epoch": 1.773665714913244, + "grad_norm": 1.8164668083190918, + "learning_rate": 5e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7338751554489136, + "num_tokens": 417846754.0, + "step": 16151 + }, + { + "epoch": 1.7737755326158577, + "grad_norm": 1.8195991516113281, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7437213063240051, + "num_tokens": 417871095.0, + "step": 16152 + }, + { + "epoch": 1.7738853503184715, + "grad_norm": 1.7990689277648926, + "learning_rate": 5e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7206292152404785, + "num_tokens": 417896045.0, + "step": 16153 + }, + { + "epoch": 1.773995168021085, + "grad_norm": 1.9145374298095703, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7428789734840393, + "num_tokens": 417919812.0, + "step": 16154 + }, + { + "epoch": 1.7741049857236986, + "grad_norm": 1.7629430294036865, + "learning_rate": 5e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7192550897598267, + "num_tokens": 417949638.0, + "step": 16155 + }, + { + "epoch": 1.7742148034263123, + "grad_norm": 2.0411651134490967, + "learning_rate": 5e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7511919736862183, + "num_tokens": 417971173.0, + "step": 16156 + }, + { + "epoch": 1.774324621128926, + "grad_norm": 1.7699110507965088, + "learning_rate": 5e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.717638373374939, + "num_tokens": 418000204.0, + "step": 16157 + }, + { + "epoch": 1.7744344388315396, + "grad_norm": 1.8184765577316284, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7325453162193298, + "num_tokens": 418028203.0, + "step": 16158 + }, + { + "epoch": 1.7745442565341532, + "grad_norm": 1.9335378408432007, + "learning_rate": 5e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7502182722091675, + "num_tokens": 418050163.0, + "step": 16159 + }, + { + "epoch": 1.774654074236767, + "grad_norm": 1.7924836874008179, + "learning_rate": 5e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7563711404800415, + "num_tokens": 418075300.0, + "step": 16160 + }, + { + "epoch": 1.7747638919393807, + "grad_norm": 1.7249584197998047, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7243041396141052, + "num_tokens": 418105601.0, + "step": 16161 + }, + { + "epoch": 1.7748737096419944, + "grad_norm": 2.0226807594299316, + "learning_rate": 5e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7390144467353821, + "num_tokens": 418127446.0, + "step": 16162 + }, + { + "epoch": 1.774983527344608, + "grad_norm": 1.6462150812149048, + "learning_rate": 5e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7264320254325867, + "num_tokens": 418157730.0, + "step": 16163 + }, + { + "epoch": 1.7750933450472215, + "grad_norm": 1.931299090385437, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7207605838775635, + "num_tokens": 418185654.0, + "step": 16164 + }, + { + "epoch": 1.7752031627498353, + "grad_norm": 1.8356245756149292, + "learning_rate": 5e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7401240468025208, + "num_tokens": 418208217.0, + "step": 16165 + }, + { + "epoch": 1.775312980452449, + "grad_norm": 1.5771675109863281, + "learning_rate": 5e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7419768571853638, + "num_tokens": 418238660.0, + "step": 16166 + }, + { + "epoch": 1.7754227981550625, + "grad_norm": 1.7874988317489624, + "learning_rate": 5e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7318786978721619, + "num_tokens": 418266349.0, + "step": 16167 + }, + { + "epoch": 1.7755326158576763, + "grad_norm": 1.804137945175171, + "learning_rate": 5e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7183296084403992, + "num_tokens": 418293746.0, + "step": 16168 + }, + { + "epoch": 1.7756424335602898, + "grad_norm": 1.758399248123169, + "learning_rate": 5e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7167498469352722, + "num_tokens": 418321799.0, + "step": 16169 + }, + { + "epoch": 1.7757522512629036, + "grad_norm": 1.8633286952972412, + "learning_rate": 5e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7308382987976074, + "num_tokens": 418346383.0, + "step": 16170 + }, + { + "epoch": 1.7758620689655173, + "grad_norm": 1.8839330673217773, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7264490127563477, + "num_tokens": 418368903.0, + "step": 16171 + }, + { + "epoch": 1.7759718866681309, + "grad_norm": 1.6741273403167725, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7255282402038574, + "num_tokens": 418397246.0, + "step": 16172 + }, + { + "epoch": 1.7760817043707444, + "grad_norm": 1.5181450843811035, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.744604229927063, + "num_tokens": 418432047.0, + "step": 16173 + }, + { + "epoch": 1.7761915220733582, + "grad_norm": 1.7596443891525269, + "learning_rate": 5e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7086172103881836, + "num_tokens": 418460993.0, + "step": 16174 + }, + { + "epoch": 1.776301339775972, + "grad_norm": 1.6213072538375854, + "learning_rate": 5e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7143750190734863, + "num_tokens": 418494571.0, + "step": 16175 + }, + { + "epoch": 1.7764111574785857, + "grad_norm": 1.782654047012329, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7231377959251404, + "num_tokens": 418523280.0, + "step": 16176 + }, + { + "epoch": 1.7765209751811992, + "grad_norm": 1.730971097946167, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7316884994506836, + "num_tokens": 418551325.0, + "step": 16177 + }, + { + "epoch": 1.7766307928838128, + "grad_norm": 1.7367894649505615, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7336645126342773, + "num_tokens": 418578441.0, + "step": 16178 + }, + { + "epoch": 1.7767406105864265, + "grad_norm": 1.8485809564590454, + "learning_rate": 5e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7368032932281494, + "num_tokens": 418605949.0, + "step": 16179 + }, + { + "epoch": 1.7768504282890403, + "grad_norm": 1.808872103691101, + "learning_rate": 5e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7021359205245972, + "num_tokens": 418635955.0, + "step": 16180 + }, + { + "epoch": 1.7769602459916538, + "grad_norm": 1.8029741048812866, + "learning_rate": 5e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7571610808372498, + "num_tokens": 418660076.0, + "step": 16181 + }, + { + "epoch": 1.7770700636942676, + "grad_norm": 1.9276297092437744, + "learning_rate": 5e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7192782163619995, + "num_tokens": 418685482.0, + "step": 16182 + }, + { + "epoch": 1.777179881396881, + "grad_norm": 1.9567937850952148, + "learning_rate": 5e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7426473498344421, + "num_tokens": 418707903.0, + "step": 16183 + }, + { + "epoch": 1.7772896990994949, + "grad_norm": 1.932002305984497, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7566530704498291, + "num_tokens": 418729076.0, + "step": 16184 + }, + { + "epoch": 1.7773995168021086, + "grad_norm": 2.075794219970703, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7343165278434753, + "num_tokens": 418750493.0, + "step": 16185 + }, + { + "epoch": 1.7775093345047221, + "grad_norm": 2.1014626026153564, + "learning_rate": 5e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7452518939971924, + "num_tokens": 418769752.0, + "step": 16186 + }, + { + "epoch": 1.7776191522073357, + "grad_norm": 1.744654893875122, + "learning_rate": 5e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7347951531410217, + "num_tokens": 418798988.0, + "step": 16187 + }, + { + "epoch": 1.7777289699099494, + "grad_norm": 1.7849674224853516, + "learning_rate": 5e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.7498524188995361, + "num_tokens": 418823815.0, + "step": 16188 + }, + { + "epoch": 1.7778387876125632, + "grad_norm": 1.8376829624176025, + "learning_rate": 5e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7230854034423828, + "num_tokens": 418848619.0, + "step": 16189 + }, + { + "epoch": 1.777948605315177, + "grad_norm": 1.6451081037521362, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7336741089820862, + "num_tokens": 418879109.0, + "step": 16190 + }, + { + "epoch": 1.7780584230177905, + "grad_norm": 2.0335638523101807, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7344845533370972, + "num_tokens": 418900401.0, + "step": 16191 + }, + { + "epoch": 1.778168240720404, + "grad_norm": 1.8186653852462769, + "learning_rate": 5e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7292165756225586, + "num_tokens": 418925756.0, + "step": 16192 + }, + { + "epoch": 1.7782780584230178, + "grad_norm": 1.904991626739502, + "learning_rate": 5e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7160341739654541, + "num_tokens": 418953562.0, + "step": 16193 + }, + { + "epoch": 1.7783878761256315, + "grad_norm": 1.7828037738800049, + "learning_rate": 5e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7235854268074036, + "num_tokens": 418982983.0, + "step": 16194 + }, + { + "epoch": 1.778497693828245, + "grad_norm": 1.986220359802246, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7265801429748535, + "num_tokens": 419006147.0, + "step": 16195 + }, + { + "epoch": 1.7786075115308586, + "grad_norm": 1.5770620107650757, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7292541265487671, + "num_tokens": 419037964.0, + "step": 16196 + }, + { + "epoch": 1.7787173292334724, + "grad_norm": 1.596360445022583, + "learning_rate": 5e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7300196886062622, + "num_tokens": 419071043.0, + "step": 16197 + }, + { + "epoch": 1.7788271469360861, + "grad_norm": 1.8295785188674927, + "learning_rate": 5e-06, + "loss": 0.8081, + "mean_token_accuracy": 0.7365366816520691, + "num_tokens": 419095467.0, + "step": 16198 + }, + { + "epoch": 1.7789369646386999, + "grad_norm": 1.7987720966339111, + "learning_rate": 5e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7458371520042419, + "num_tokens": 419119435.0, + "step": 16199 + }, + { + "epoch": 1.7790467823413134, + "grad_norm": 1.6753002405166626, + "learning_rate": 5e-06, + "loss": 0.906, + "mean_token_accuracy": 0.714329719543457, + "num_tokens": 419148179.0, + "step": 16200 + }, + { + "epoch": 1.779156600043927, + "grad_norm": 1.7037949562072754, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7572433352470398, + "num_tokens": 419174045.0, + "step": 16201 + }, + { + "epoch": 1.7792664177465407, + "grad_norm": 1.6335195302963257, + "learning_rate": 5e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7531101107597351, + "num_tokens": 419202512.0, + "step": 16202 + }, + { + "epoch": 1.7793762354491545, + "grad_norm": 1.6768428087234497, + "learning_rate": 5e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7395031452178955, + "num_tokens": 419232218.0, + "step": 16203 + }, + { + "epoch": 1.7794860531517682, + "grad_norm": 1.9319043159484863, + "learning_rate": 5e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.714138925075531, + "num_tokens": 419255874.0, + "step": 16204 + }, + { + "epoch": 1.7795958708543818, + "grad_norm": 1.9144439697265625, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7430171966552734, + "num_tokens": 419278622.0, + "step": 16205 + }, + { + "epoch": 1.7797056885569953, + "grad_norm": 1.7218775749206543, + "learning_rate": 5e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7040682435035706, + "num_tokens": 419309756.0, + "step": 16206 + }, + { + "epoch": 1.779815506259609, + "grad_norm": 1.879607915878296, + "learning_rate": 5e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7284263968467712, + "num_tokens": 419334143.0, + "step": 16207 + }, + { + "epoch": 1.7799253239622228, + "grad_norm": 1.8897498846054077, + "learning_rate": 5e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7192751169204712, + "num_tokens": 419359050.0, + "step": 16208 + }, + { + "epoch": 1.7800351416648363, + "grad_norm": 2.0248937606811523, + "learning_rate": 5e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7360631227493286, + "num_tokens": 419380489.0, + "step": 16209 + }, + { + "epoch": 1.7801449593674499, + "grad_norm": 2.0170888900756836, + "learning_rate": 5e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7361159324645996, + "num_tokens": 419403276.0, + "step": 16210 + }, + { + "epoch": 1.7802547770700636, + "grad_norm": 1.8198872804641724, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7362627983093262, + "num_tokens": 419427103.0, + "step": 16211 + }, + { + "epoch": 1.7803645947726774, + "grad_norm": 1.6554192304611206, + "learning_rate": 5e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7247455716133118, + "num_tokens": 419457111.0, + "step": 16212 + }, + { + "epoch": 1.7804744124752911, + "grad_norm": 1.6833535432815552, + "learning_rate": 5e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7119346261024475, + "num_tokens": 419487216.0, + "step": 16213 + }, + { + "epoch": 1.7805842301779047, + "grad_norm": 1.6748480796813965, + "learning_rate": 5e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7316843867301941, + "num_tokens": 419514478.0, + "step": 16214 + }, + { + "epoch": 1.7806940478805182, + "grad_norm": 1.614700436592102, + "learning_rate": 5e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7341908812522888, + "num_tokens": 419543525.0, + "step": 16215 + }, + { + "epoch": 1.780803865583132, + "grad_norm": 1.77069091796875, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.715049147605896, + "num_tokens": 419570868.0, + "step": 16216 + }, + { + "epoch": 1.7809136832857457, + "grad_norm": 1.577783226966858, + "learning_rate": 5e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7185877561569214, + "num_tokens": 419604276.0, + "step": 16217 + }, + { + "epoch": 1.7810235009883595, + "grad_norm": 1.882309913635254, + "learning_rate": 5e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7306749820709229, + "num_tokens": 419628043.0, + "step": 16218 + }, + { + "epoch": 1.781133318690973, + "grad_norm": 1.7007808685302734, + "learning_rate": 5e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7193336486816406, + "num_tokens": 419656207.0, + "step": 16219 + }, + { + "epoch": 1.7812431363935866, + "grad_norm": 1.828244924545288, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7501122951507568, + "num_tokens": 419682108.0, + "step": 16220 + }, + { + "epoch": 1.7813529540962003, + "grad_norm": 1.7378108501434326, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7376967668533325, + "num_tokens": 419709677.0, + "step": 16221 + }, + { + "epoch": 1.781462771798814, + "grad_norm": 2.0040900707244873, + "learning_rate": 5e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7347689270973206, + "num_tokens": 419731514.0, + "step": 16222 + }, + { + "epoch": 1.7815725895014276, + "grad_norm": 1.5722200870513916, + "learning_rate": 5e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7019052505493164, + "num_tokens": 419763858.0, + "step": 16223 + }, + { + "epoch": 1.7816824072040411, + "grad_norm": 2.037381887435913, + "learning_rate": 5e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7485630512237549, + "num_tokens": 419784161.0, + "step": 16224 + }, + { + "epoch": 1.781792224906655, + "grad_norm": 1.931674599647522, + "learning_rate": 5e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7524408102035522, + "num_tokens": 419805440.0, + "step": 16225 + }, + { + "epoch": 1.7819020426092687, + "grad_norm": 1.8250447511672974, + "learning_rate": 5e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7563507556915283, + "num_tokens": 419829165.0, + "step": 16226 + }, + { + "epoch": 1.7820118603118824, + "grad_norm": 1.7690415382385254, + "learning_rate": 5e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7282105088233948, + "num_tokens": 419855007.0, + "step": 16227 + }, + { + "epoch": 1.782121678014496, + "grad_norm": 1.6682299375534058, + "learning_rate": 5e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7311530709266663, + "num_tokens": 419884449.0, + "step": 16228 + }, + { + "epoch": 1.7822314957171095, + "grad_norm": 1.720064640045166, + "learning_rate": 5e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7110986709594727, + "num_tokens": 419913087.0, + "step": 16229 + }, + { + "epoch": 1.7823413134197232, + "grad_norm": 1.6700198650360107, + "learning_rate": 5e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7241548299789429, + "num_tokens": 419941582.0, + "step": 16230 + }, + { + "epoch": 1.782451131122337, + "grad_norm": 1.7868529558181763, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7258173227310181, + "num_tokens": 419968351.0, + "step": 16231 + }, + { + "epoch": 1.7825609488249505, + "grad_norm": 1.6168467998504639, + "learning_rate": 5e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7336932420730591, + "num_tokens": 419999020.0, + "step": 16232 + }, + { + "epoch": 1.7826707665275643, + "grad_norm": 1.7763698101043701, + "learning_rate": 5e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7280075550079346, + "num_tokens": 420024236.0, + "step": 16233 + }, + { + "epoch": 1.7827805842301778, + "grad_norm": 1.81529700756073, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7443247437477112, + "num_tokens": 420048825.0, + "step": 16234 + }, + { + "epoch": 1.7828904019327916, + "grad_norm": 1.9230376482009888, + "learning_rate": 5e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7256575226783752, + "num_tokens": 420072277.0, + "step": 16235 + }, + { + "epoch": 1.7830002196354053, + "grad_norm": 1.8584145307540894, + "learning_rate": 5e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7273879051208496, + "num_tokens": 420096892.0, + "step": 16236 + }, + { + "epoch": 1.7831100373380189, + "grad_norm": 1.8124165534973145, + "learning_rate": 5e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7505233287811279, + "num_tokens": 420121938.0, + "step": 16237 + }, + { + "epoch": 1.7832198550406324, + "grad_norm": 1.8069199323654175, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7381686568260193, + "num_tokens": 420147117.0, + "step": 16238 + }, + { + "epoch": 1.7833296727432462, + "grad_norm": 1.874484658241272, + "learning_rate": 5e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7450754642486572, + "num_tokens": 420170662.0, + "step": 16239 + }, + { + "epoch": 1.78343949044586, + "grad_norm": 1.9388337135314941, + "learning_rate": 5e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7431411743164062, + "num_tokens": 420192994.0, + "step": 16240 + }, + { + "epoch": 1.7835493081484737, + "grad_norm": 1.8657560348510742, + "learning_rate": 5e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7397638559341431, + "num_tokens": 420216766.0, + "step": 16241 + }, + { + "epoch": 1.7836591258510872, + "grad_norm": 1.6051387786865234, + "learning_rate": 5e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7351689338684082, + "num_tokens": 420249983.0, + "step": 16242 + }, + { + "epoch": 1.7837689435537007, + "grad_norm": 1.7149488925933838, + "learning_rate": 5e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7225915193557739, + "num_tokens": 420278753.0, + "step": 16243 + }, + { + "epoch": 1.7838787612563145, + "grad_norm": 1.772635817527771, + "learning_rate": 5e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7337031364440918, + "num_tokens": 420304916.0, + "step": 16244 + }, + { + "epoch": 1.7839885789589283, + "grad_norm": 1.7823725938796997, + "learning_rate": 5e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7516449093818665, + "num_tokens": 420329978.0, + "step": 16245 + }, + { + "epoch": 1.7840983966615418, + "grad_norm": 1.7755030393600464, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7393686771392822, + "num_tokens": 420355398.0, + "step": 16246 + }, + { + "epoch": 1.7842082143641556, + "grad_norm": 1.7348676919937134, + "learning_rate": 5e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.740714430809021, + "num_tokens": 420382393.0, + "step": 16247 + }, + { + "epoch": 1.784318032066769, + "grad_norm": 1.9522736072540283, + "learning_rate": 5e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7425500154495239, + "num_tokens": 420404412.0, + "step": 16248 + }, + { + "epoch": 1.7844278497693828, + "grad_norm": 1.6700347661972046, + "learning_rate": 5e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7076963782310486, + "num_tokens": 420432989.0, + "step": 16249 + }, + { + "epoch": 1.7845376674719966, + "grad_norm": 1.723728060722351, + "learning_rate": 5e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7304674983024597, + "num_tokens": 420463531.0, + "step": 16250 + }, + { + "epoch": 1.7846474851746101, + "grad_norm": 1.9768825769424438, + "learning_rate": 5e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7243127822875977, + "num_tokens": 420487694.0, + "step": 16251 + }, + { + "epoch": 1.7847573028772237, + "grad_norm": 1.6843394041061401, + "learning_rate": 5e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7327638268470764, + "num_tokens": 420516760.0, + "step": 16252 + }, + { + "epoch": 1.7848671205798374, + "grad_norm": 1.8853598833084106, + "learning_rate": 5e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.7484534978866577, + "num_tokens": 420540538.0, + "step": 16253 + }, + { + "epoch": 1.7849769382824512, + "grad_norm": 1.89593505859375, + "learning_rate": 5e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7579712271690369, + "num_tokens": 420565146.0, + "step": 16254 + }, + { + "epoch": 1.785086755985065, + "grad_norm": 1.91416597366333, + "learning_rate": 5e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7510038018226624, + "num_tokens": 420586529.0, + "step": 16255 + }, + { + "epoch": 1.7851965736876785, + "grad_norm": 2.042445182800293, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7312217950820923, + "num_tokens": 420609280.0, + "step": 16256 + }, + { + "epoch": 1.785306391390292, + "grad_norm": 1.903902530670166, + "learning_rate": 5e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7375645637512207, + "num_tokens": 420631005.0, + "step": 16257 + }, + { + "epoch": 1.7854162090929058, + "grad_norm": 1.8384901285171509, + "learning_rate": 5e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7390212416648865, + "num_tokens": 420656373.0, + "step": 16258 + }, + { + "epoch": 1.7855260267955195, + "grad_norm": 2.0049171447753906, + "learning_rate": 5e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7292315363883972, + "num_tokens": 420678823.0, + "step": 16259 + }, + { + "epoch": 1.785635844498133, + "grad_norm": 1.751323938369751, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7247748970985413, + "num_tokens": 420707687.0, + "step": 16260 + }, + { + "epoch": 1.7857456622007466, + "grad_norm": 1.8738887310028076, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7223201990127563, + "num_tokens": 420733254.0, + "step": 16261 + }, + { + "epoch": 1.7858554799033604, + "grad_norm": 1.8855482339859009, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7330745458602905, + "num_tokens": 420756677.0, + "step": 16262 + }, + { + "epoch": 1.7859652976059741, + "grad_norm": 1.8715856075286865, + "learning_rate": 5e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7407791614532471, + "num_tokens": 420781129.0, + "step": 16263 + }, + { + "epoch": 1.7860751153085879, + "grad_norm": 1.5502066612243652, + "learning_rate": 5e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7185772061347961, + "num_tokens": 420815452.0, + "step": 16264 + }, + { + "epoch": 1.7861849330112014, + "grad_norm": 1.7206870317459106, + "learning_rate": 5e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7437487840652466, + "num_tokens": 420842915.0, + "step": 16265 + }, + { + "epoch": 1.786294750713815, + "grad_norm": 1.81197988986969, + "learning_rate": 5e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7317389249801636, + "num_tokens": 420869815.0, + "step": 16266 + }, + { + "epoch": 1.7864045684164287, + "grad_norm": 1.767945647239685, + "learning_rate": 5e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7087789177894592, + "num_tokens": 420900074.0, + "step": 16267 + }, + { + "epoch": 1.7865143861190425, + "grad_norm": 1.7665581703186035, + "learning_rate": 5e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7438063025474548, + "num_tokens": 420924408.0, + "step": 16268 + }, + { + "epoch": 1.7866242038216562, + "grad_norm": 2.016616106033325, + "learning_rate": 5e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7382080554962158, + "num_tokens": 420944932.0, + "step": 16269 + }, + { + "epoch": 1.7867340215242697, + "grad_norm": 1.7179752588272095, + "learning_rate": 5e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.736526608467102, + "num_tokens": 420972532.0, + "step": 16270 + }, + { + "epoch": 1.7868438392268833, + "grad_norm": 1.807586431503296, + "learning_rate": 5e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7208839654922485, + "num_tokens": 420999126.0, + "step": 16271 + }, + { + "epoch": 1.786953656929497, + "grad_norm": 1.6803486347198486, + "learning_rate": 5e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7161911725997925, + "num_tokens": 421028508.0, + "step": 16272 + }, + { + "epoch": 1.7870634746321108, + "grad_norm": 1.7563402652740479, + "learning_rate": 5e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7522934675216675, + "num_tokens": 421052616.0, + "step": 16273 + }, + { + "epoch": 1.7871732923347243, + "grad_norm": 1.6990551948547363, + "learning_rate": 5e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7245033979415894, + "num_tokens": 421080462.0, + "step": 16274 + }, + { + "epoch": 1.7872831100373379, + "grad_norm": 1.5779972076416016, + "learning_rate": 5e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7290657758712769, + "num_tokens": 421114105.0, + "step": 16275 + }, + { + "epoch": 1.7873929277399516, + "grad_norm": 1.8455007076263428, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7219984531402588, + "num_tokens": 421141437.0, + "step": 16276 + }, + { + "epoch": 1.7875027454425654, + "grad_norm": 1.674907922744751, + "learning_rate": 5e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7335982322692871, + "num_tokens": 421168871.0, + "step": 16277 + }, + { + "epoch": 1.7876125631451791, + "grad_norm": 1.6379050016403198, + "learning_rate": 5e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7272944450378418, + "num_tokens": 421198347.0, + "step": 16278 + }, + { + "epoch": 1.7877223808477927, + "grad_norm": 1.8690999746322632, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7493053674697876, + "num_tokens": 421219705.0, + "step": 16279 + }, + { + "epoch": 1.7878321985504062, + "grad_norm": 1.802973747253418, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7326803207397461, + "num_tokens": 421247201.0, + "step": 16280 + }, + { + "epoch": 1.78794201625302, + "grad_norm": 1.87259840965271, + "learning_rate": 5e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7384930849075317, + "num_tokens": 421270823.0, + "step": 16281 + }, + { + "epoch": 1.7880518339556337, + "grad_norm": 1.8836678266525269, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7276015281677246, + "num_tokens": 421295593.0, + "step": 16282 + }, + { + "epoch": 1.7881616516582475, + "grad_norm": 1.75527822971344, + "learning_rate": 5e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7030469179153442, + "num_tokens": 421326002.0, + "step": 16283 + }, + { + "epoch": 1.788271469360861, + "grad_norm": 1.8633923530578613, + "learning_rate": 5e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.719447910785675, + "num_tokens": 421350780.0, + "step": 16284 + }, + { + "epoch": 1.7883812870634745, + "grad_norm": 1.715149164199829, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.727121114730835, + "num_tokens": 421378787.0, + "step": 16285 + }, + { + "epoch": 1.7884911047660883, + "grad_norm": 1.7250699996948242, + "learning_rate": 5e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7406410574913025, + "num_tokens": 421406023.0, + "step": 16286 + }, + { + "epoch": 1.788600922468702, + "grad_norm": 2.0597245693206787, + "learning_rate": 5e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7430005669593811, + "num_tokens": 421427339.0, + "step": 16287 + }, + { + "epoch": 1.7887107401713156, + "grad_norm": 1.7003045082092285, + "learning_rate": 5e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7204831838607788, + "num_tokens": 421456171.0, + "step": 16288 + }, + { + "epoch": 1.7888205578739291, + "grad_norm": 2.0188682079315186, + "learning_rate": 5e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7515240907669067, + "num_tokens": 421475163.0, + "step": 16289 + }, + { + "epoch": 1.7889303755765429, + "grad_norm": 1.6379146575927734, + "learning_rate": 5e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7569885849952698, + "num_tokens": 421501592.0, + "step": 16290 + }, + { + "epoch": 1.7890401932791566, + "grad_norm": 1.6699857711791992, + "learning_rate": 5e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7047535181045532, + "num_tokens": 421532276.0, + "step": 16291 + }, + { + "epoch": 1.7891500109817704, + "grad_norm": 1.7954825162887573, + "learning_rate": 5e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7015557289123535, + "num_tokens": 421562770.0, + "step": 16292 + }, + { + "epoch": 1.789259828684384, + "grad_norm": 1.6577599048614502, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.747796356678009, + "num_tokens": 421591256.0, + "step": 16293 + }, + { + "epoch": 1.7893696463869975, + "grad_norm": 1.7765262126922607, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.751594066619873, + "num_tokens": 421617128.0, + "step": 16294 + }, + { + "epoch": 1.7894794640896112, + "grad_norm": 1.866112470626831, + "learning_rate": 5e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7498835921287537, + "num_tokens": 421641328.0, + "step": 16295 + }, + { + "epoch": 1.789589281792225, + "grad_norm": 2.099038600921631, + "learning_rate": 5e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7311992049217224, + "num_tokens": 421662675.0, + "step": 16296 + }, + { + "epoch": 1.7896990994948385, + "grad_norm": 1.7598358392715454, + "learning_rate": 5e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7100088596343994, + "num_tokens": 421689462.0, + "step": 16297 + }, + { + "epoch": 1.7898089171974523, + "grad_norm": 1.871286392211914, + "learning_rate": 5e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7382392287254333, + "num_tokens": 421713304.0, + "step": 16298 + }, + { + "epoch": 1.7899187349000658, + "grad_norm": 1.7350828647613525, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7276306748390198, + "num_tokens": 421742433.0, + "step": 16299 + }, + { + "epoch": 1.7900285526026796, + "grad_norm": 1.634944200515747, + "learning_rate": 5e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7398688793182373, + "num_tokens": 421771958.0, + "step": 16300 + }, + { + "epoch": 1.7901383703052933, + "grad_norm": 1.6626989841461182, + "learning_rate": 5e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7200025916099548, + "num_tokens": 421802621.0, + "step": 16301 + }, + { + "epoch": 1.7902481880079069, + "grad_norm": 1.876163363456726, + "learning_rate": 5e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7281286120414734, + "num_tokens": 421826378.0, + "step": 16302 + }, + { + "epoch": 1.7903580057105204, + "grad_norm": 1.8021055459976196, + "learning_rate": 5e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7180548906326294, + "num_tokens": 421853744.0, + "step": 16303 + }, + { + "epoch": 1.7904678234131342, + "grad_norm": 1.7904001474380493, + "learning_rate": 5e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7103737592697144, + "num_tokens": 421880975.0, + "step": 16304 + }, + { + "epoch": 1.790577641115748, + "grad_norm": 2.01180362701416, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7361136674880981, + "num_tokens": 421903493.0, + "step": 16305 + }, + { + "epoch": 1.7906874588183617, + "grad_norm": 1.7527892589569092, + "learning_rate": 5e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7486588954925537, + "num_tokens": 421929846.0, + "step": 16306 + }, + { + "epoch": 1.7907972765209752, + "grad_norm": 1.6214990615844727, + "learning_rate": 5e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7395723462104797, + "num_tokens": 421960837.0, + "step": 16307 + }, + { + "epoch": 1.7909070942235887, + "grad_norm": 1.874709129333496, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7319139838218689, + "num_tokens": 421985622.0, + "step": 16308 + }, + { + "epoch": 1.7910169119262025, + "grad_norm": 1.7834080457687378, + "learning_rate": 5e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7255979776382446, + "num_tokens": 422012278.0, + "step": 16309 + }, + { + "epoch": 1.7911267296288162, + "grad_norm": 2.1241183280944824, + "learning_rate": 5e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7445409297943115, + "num_tokens": 422032213.0, + "step": 16310 + }, + { + "epoch": 1.7912365473314298, + "grad_norm": 1.7745003700256348, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7371665835380554, + "num_tokens": 422058975.0, + "step": 16311 + }, + { + "epoch": 1.7913463650340435, + "grad_norm": 1.7528526782989502, + "learning_rate": 5e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7096226215362549, + "num_tokens": 422087790.0, + "step": 16312 + }, + { + "epoch": 1.791456182736657, + "grad_norm": 1.8467909097671509, + "learning_rate": 5e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7302515506744385, + "num_tokens": 422112832.0, + "step": 16313 + }, + { + "epoch": 1.7915660004392708, + "grad_norm": 1.7812814712524414, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7394765615463257, + "num_tokens": 422140134.0, + "step": 16314 + }, + { + "epoch": 1.7916758181418846, + "grad_norm": 2.0515053272247314, + "learning_rate": 5e-06, + "loss": 0.7809, + "mean_token_accuracy": 0.7463741302490234, + "num_tokens": 422161257.0, + "step": 16315 + }, + { + "epoch": 1.7917856358444981, + "grad_norm": 2.0391600131988525, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7256402969360352, + "num_tokens": 422182754.0, + "step": 16316 + }, + { + "epoch": 1.7918954535471117, + "grad_norm": 1.790643572807312, + "learning_rate": 5e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7306725978851318, + "num_tokens": 422208320.0, + "step": 16317 + }, + { + "epoch": 1.7920052712497254, + "grad_norm": 1.6843326091766357, + "learning_rate": 5e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7215922474861145, + "num_tokens": 422236857.0, + "step": 16318 + }, + { + "epoch": 1.7921150889523392, + "grad_norm": 2.049743413925171, + "learning_rate": 5e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.735377311706543, + "num_tokens": 422256649.0, + "step": 16319 + }, + { + "epoch": 1.792224906654953, + "grad_norm": 1.9265809059143066, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7418173551559448, + "num_tokens": 422279631.0, + "step": 16320 + }, + { + "epoch": 1.7923347243575665, + "grad_norm": 1.735661506652832, + "learning_rate": 5e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7405043840408325, + "num_tokens": 422305729.0, + "step": 16321 + }, + { + "epoch": 1.79244454206018, + "grad_norm": 1.6508152484893799, + "learning_rate": 5e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7138586640357971, + "num_tokens": 422336246.0, + "step": 16322 + }, + { + "epoch": 1.7925543597627938, + "grad_norm": 1.7516992092132568, + "learning_rate": 5e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7466762065887451, + "num_tokens": 422362204.0, + "step": 16323 + }, + { + "epoch": 1.7926641774654075, + "grad_norm": 1.5230579376220703, + "learning_rate": 5e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6943831443786621, + "num_tokens": 422397333.0, + "step": 16324 + }, + { + "epoch": 1.792773995168021, + "grad_norm": 1.8459569215774536, + "learning_rate": 5e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7296071648597717, + "num_tokens": 422421064.0, + "step": 16325 + }, + { + "epoch": 1.7928838128706346, + "grad_norm": 1.7426968812942505, + "learning_rate": 5e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7102558612823486, + "num_tokens": 422450063.0, + "step": 16326 + }, + { + "epoch": 1.7929936305732483, + "grad_norm": 2.0052316188812256, + "learning_rate": 5e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7088399529457092, + "num_tokens": 422473543.0, + "step": 16327 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 1.6406948566436768, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7321426868438721, + "num_tokens": 422506148.0, + "step": 16328 + }, + { + "epoch": 1.7932132659784759, + "grad_norm": 1.7864198684692383, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7316243052482605, + "num_tokens": 422533934.0, + "step": 16329 + }, + { + "epoch": 1.7933230836810894, + "grad_norm": 1.8873051404953003, + "learning_rate": 5e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.739378809928894, + "num_tokens": 422558744.0, + "step": 16330 + }, + { + "epoch": 1.793432901383703, + "grad_norm": 1.9151160717010498, + "learning_rate": 5e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7210510969161987, + "num_tokens": 422582414.0, + "step": 16331 + }, + { + "epoch": 1.7935427190863167, + "grad_norm": 1.7201048135757446, + "learning_rate": 5e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7189455628395081, + "num_tokens": 422611081.0, + "step": 16332 + }, + { + "epoch": 1.7936525367889304, + "grad_norm": 1.738114356994629, + "learning_rate": 5e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7134255170822144, + "num_tokens": 422637377.0, + "step": 16333 + }, + { + "epoch": 1.7937623544915442, + "grad_norm": 1.8175920248031616, + "learning_rate": 5e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7502254247665405, + "num_tokens": 422663623.0, + "step": 16334 + }, + { + "epoch": 1.7938721721941577, + "grad_norm": 1.8335986137390137, + "learning_rate": 5e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7189821004867554, + "num_tokens": 422692354.0, + "step": 16335 + }, + { + "epoch": 1.7939819898967713, + "grad_norm": 1.6307868957519531, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7254027128219604, + "num_tokens": 422722728.0, + "step": 16336 + }, + { + "epoch": 1.794091807599385, + "grad_norm": 1.9074084758758545, + "learning_rate": 5e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7144690752029419, + "num_tokens": 422746777.0, + "step": 16337 + }, + { + "epoch": 1.7942016253019988, + "grad_norm": 1.836548089981079, + "learning_rate": 5e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7342326045036316, + "num_tokens": 422772191.0, + "step": 16338 + }, + { + "epoch": 1.7943114430046123, + "grad_norm": 1.6987597942352295, + "learning_rate": 5e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7394368648529053, + "num_tokens": 422801691.0, + "step": 16339 + }, + { + "epoch": 1.7944212607072259, + "grad_norm": 1.7776788473129272, + "learning_rate": 5e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7150813937187195, + "num_tokens": 422832390.0, + "step": 16340 + }, + { + "epoch": 1.7945310784098396, + "grad_norm": 2.0821940898895264, + "learning_rate": 5e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7127012610435486, + "num_tokens": 422854415.0, + "step": 16341 + }, + { + "epoch": 1.7946408961124534, + "grad_norm": 1.9167561531066895, + "learning_rate": 5e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7383759021759033, + "num_tokens": 422878698.0, + "step": 16342 + }, + { + "epoch": 1.7947507138150671, + "grad_norm": 1.8331905603408813, + "learning_rate": 5e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7364898920059204, + "num_tokens": 422902354.0, + "step": 16343 + }, + { + "epoch": 1.7948605315176807, + "grad_norm": 1.8298381567001343, + "learning_rate": 5e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7282747626304626, + "num_tokens": 422924913.0, + "step": 16344 + }, + { + "epoch": 1.7949703492202942, + "grad_norm": 2.0005037784576416, + "learning_rate": 5e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.739204466342926, + "num_tokens": 422947537.0, + "step": 16345 + }, + { + "epoch": 1.795080166922908, + "grad_norm": 1.8971132040023804, + "learning_rate": 5e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7509143352508545, + "num_tokens": 422970032.0, + "step": 16346 + }, + { + "epoch": 1.7951899846255217, + "grad_norm": 2.1884765625, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7407773733139038, + "num_tokens": 422988792.0, + "step": 16347 + }, + { + "epoch": 1.7952998023281352, + "grad_norm": 1.9288538694381714, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7290278673171997, + "num_tokens": 423015969.0, + "step": 16348 + }, + { + "epoch": 1.795409620030749, + "grad_norm": 1.9124120473861694, + "learning_rate": 5e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7275596261024475, + "num_tokens": 423039332.0, + "step": 16349 + }, + { + "epoch": 1.7955194377333625, + "grad_norm": 2.0996687412261963, + "learning_rate": 5e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7457267045974731, + "num_tokens": 423059611.0, + "step": 16350 + }, + { + "epoch": 1.7956292554359763, + "grad_norm": 2.141406297683716, + "learning_rate": 5e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7252252101898193, + "num_tokens": 423080777.0, + "step": 16351 + }, + { + "epoch": 1.79573907313859, + "grad_norm": 1.9698262214660645, + "learning_rate": 5e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7483451962471008, + "num_tokens": 423102874.0, + "step": 16352 + }, + { + "epoch": 1.7958488908412036, + "grad_norm": 1.801859974861145, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7348360419273376, + "num_tokens": 423126688.0, + "step": 16353 + }, + { + "epoch": 1.7959587085438171, + "grad_norm": 1.8170548677444458, + "learning_rate": 5e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7515345811843872, + "num_tokens": 423150366.0, + "step": 16354 + }, + { + "epoch": 1.7960685262464309, + "grad_norm": 1.684077262878418, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7260183095932007, + "num_tokens": 423180493.0, + "step": 16355 + }, + { + "epoch": 1.7961783439490446, + "grad_norm": 2.115204334259033, + "learning_rate": 5e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7339632511138916, + "num_tokens": 423200112.0, + "step": 16356 + }, + { + "epoch": 1.7962881616516584, + "grad_norm": 1.810165286064148, + "learning_rate": 5e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7229180335998535, + "num_tokens": 423225578.0, + "step": 16357 + }, + { + "epoch": 1.796397979354272, + "grad_norm": 1.930395483970642, + "learning_rate": 5e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7296181917190552, + "num_tokens": 423250459.0, + "step": 16358 + }, + { + "epoch": 1.7965077970568855, + "grad_norm": 1.6737778186798096, + "learning_rate": 5e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7222930192947388, + "num_tokens": 423277537.0, + "step": 16359 + }, + { + "epoch": 1.7966176147594992, + "grad_norm": 1.8860657215118408, + "learning_rate": 5e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7624599933624268, + "num_tokens": 423300606.0, + "step": 16360 + }, + { + "epoch": 1.796727432462113, + "grad_norm": 1.8217157125473022, + "learning_rate": 5e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7228288054466248, + "num_tokens": 423327519.0, + "step": 16361 + }, + { + "epoch": 1.7968372501647265, + "grad_norm": 1.6240273714065552, + "learning_rate": 5e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7134760618209839, + "num_tokens": 423358675.0, + "step": 16362 + }, + { + "epoch": 1.7969470678673403, + "grad_norm": 1.8640427589416504, + "learning_rate": 5e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7197245359420776, + "num_tokens": 423384477.0, + "step": 16363 + }, + { + "epoch": 1.7970568855699538, + "grad_norm": 1.7015272378921509, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7342525720596313, + "num_tokens": 423411515.0, + "step": 16364 + }, + { + "epoch": 1.7971667032725676, + "grad_norm": 1.9651545286178589, + "learning_rate": 5e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7313488721847534, + "num_tokens": 423433106.0, + "step": 16365 + }, + { + "epoch": 1.7972765209751813, + "grad_norm": 1.670365333557129, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7550735473632812, + "num_tokens": 423459498.0, + "step": 16366 + }, + { + "epoch": 1.7973863386777948, + "grad_norm": 1.730139136314392, + "learning_rate": 5e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.744553804397583, + "num_tokens": 423485430.0, + "step": 16367 + }, + { + "epoch": 1.7974961563804084, + "grad_norm": 1.6352671384811401, + "learning_rate": 5e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7255364656448364, + "num_tokens": 423514935.0, + "step": 16368 + }, + { + "epoch": 1.7976059740830221, + "grad_norm": 1.785843014717102, + "learning_rate": 5e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7453610897064209, + "num_tokens": 423540195.0, + "step": 16369 + }, + { + "epoch": 1.797715791785636, + "grad_norm": 1.8367774486541748, + "learning_rate": 5e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7479732036590576, + "num_tokens": 423566053.0, + "step": 16370 + }, + { + "epoch": 1.7978256094882497, + "grad_norm": 1.948495864868164, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7362534999847412, + "num_tokens": 423589600.0, + "step": 16371 + }, + { + "epoch": 1.7979354271908632, + "grad_norm": 1.8103020191192627, + "learning_rate": 5e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7340442538261414, + "num_tokens": 423612224.0, + "step": 16372 + }, + { + "epoch": 1.7980452448934767, + "grad_norm": 1.6580039262771606, + "learning_rate": 5e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7218183279037476, + "num_tokens": 423643120.0, + "step": 16373 + }, + { + "epoch": 1.7981550625960905, + "grad_norm": 1.7873486280441284, + "learning_rate": 5e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7125188112258911, + "num_tokens": 423672817.0, + "step": 16374 + }, + { + "epoch": 1.7982648802987042, + "grad_norm": 1.9613922834396362, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7398282289505005, + "num_tokens": 423694071.0, + "step": 16375 + }, + { + "epoch": 1.7983746980013178, + "grad_norm": 1.6473159790039062, + "learning_rate": 5e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7066140174865723, + "num_tokens": 423723434.0, + "step": 16376 + }, + { + "epoch": 1.7984845157039313, + "grad_norm": 1.9519822597503662, + "learning_rate": 5e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7260958552360535, + "num_tokens": 423745237.0, + "step": 16377 + }, + { + "epoch": 1.798594333406545, + "grad_norm": 1.956201195716858, + "learning_rate": 5e-06, + "loss": 0.7863, + "mean_token_accuracy": 0.7422335743904114, + "num_tokens": 423766756.0, + "step": 16378 + }, + { + "epoch": 1.7987041511091588, + "grad_norm": 1.7606638669967651, + "learning_rate": 5e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7513964176177979, + "num_tokens": 423790485.0, + "step": 16379 + }, + { + "epoch": 1.7988139688117726, + "grad_norm": 2.065423011779785, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7679396271705627, + "num_tokens": 423807512.0, + "step": 16380 + }, + { + "epoch": 1.7989237865143861, + "grad_norm": 1.6428231000900269, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7210986614227295, + "num_tokens": 423837233.0, + "step": 16381 + }, + { + "epoch": 1.7990336042169996, + "grad_norm": 2.101665496826172, + "learning_rate": 5e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.744608998298645, + "num_tokens": 423858554.0, + "step": 16382 + }, + { + "epoch": 1.7991434219196134, + "grad_norm": 1.9225473403930664, + "learning_rate": 5e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7370458841323853, + "num_tokens": 423880035.0, + "step": 16383 + }, + { + "epoch": 1.7992532396222272, + "grad_norm": 1.794012188911438, + "learning_rate": 5e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7376081347465515, + "num_tokens": 423902407.0, + "step": 16384 + }, + { + "epoch": 1.799363057324841, + "grad_norm": 2.0573208332061768, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7286142110824585, + "num_tokens": 423923417.0, + "step": 16385 + }, + { + "epoch": 1.7994728750274545, + "grad_norm": 1.8400624990463257, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7224923372268677, + "num_tokens": 423950965.0, + "step": 16386 + }, + { + "epoch": 1.799582692730068, + "grad_norm": 2.007941961288452, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7470408082008362, + "num_tokens": 423970113.0, + "step": 16387 + }, + { + "epoch": 1.7996925104326817, + "grad_norm": 1.9367730617523193, + "learning_rate": 5e-06, + "loss": 0.598, + "mean_token_accuracy": 0.7983118295669556, + "num_tokens": 423989119.0, + "step": 16388 + }, + { + "epoch": 1.7998023281352955, + "grad_norm": 1.8439933061599731, + "learning_rate": 5e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7132591605186462, + "num_tokens": 424013089.0, + "step": 16389 + }, + { + "epoch": 1.799912145837909, + "grad_norm": 1.8317060470581055, + "learning_rate": 5e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7485443353652954, + "num_tokens": 424035984.0, + "step": 16390 + }, + { + "epoch": 1.8000219635405226, + "grad_norm": 1.6852575540542603, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7263544797897339, + "num_tokens": 424066085.0, + "step": 16391 + }, + { + "epoch": 1.8001317812431363, + "grad_norm": 1.6967601776123047, + "learning_rate": 5e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.6977077722549438, + "num_tokens": 424095380.0, + "step": 16392 + }, + { + "epoch": 1.80024159894575, + "grad_norm": 1.888590931892395, + "learning_rate": 5e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7183077931404114, + "num_tokens": 424121119.0, + "step": 16393 + }, + { + "epoch": 1.8003514166483638, + "grad_norm": 1.8759033679962158, + "learning_rate": 5e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7484527826309204, + "num_tokens": 424149124.0, + "step": 16394 + }, + { + "epoch": 1.8004612343509774, + "grad_norm": 1.9723845720291138, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7254226207733154, + "num_tokens": 424172950.0, + "step": 16395 + }, + { + "epoch": 1.800571052053591, + "grad_norm": 1.876120686531067, + "learning_rate": 5e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7384592294692993, + "num_tokens": 424197802.0, + "step": 16396 + }, + { + "epoch": 1.8006808697562047, + "grad_norm": 1.9052033424377441, + "learning_rate": 5e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.751456618309021, + "num_tokens": 424219852.0, + "step": 16397 + }, + { + "epoch": 1.8007906874588184, + "grad_norm": 1.8725569248199463, + "learning_rate": 5e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7145076990127563, + "num_tokens": 424245541.0, + "step": 16398 + }, + { + "epoch": 1.8009005051614322, + "grad_norm": 1.9693889617919922, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7588608264923096, + "num_tokens": 424266420.0, + "step": 16399 + }, + { + "epoch": 1.8010103228640457, + "grad_norm": 1.6520463228225708, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7230756282806396, + "num_tokens": 424298368.0, + "step": 16400 + }, + { + "epoch": 1.8011201405666593, + "grad_norm": 1.7363380193710327, + "learning_rate": 5e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7019284963607788, + "num_tokens": 424329395.0, + "step": 16401 + }, + { + "epoch": 1.801229958269273, + "grad_norm": 1.7386069297790527, + "learning_rate": 5e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7072567939758301, + "num_tokens": 424358287.0, + "step": 16402 + }, + { + "epoch": 1.8013397759718868, + "grad_norm": 1.7924330234527588, + "learning_rate": 5e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7239950299263, + "num_tokens": 424386836.0, + "step": 16403 + }, + { + "epoch": 1.8014495936745003, + "grad_norm": 2.074293851852417, + "learning_rate": 5e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7238580584526062, + "num_tokens": 424408050.0, + "step": 16404 + }, + { + "epoch": 1.8015594113771138, + "grad_norm": 1.6840779781341553, + "learning_rate": 5e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7220253348350525, + "num_tokens": 424435522.0, + "step": 16405 + }, + { + "epoch": 1.8016692290797276, + "grad_norm": 1.852327823638916, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7348473072052002, + "num_tokens": 424459488.0, + "step": 16406 + }, + { + "epoch": 1.8017790467823414, + "grad_norm": 1.7519607543945312, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7291951179504395, + "num_tokens": 424486172.0, + "step": 16407 + }, + { + "epoch": 1.801888864484955, + "grad_norm": 1.669659972190857, + "learning_rate": 5e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7486345171928406, + "num_tokens": 424515899.0, + "step": 16408 + }, + { + "epoch": 1.8019986821875686, + "grad_norm": 1.6508009433746338, + "learning_rate": 5e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7367914915084839, + "num_tokens": 424547031.0, + "step": 16409 + }, + { + "epoch": 1.8021084998901822, + "grad_norm": 1.6698949337005615, + "learning_rate": 5e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7221506237983704, + "num_tokens": 424579108.0, + "step": 16410 + }, + { + "epoch": 1.802218317592796, + "grad_norm": 1.7325893640518188, + "learning_rate": 5e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7482512593269348, + "num_tokens": 424604479.0, + "step": 16411 + }, + { + "epoch": 1.8023281352954097, + "grad_norm": 1.913855791091919, + "learning_rate": 5e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7474644184112549, + "num_tokens": 424627974.0, + "step": 16412 + }, + { + "epoch": 1.8024379529980232, + "grad_norm": 1.5755115747451782, + "learning_rate": 5e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7119182348251343, + "num_tokens": 424662861.0, + "step": 16413 + }, + { + "epoch": 1.802547770700637, + "grad_norm": 1.7654523849487305, + "learning_rate": 5e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7230216264724731, + "num_tokens": 424692524.0, + "step": 16414 + }, + { + "epoch": 1.8026575884032505, + "grad_norm": 1.8705365657806396, + "learning_rate": 5e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7317989468574524, + "num_tokens": 424716249.0, + "step": 16415 + }, + { + "epoch": 1.8027674061058643, + "grad_norm": 1.9343146085739136, + "learning_rate": 5e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7393494248390198, + "num_tokens": 424738430.0, + "step": 16416 + }, + { + "epoch": 1.802877223808478, + "grad_norm": 1.6831579208374023, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7168526649475098, + "num_tokens": 424768897.0, + "step": 16417 + }, + { + "epoch": 1.8029870415110916, + "grad_norm": 2.1082663536071777, + "learning_rate": 5e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7384682297706604, + "num_tokens": 424789969.0, + "step": 16418 + }, + { + "epoch": 1.803096859213705, + "grad_norm": 1.9962607622146606, + "learning_rate": 5e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7567154765129089, + "num_tokens": 424809713.0, + "step": 16419 + }, + { + "epoch": 1.8032066769163189, + "grad_norm": 1.677565097808838, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7233455181121826, + "num_tokens": 424841146.0, + "step": 16420 + }, + { + "epoch": 1.8033164946189326, + "grad_norm": 1.8524484634399414, + "learning_rate": 5e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7113102674484253, + "num_tokens": 424868883.0, + "step": 16421 + }, + { + "epoch": 1.8034263123215464, + "grad_norm": 1.5029605627059937, + "learning_rate": 5e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7580944299697876, + "num_tokens": 424901598.0, + "step": 16422 + }, + { + "epoch": 1.80353613002416, + "grad_norm": 1.5425608158111572, + "learning_rate": 5e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7142605185508728, + "num_tokens": 424933104.0, + "step": 16423 + }, + { + "epoch": 1.8036459477267734, + "grad_norm": 1.938930630683899, + "learning_rate": 5e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7381176948547363, + "num_tokens": 424956278.0, + "step": 16424 + }, + { + "epoch": 1.8037557654293872, + "grad_norm": 1.820600152015686, + "learning_rate": 5e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7467995882034302, + "num_tokens": 424979016.0, + "step": 16425 + }, + { + "epoch": 1.803865583132001, + "grad_norm": 1.8694684505462646, + "learning_rate": 5e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7224819660186768, + "num_tokens": 425002750.0, + "step": 16426 + }, + { + "epoch": 1.8039754008346145, + "grad_norm": 1.7442436218261719, + "learning_rate": 5e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.720850944519043, + "num_tokens": 425031743.0, + "step": 16427 + }, + { + "epoch": 1.8040852185372283, + "grad_norm": 1.756125569343567, + "learning_rate": 5e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.7458301186561584, + "num_tokens": 425058595.0, + "step": 16428 + }, + { + "epoch": 1.8041950362398418, + "grad_norm": 1.9231456518173218, + "learning_rate": 5e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.729662299156189, + "num_tokens": 425081312.0, + "step": 16429 + }, + { + "epoch": 1.8043048539424555, + "grad_norm": 1.819215178489685, + "learning_rate": 5e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7386382818222046, + "num_tokens": 425106321.0, + "step": 16430 + }, + { + "epoch": 1.8044146716450693, + "grad_norm": 1.604730248451233, + "learning_rate": 5e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7156864404678345, + "num_tokens": 425138426.0, + "step": 16431 + }, + { + "epoch": 1.8045244893476828, + "grad_norm": 2.0371651649475098, + "learning_rate": 5e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.740609884262085, + "num_tokens": 425158032.0, + "step": 16432 + }, + { + "epoch": 1.8046343070502964, + "grad_norm": 1.8935626745224, + "learning_rate": 5e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7147867679595947, + "num_tokens": 425183149.0, + "step": 16433 + }, + { + "epoch": 1.8047441247529101, + "grad_norm": 1.7012284994125366, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7337300181388855, + "num_tokens": 425211610.0, + "step": 16434 + }, + { + "epoch": 1.8048539424555239, + "grad_norm": 1.7274019718170166, + "learning_rate": 5e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7251375913619995, + "num_tokens": 425240000.0, + "step": 16435 + }, + { + "epoch": 1.8049637601581376, + "grad_norm": 1.6050727367401123, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7386125326156616, + "num_tokens": 425269841.0, + "step": 16436 + }, + { + "epoch": 1.8050735778607512, + "grad_norm": 1.681076169013977, + "learning_rate": 5e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6935962438583374, + "num_tokens": 425301001.0, + "step": 16437 + }, + { + "epoch": 1.8051833955633647, + "grad_norm": 1.9475184679031372, + "learning_rate": 5e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7274275422096252, + "num_tokens": 425325663.0, + "step": 16438 + }, + { + "epoch": 1.8052932132659785, + "grad_norm": 1.7518823146820068, + "learning_rate": 5e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7136375904083252, + "num_tokens": 425354295.0, + "step": 16439 + }, + { + "epoch": 1.8054030309685922, + "grad_norm": 1.8805079460144043, + "learning_rate": 5e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7341340184211731, + "num_tokens": 425378088.0, + "step": 16440 + }, + { + "epoch": 1.8055128486712058, + "grad_norm": 2.064638376235962, + "learning_rate": 5e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7254265546798706, + "num_tokens": 425399834.0, + "step": 16441 + }, + { + "epoch": 1.8056226663738193, + "grad_norm": 1.8837260007858276, + "learning_rate": 5e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7328601479530334, + "num_tokens": 425424677.0, + "step": 16442 + }, + { + "epoch": 1.805732484076433, + "grad_norm": 1.8638315200805664, + "learning_rate": 5e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7204544544219971, + "num_tokens": 425449935.0, + "step": 16443 + }, + { + "epoch": 1.8058423017790468, + "grad_norm": 1.921871304512024, + "learning_rate": 5e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7218661308288574, + "num_tokens": 425474832.0, + "step": 16444 + }, + { + "epoch": 1.8059521194816606, + "grad_norm": 1.9842743873596191, + "learning_rate": 5e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7393120527267456, + "num_tokens": 425496234.0, + "step": 16445 + }, + { + "epoch": 1.806061937184274, + "grad_norm": 1.8572748899459839, + "learning_rate": 5e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.740967869758606, + "num_tokens": 425520264.0, + "step": 16446 + }, + { + "epoch": 1.8061717548868876, + "grad_norm": 1.8712968826293945, + "learning_rate": 5e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7502001523971558, + "num_tokens": 425542853.0, + "step": 16447 + }, + { + "epoch": 1.8062815725895014, + "grad_norm": 1.945052981376648, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7427101135253906, + "num_tokens": 425565518.0, + "step": 16448 + }, + { + "epoch": 1.8063913902921152, + "grad_norm": 1.6776154041290283, + "learning_rate": 5e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7118352055549622, + "num_tokens": 425595064.0, + "step": 16449 + }, + { + "epoch": 1.806501207994729, + "grad_norm": 1.5486247539520264, + "learning_rate": 5e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7191352248191833, + "num_tokens": 425632818.0, + "step": 16450 + }, + { + "epoch": 1.8066110256973424, + "grad_norm": 1.8726948499679565, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7499599456787109, + "num_tokens": 425655456.0, + "step": 16451 + }, + { + "epoch": 1.806720843399956, + "grad_norm": 1.578490138053894, + "learning_rate": 5e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7495943307876587, + "num_tokens": 425687716.0, + "step": 16452 + }, + { + "epoch": 1.8068306611025697, + "grad_norm": 2.03897762298584, + "learning_rate": 5e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7480844855308533, + "num_tokens": 425710909.0, + "step": 16453 + }, + { + "epoch": 1.8069404788051835, + "grad_norm": 1.4638421535491943, + "learning_rate": 5e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7413341999053955, + "num_tokens": 425746860.0, + "step": 16454 + }, + { + "epoch": 1.807050296507797, + "grad_norm": 1.8487136363983154, + "learning_rate": 5e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7245392799377441, + "num_tokens": 425772660.0, + "step": 16455 + }, + { + "epoch": 1.8071601142104106, + "grad_norm": 1.8684455156326294, + "learning_rate": 5e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7476503849029541, + "num_tokens": 425798268.0, + "step": 16456 + }, + { + "epoch": 1.8072699319130243, + "grad_norm": 1.8126591444015503, + "learning_rate": 5e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7339158058166504, + "num_tokens": 425822654.0, + "step": 16457 + }, + { + "epoch": 1.807379749615638, + "grad_norm": 1.9009195566177368, + "learning_rate": 5e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7379319667816162, + "num_tokens": 425846371.0, + "step": 16458 + }, + { + "epoch": 1.8074895673182518, + "grad_norm": 1.9880350828170776, + "learning_rate": 5e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7337660789489746, + "num_tokens": 425867941.0, + "step": 16459 + }, + { + "epoch": 1.8075993850208654, + "grad_norm": 1.6807762384414673, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7321206331253052, + "num_tokens": 425895723.0, + "step": 16460 + }, + { + "epoch": 1.807709202723479, + "grad_norm": 1.8128846883773804, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7488052845001221, + "num_tokens": 425919843.0, + "step": 16461 + }, + { + "epoch": 1.8078190204260927, + "grad_norm": 1.7604557275772095, + "learning_rate": 5e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7175167798995972, + "num_tokens": 425947409.0, + "step": 16462 + }, + { + "epoch": 1.8079288381287064, + "grad_norm": 1.8621636629104614, + "learning_rate": 5e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7237269878387451, + "num_tokens": 425972130.0, + "step": 16463 + }, + { + "epoch": 1.8080386558313202, + "grad_norm": 1.8893673419952393, + "learning_rate": 5e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7298504710197449, + "num_tokens": 425996674.0, + "step": 16464 + }, + { + "epoch": 1.8081484735339337, + "grad_norm": 2.0989952087402344, + "learning_rate": 5e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7569379210472107, + "num_tokens": 426015116.0, + "step": 16465 + }, + { + "epoch": 1.8082582912365472, + "grad_norm": 1.6065170764923096, + "learning_rate": 5e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7230926752090454, + "num_tokens": 426045552.0, + "step": 16466 + }, + { + "epoch": 1.808368108939161, + "grad_norm": 1.8008100986480713, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7309814691543579, + "num_tokens": 426070323.0, + "step": 16467 + }, + { + "epoch": 1.8084779266417748, + "grad_norm": 1.7126439809799194, + "learning_rate": 5e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7212482690811157, + "num_tokens": 426097395.0, + "step": 16468 + }, + { + "epoch": 1.8085877443443883, + "grad_norm": 1.6747342348098755, + "learning_rate": 5e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7414247989654541, + "num_tokens": 426126608.0, + "step": 16469 + }, + { + "epoch": 1.8086975620470018, + "grad_norm": 1.8404260873794556, + "learning_rate": 5e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7042418718338013, + "num_tokens": 426152297.0, + "step": 16470 + }, + { + "epoch": 1.8088073797496156, + "grad_norm": 1.8035808801651, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7282591462135315, + "num_tokens": 426176491.0, + "step": 16471 + }, + { + "epoch": 1.8089171974522293, + "grad_norm": 1.6453181505203247, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7310510873794556, + "num_tokens": 426204747.0, + "step": 16472 + }, + { + "epoch": 1.809027015154843, + "grad_norm": 1.7430715560913086, + "learning_rate": 5e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7340055108070374, + "num_tokens": 426230345.0, + "step": 16473 + }, + { + "epoch": 1.8091368328574566, + "grad_norm": 1.8797820806503296, + "learning_rate": 5e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7539486289024353, + "num_tokens": 426255295.0, + "step": 16474 + }, + { + "epoch": 1.8092466505600702, + "grad_norm": 1.6928356885910034, + "learning_rate": 5e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7298652529716492, + "num_tokens": 426282172.0, + "step": 16475 + }, + { + "epoch": 1.809356468262684, + "grad_norm": 1.780361533164978, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.73687744140625, + "num_tokens": 426306321.0, + "step": 16476 + }, + { + "epoch": 1.8094662859652977, + "grad_norm": 1.7210596799850464, + "learning_rate": 5e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7053849101066589, + "num_tokens": 426337287.0, + "step": 16477 + }, + { + "epoch": 1.8095761036679112, + "grad_norm": 1.763791561126709, + "learning_rate": 5e-06, + "loss": 0.7974, + "mean_token_accuracy": 0.7486587166786194, + "num_tokens": 426360590.0, + "step": 16478 + }, + { + "epoch": 1.809685921370525, + "grad_norm": 1.7900843620300293, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7192102670669556, + "num_tokens": 426387943.0, + "step": 16479 + }, + { + "epoch": 1.8097957390731385, + "grad_norm": 1.7157787084579468, + "learning_rate": 5e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7229322791099548, + "num_tokens": 426417956.0, + "step": 16480 + }, + { + "epoch": 1.8099055567757523, + "grad_norm": 1.928985595703125, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.742387592792511, + "num_tokens": 426441093.0, + "step": 16481 + }, + { + "epoch": 1.810015374478366, + "grad_norm": 1.8081002235412598, + "learning_rate": 5e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7100939750671387, + "num_tokens": 426466985.0, + "step": 16482 + }, + { + "epoch": 1.8101251921809796, + "grad_norm": 1.8055200576782227, + "learning_rate": 5e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7420201301574707, + "num_tokens": 426492420.0, + "step": 16483 + }, + { + "epoch": 1.810235009883593, + "grad_norm": 1.7192063331604004, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.716740608215332, + "num_tokens": 426521278.0, + "step": 16484 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 1.800287127494812, + "learning_rate": 5e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7337723970413208, + "num_tokens": 426547422.0, + "step": 16485 + }, + { + "epoch": 1.8104546452888206, + "grad_norm": 1.7292020320892334, + "learning_rate": 5e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7305057048797607, + "num_tokens": 426573665.0, + "step": 16486 + }, + { + "epoch": 1.8105644629914344, + "grad_norm": 1.8113728761672974, + "learning_rate": 5e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7142068147659302, + "num_tokens": 426600363.0, + "step": 16487 + }, + { + "epoch": 1.810674280694048, + "grad_norm": 1.6738675832748413, + "learning_rate": 5e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7230390310287476, + "num_tokens": 426630941.0, + "step": 16488 + }, + { + "epoch": 1.8107840983966614, + "grad_norm": 1.8404628038406372, + "learning_rate": 5e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7297484278678894, + "num_tokens": 426656190.0, + "step": 16489 + }, + { + "epoch": 1.8108939160992752, + "grad_norm": 1.8077011108398438, + "learning_rate": 5e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7153578400611877, + "num_tokens": 426681655.0, + "step": 16490 + }, + { + "epoch": 1.811003733801889, + "grad_norm": 1.8398098945617676, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7368617057800293, + "num_tokens": 426705873.0, + "step": 16491 + }, + { + "epoch": 1.8111135515045025, + "grad_norm": 1.624294638633728, + "learning_rate": 5e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7153153419494629, + "num_tokens": 426738736.0, + "step": 16492 + }, + { + "epoch": 1.8112233692071162, + "grad_norm": 1.9508413076400757, + "learning_rate": 5e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7243139147758484, + "num_tokens": 426762318.0, + "step": 16493 + }, + { + "epoch": 1.8113331869097298, + "grad_norm": 1.900759220123291, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7560666799545288, + "num_tokens": 426784818.0, + "step": 16494 + }, + { + "epoch": 1.8114430046123435, + "grad_norm": 1.596675157546997, + "learning_rate": 5e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7213670611381531, + "num_tokens": 426815743.0, + "step": 16495 + }, + { + "epoch": 1.8115528223149573, + "grad_norm": 1.9466012716293335, + "learning_rate": 5e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7334818243980408, + "num_tokens": 426839293.0, + "step": 16496 + }, + { + "epoch": 1.8116626400175708, + "grad_norm": 1.8485543727874756, + "learning_rate": 5e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7330322861671448, + "num_tokens": 426864149.0, + "step": 16497 + }, + { + "epoch": 1.8117724577201844, + "grad_norm": 1.8792951107025146, + "learning_rate": 5e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7245646715164185, + "num_tokens": 426888821.0, + "step": 16498 + }, + { + "epoch": 1.8118822754227981, + "grad_norm": 1.5530893802642822, + "learning_rate": 5e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7322874069213867, + "num_tokens": 426920451.0, + "step": 16499 + }, + { + "epoch": 1.8119920931254119, + "grad_norm": 1.6135611534118652, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7208884954452515, + "num_tokens": 426952523.0, + "step": 16500 + }, + { + "epoch": 1.8121019108280256, + "grad_norm": 1.6473851203918457, + "learning_rate": 5e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7329931259155273, + "num_tokens": 426982365.0, + "step": 16501 + }, + { + "epoch": 1.8122117285306392, + "grad_norm": 1.715132713317871, + "learning_rate": 5e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7338129281997681, + "num_tokens": 427009420.0, + "step": 16502 + }, + { + "epoch": 1.8123215462332527, + "grad_norm": 1.8419862985610962, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7449038624763489, + "num_tokens": 427033144.0, + "step": 16503 + }, + { + "epoch": 1.8124313639358665, + "grad_norm": 1.6849110126495361, + "learning_rate": 5e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7148827314376831, + "num_tokens": 427065076.0, + "step": 16504 + }, + { + "epoch": 1.8125411816384802, + "grad_norm": 1.6864436864852905, + "learning_rate": 5e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7354784607887268, + "num_tokens": 427093234.0, + "step": 16505 + }, + { + "epoch": 1.8126509993410937, + "grad_norm": 1.8881210088729858, + "learning_rate": 5e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7283039689064026, + "num_tokens": 427119586.0, + "step": 16506 + }, + { + "epoch": 1.8127608170437073, + "grad_norm": 1.881819248199463, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7217257022857666, + "num_tokens": 427143625.0, + "step": 16507 + }, + { + "epoch": 1.812870634746321, + "grad_norm": 1.8549935817718506, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.735159158706665, + "num_tokens": 427166790.0, + "step": 16508 + }, + { + "epoch": 1.8129804524489348, + "grad_norm": 1.8164507150650024, + "learning_rate": 5e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7240138053894043, + "num_tokens": 427192361.0, + "step": 16509 + }, + { + "epoch": 1.8130902701515486, + "grad_norm": 1.9140328168869019, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7386945486068726, + "num_tokens": 427215437.0, + "step": 16510 + }, + { + "epoch": 1.813200087854162, + "grad_norm": 1.8966000080108643, + "learning_rate": 5e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7258988618850708, + "num_tokens": 427240921.0, + "step": 16511 + }, + { + "epoch": 1.8133099055567756, + "grad_norm": 1.9382392168045044, + "learning_rate": 5e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.727056086063385, + "num_tokens": 427265294.0, + "step": 16512 + }, + { + "epoch": 1.8134197232593894, + "grad_norm": 1.792313575744629, + "learning_rate": 5e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7161158323287964, + "num_tokens": 427292361.0, + "step": 16513 + }, + { + "epoch": 1.8135295409620031, + "grad_norm": 1.6781039237976074, + "learning_rate": 5e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7242096662521362, + "num_tokens": 427321585.0, + "step": 16514 + }, + { + "epoch": 1.813639358664617, + "grad_norm": 1.887727975845337, + "learning_rate": 5e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7191708087921143, + "num_tokens": 427347949.0, + "step": 16515 + }, + { + "epoch": 1.8137491763672304, + "grad_norm": 1.877256155014038, + "learning_rate": 5e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7744771242141724, + "num_tokens": 427370651.0, + "step": 16516 + }, + { + "epoch": 1.813858994069844, + "grad_norm": 1.5309933423995972, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.745964527130127, + "num_tokens": 427403177.0, + "step": 16517 + }, + { + "epoch": 1.8139688117724577, + "grad_norm": 2.040503740310669, + "learning_rate": 5e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7364505529403687, + "num_tokens": 427423784.0, + "step": 16518 + }, + { + "epoch": 1.8140786294750715, + "grad_norm": 1.6510463953018188, + "learning_rate": 5e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7393107414245605, + "num_tokens": 427453538.0, + "step": 16519 + }, + { + "epoch": 1.814188447177685, + "grad_norm": 1.7227897644042969, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7231916189193726, + "num_tokens": 427482371.0, + "step": 16520 + }, + { + "epoch": 1.8142982648802986, + "grad_norm": 1.9248143434524536, + "learning_rate": 5e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7360065579414368, + "num_tokens": 427504049.0, + "step": 16521 + }, + { + "epoch": 1.8144080825829123, + "grad_norm": 1.8883413076400757, + "learning_rate": 5e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.7505236864089966, + "num_tokens": 427526089.0, + "step": 16522 + }, + { + "epoch": 1.814517900285526, + "grad_norm": 1.7851507663726807, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7221307158470154, + "num_tokens": 427553389.0, + "step": 16523 + }, + { + "epoch": 1.8146277179881398, + "grad_norm": 1.6470173597335815, + "learning_rate": 5e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7172583937644958, + "num_tokens": 427583242.0, + "step": 16524 + }, + { + "epoch": 1.8147375356907534, + "grad_norm": 1.586267352104187, + "learning_rate": 5e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7416737079620361, + "num_tokens": 427614908.0, + "step": 16525 + }, + { + "epoch": 1.814847353393367, + "grad_norm": 1.8959614038467407, + "learning_rate": 5e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7314082980155945, + "num_tokens": 427639031.0, + "step": 16526 + }, + { + "epoch": 1.8149571710959806, + "grad_norm": 1.9924949407577515, + "learning_rate": 5e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7285152673721313, + "num_tokens": 427662175.0, + "step": 16527 + }, + { + "epoch": 1.8150669887985944, + "grad_norm": 1.7575639486312866, + "learning_rate": 5e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7075117826461792, + "num_tokens": 427690181.0, + "step": 16528 + }, + { + "epoch": 1.815176806501208, + "grad_norm": 1.9246838092803955, + "learning_rate": 5e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7322829365730286, + "num_tokens": 427715238.0, + "step": 16529 + }, + { + "epoch": 1.8152866242038217, + "grad_norm": 1.918861746788025, + "learning_rate": 5e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7297526597976685, + "num_tokens": 427738990.0, + "step": 16530 + }, + { + "epoch": 1.8153964419064352, + "grad_norm": 1.6853151321411133, + "learning_rate": 5e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7300567030906677, + "num_tokens": 427765735.0, + "step": 16531 + }, + { + "epoch": 1.815506259609049, + "grad_norm": 1.8775029182434082, + "learning_rate": 5e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7513667345046997, + "num_tokens": 427789129.0, + "step": 16532 + }, + { + "epoch": 1.8156160773116627, + "grad_norm": 1.8092457056045532, + "learning_rate": 5e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.725072979927063, + "num_tokens": 427815074.0, + "step": 16533 + }, + { + "epoch": 1.8157258950142763, + "grad_norm": 1.8055537939071655, + "learning_rate": 5e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.752879798412323, + "num_tokens": 427839494.0, + "step": 16534 + }, + { + "epoch": 1.8158357127168898, + "grad_norm": 1.6886862516403198, + "learning_rate": 5e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7206763029098511, + "num_tokens": 427867590.0, + "step": 16535 + }, + { + "epoch": 1.8159455304195036, + "grad_norm": 1.876570463180542, + "learning_rate": 5e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7307693958282471, + "num_tokens": 427891305.0, + "step": 16536 + }, + { + "epoch": 1.8160553481221173, + "grad_norm": 1.7786500453948975, + "learning_rate": 5e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7194483280181885, + "num_tokens": 427919411.0, + "step": 16537 + }, + { + "epoch": 1.816165165824731, + "grad_norm": 1.7498308420181274, + "learning_rate": 5e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7134033441543579, + "num_tokens": 427947504.0, + "step": 16538 + }, + { + "epoch": 1.8162749835273446, + "grad_norm": 1.7017663717269897, + "learning_rate": 5e-06, + "loss": 0.976, + "mean_token_accuracy": 0.702680230140686, + "num_tokens": 427979057.0, + "step": 16539 + }, + { + "epoch": 1.8163848012299582, + "grad_norm": 1.66668701171875, + "learning_rate": 5e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.748370885848999, + "num_tokens": 428005609.0, + "step": 16540 + }, + { + "epoch": 1.816494618932572, + "grad_norm": 1.646452784538269, + "learning_rate": 5e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7218713164329529, + "num_tokens": 428035320.0, + "step": 16541 + }, + { + "epoch": 1.8166044366351857, + "grad_norm": 1.6574164628982544, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7261747121810913, + "num_tokens": 428064055.0, + "step": 16542 + }, + { + "epoch": 1.8167142543377992, + "grad_norm": 1.7161097526550293, + "learning_rate": 5e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7362493872642517, + "num_tokens": 428090102.0, + "step": 16543 + }, + { + "epoch": 1.816824072040413, + "grad_norm": 1.962298035621643, + "learning_rate": 5e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.72923743724823, + "num_tokens": 428112215.0, + "step": 16544 + }, + { + "epoch": 1.8169338897430265, + "grad_norm": 1.7946237325668335, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7306339740753174, + "num_tokens": 428138440.0, + "step": 16545 + }, + { + "epoch": 1.8170437074456403, + "grad_norm": 1.7810357809066772, + "learning_rate": 5e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7468653321266174, + "num_tokens": 428162070.0, + "step": 16546 + }, + { + "epoch": 1.817153525148254, + "grad_norm": 1.6456106901168823, + "learning_rate": 5e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7109019756317139, + "num_tokens": 428193061.0, + "step": 16547 + }, + { + "epoch": 1.8172633428508675, + "grad_norm": 1.7867733240127563, + "learning_rate": 5e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7247617840766907, + "num_tokens": 428220795.0, + "step": 16548 + }, + { + "epoch": 1.817373160553481, + "grad_norm": 2.005596160888672, + "learning_rate": 5e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7393450140953064, + "num_tokens": 428241501.0, + "step": 16549 + }, + { + "epoch": 1.8174829782560948, + "grad_norm": 1.837581753730774, + "learning_rate": 5e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7316219210624695, + "num_tokens": 428264366.0, + "step": 16550 + }, + { + "epoch": 1.8175927959587086, + "grad_norm": 1.6831245422363281, + "learning_rate": 5e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7543309330940247, + "num_tokens": 428291414.0, + "step": 16551 + }, + { + "epoch": 1.8177026136613224, + "grad_norm": 1.7603771686553955, + "learning_rate": 5e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7346733808517456, + "num_tokens": 428316432.0, + "step": 16552 + }, + { + "epoch": 1.8178124313639359, + "grad_norm": 2.0184502601623535, + "learning_rate": 5e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7546495199203491, + "num_tokens": 428336544.0, + "step": 16553 + }, + { + "epoch": 1.8179222490665494, + "grad_norm": 1.6801165342330933, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7355822324752808, + "num_tokens": 428363056.0, + "step": 16554 + }, + { + "epoch": 1.8180320667691632, + "grad_norm": 1.6798515319824219, + "learning_rate": 5e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7148605585098267, + "num_tokens": 428393096.0, + "step": 16555 + }, + { + "epoch": 1.818141884471777, + "grad_norm": 1.8130171298980713, + "learning_rate": 5e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6923550367355347, + "num_tokens": 428420288.0, + "step": 16556 + }, + { + "epoch": 1.8182517021743905, + "grad_norm": 1.9824159145355225, + "learning_rate": 5e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7217139601707458, + "num_tokens": 428443135.0, + "step": 16557 + }, + { + "epoch": 1.818361519877004, + "grad_norm": 1.6720211505889893, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7424484491348267, + "num_tokens": 428472505.0, + "step": 16558 + }, + { + "epoch": 1.8184713375796178, + "grad_norm": 2.036740779876709, + "learning_rate": 5e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7469278573989868, + "num_tokens": 428491669.0, + "step": 16559 + }, + { + "epoch": 1.8185811552822315, + "grad_norm": 1.72311532497406, + "learning_rate": 5e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7398642301559448, + "num_tokens": 428519412.0, + "step": 16560 + }, + { + "epoch": 1.8186909729848453, + "grad_norm": 1.8236489295959473, + "learning_rate": 5e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7044123411178589, + "num_tokens": 428546815.0, + "step": 16561 + }, + { + "epoch": 1.8188007906874588, + "grad_norm": 1.8326250314712524, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7264283895492554, + "num_tokens": 428571901.0, + "step": 16562 + }, + { + "epoch": 1.8189106083900723, + "grad_norm": 1.7131596803665161, + "learning_rate": 5e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7508164644241333, + "num_tokens": 428600286.0, + "step": 16563 + }, + { + "epoch": 1.819020426092686, + "grad_norm": 1.8286428451538086, + "learning_rate": 5e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7419931888580322, + "num_tokens": 428623684.0, + "step": 16564 + }, + { + "epoch": 1.8191302437952999, + "grad_norm": 1.8556861877441406, + "learning_rate": 5e-06, + "loss": 0.938, + "mean_token_accuracy": 0.70823073387146, + "num_tokens": 428650046.0, + "step": 16565 + }, + { + "epoch": 1.8192400614979136, + "grad_norm": 1.8905115127563477, + "learning_rate": 5e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.752687931060791, + "num_tokens": 428672461.0, + "step": 16566 + }, + { + "epoch": 1.8193498792005272, + "grad_norm": 1.6191060543060303, + "learning_rate": 5e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7337278127670288, + "num_tokens": 428702126.0, + "step": 16567 + }, + { + "epoch": 1.8194596969031407, + "grad_norm": 1.869342565536499, + "learning_rate": 5e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7615243196487427, + "num_tokens": 428723976.0, + "step": 16568 + }, + { + "epoch": 1.8195695146057544, + "grad_norm": 1.7426652908325195, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7153820395469666, + "num_tokens": 428751629.0, + "step": 16569 + }, + { + "epoch": 1.8196793323083682, + "grad_norm": 1.90855073928833, + "learning_rate": 5e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7163739204406738, + "num_tokens": 428775748.0, + "step": 16570 + }, + { + "epoch": 1.8197891500109817, + "grad_norm": 1.7277510166168213, + "learning_rate": 5e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7184669971466064, + "num_tokens": 428801635.0, + "step": 16571 + }, + { + "epoch": 1.8198989677135953, + "grad_norm": 1.8108587265014648, + "learning_rate": 5e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7161623239517212, + "num_tokens": 428827032.0, + "step": 16572 + }, + { + "epoch": 1.820008785416209, + "grad_norm": 1.8849951028823853, + "learning_rate": 5e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.739291787147522, + "num_tokens": 428850204.0, + "step": 16573 + }, + { + "epoch": 1.8201186031188228, + "grad_norm": 1.989593505859375, + "learning_rate": 5e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7569096684455872, + "num_tokens": 428871506.0, + "step": 16574 + }, + { + "epoch": 1.8202284208214365, + "grad_norm": 1.8382900953292847, + "learning_rate": 5e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.707709550857544, + "num_tokens": 428898069.0, + "step": 16575 + }, + { + "epoch": 1.82033823852405, + "grad_norm": 1.8101695775985718, + "learning_rate": 5e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7336971163749695, + "num_tokens": 428923660.0, + "step": 16576 + }, + { + "epoch": 1.8204480562266636, + "grad_norm": 1.9697644710540771, + "learning_rate": 5e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7345582842826843, + "num_tokens": 428944237.0, + "step": 16577 + }, + { + "epoch": 1.8205578739292774, + "grad_norm": 1.8471745252609253, + "learning_rate": 5e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7222671508789062, + "num_tokens": 428969666.0, + "step": 16578 + }, + { + "epoch": 1.8206676916318911, + "grad_norm": 1.910845160484314, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7435407638549805, + "num_tokens": 428992216.0, + "step": 16579 + }, + { + "epoch": 1.8207775093345049, + "grad_norm": 2.0118777751922607, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7221133708953857, + "num_tokens": 429015773.0, + "step": 16580 + }, + { + "epoch": 1.8208873270371184, + "grad_norm": 2.0130720138549805, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7355229258537292, + "num_tokens": 429035922.0, + "step": 16581 + }, + { + "epoch": 1.820997144739732, + "grad_norm": 1.705368161201477, + "learning_rate": 5e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7356696724891663, + "num_tokens": 429064620.0, + "step": 16582 + }, + { + "epoch": 1.8211069624423457, + "grad_norm": 1.8475900888442993, + "learning_rate": 5e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7277364134788513, + "num_tokens": 429090495.0, + "step": 16583 + }, + { + "epoch": 1.8212167801449595, + "grad_norm": 1.7132946252822876, + "learning_rate": 5e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7195658683776855, + "num_tokens": 429116251.0, + "step": 16584 + }, + { + "epoch": 1.821326597847573, + "grad_norm": 1.921097993850708, + "learning_rate": 5e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7459143400192261, + "num_tokens": 429138394.0, + "step": 16585 + }, + { + "epoch": 1.8214364155501865, + "grad_norm": 1.7510356903076172, + "learning_rate": 5e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7256617546081543, + "num_tokens": 429165697.0, + "step": 16586 + }, + { + "epoch": 1.8215462332528003, + "grad_norm": 1.594028115272522, + "learning_rate": 5e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7441282272338867, + "num_tokens": 429196733.0, + "step": 16587 + }, + { + "epoch": 1.821656050955414, + "grad_norm": 1.7138768434524536, + "learning_rate": 5e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7670614123344421, + "num_tokens": 429221876.0, + "step": 16588 + }, + { + "epoch": 1.8217658686580278, + "grad_norm": 1.8860563039779663, + "learning_rate": 5e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7032891511917114, + "num_tokens": 429249591.0, + "step": 16589 + }, + { + "epoch": 1.8218756863606413, + "grad_norm": 1.6888867616653442, + "learning_rate": 5e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7046831846237183, + "num_tokens": 429279917.0, + "step": 16590 + }, + { + "epoch": 1.8219855040632549, + "grad_norm": 1.828153371810913, + "learning_rate": 5e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7322035431861877, + "num_tokens": 429308147.0, + "step": 16591 + }, + { + "epoch": 1.8220953217658686, + "grad_norm": 2.1248464584350586, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7267134785652161, + "num_tokens": 429329921.0, + "step": 16592 + }, + { + "epoch": 1.8222051394684824, + "grad_norm": 1.8265918493270874, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7069977521896362, + "num_tokens": 429358220.0, + "step": 16593 + }, + { + "epoch": 1.822314957171096, + "grad_norm": 1.906168818473816, + "learning_rate": 5e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7529051899909973, + "num_tokens": 429381319.0, + "step": 16594 + }, + { + "epoch": 1.8224247748737097, + "grad_norm": 1.6910206079483032, + "learning_rate": 5e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6868941783905029, + "num_tokens": 429413103.0, + "step": 16595 + }, + { + "epoch": 1.8225345925763232, + "grad_norm": 1.802380084991455, + "learning_rate": 5e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7283613681793213, + "num_tokens": 429438844.0, + "step": 16596 + }, + { + "epoch": 1.822644410278937, + "grad_norm": 1.7445510625839233, + "learning_rate": 5e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7044708728790283, + "num_tokens": 429467999.0, + "step": 16597 + }, + { + "epoch": 1.8227542279815507, + "grad_norm": 2.0064234733581543, + "learning_rate": 5e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7306210994720459, + "num_tokens": 429489815.0, + "step": 16598 + }, + { + "epoch": 1.8228640456841643, + "grad_norm": 1.752795934677124, + "learning_rate": 5e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7343263626098633, + "num_tokens": 429517020.0, + "step": 16599 + }, + { + "epoch": 1.8229738633867778, + "grad_norm": 1.797141194343567, + "learning_rate": 5e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7311787605285645, + "num_tokens": 429543780.0, + "step": 16600 + }, + { + "epoch": 1.8230836810893916, + "grad_norm": 1.7078520059585571, + "learning_rate": 5e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7060528993606567, + "num_tokens": 429573915.0, + "step": 16601 + }, + { + "epoch": 1.8231934987920053, + "grad_norm": 1.9223562479019165, + "learning_rate": 5e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7197485566139221, + "num_tokens": 429599155.0, + "step": 16602 + }, + { + "epoch": 1.823303316494619, + "grad_norm": 1.819153070449829, + "learning_rate": 5e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7440261244773865, + "num_tokens": 429624740.0, + "step": 16603 + }, + { + "epoch": 1.8234131341972326, + "grad_norm": 1.6964558362960815, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7264270782470703, + "num_tokens": 429654932.0, + "step": 16604 + }, + { + "epoch": 1.8235229518998461, + "grad_norm": 1.6080580949783325, + "learning_rate": 5e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7182175517082214, + "num_tokens": 429686246.0, + "step": 16605 + }, + { + "epoch": 1.82363276960246, + "grad_norm": 1.9739145040512085, + "learning_rate": 5e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7354780435562134, + "num_tokens": 429710250.0, + "step": 16606 + }, + { + "epoch": 1.8237425873050737, + "grad_norm": 1.8230522871017456, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7382376194000244, + "num_tokens": 429735266.0, + "step": 16607 + }, + { + "epoch": 1.8238524050076872, + "grad_norm": 1.8801751136779785, + "learning_rate": 5e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7198553085327148, + "num_tokens": 429760541.0, + "step": 16608 + }, + { + "epoch": 1.823962222710301, + "grad_norm": 1.748197317123413, + "learning_rate": 5e-06, + "loss": 0.899, + "mean_token_accuracy": 0.726954996585846, + "num_tokens": 429789681.0, + "step": 16609 + }, + { + "epoch": 1.8240720404129145, + "grad_norm": 1.663895606994629, + "learning_rate": 5e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.717402458190918, + "num_tokens": 429817594.0, + "step": 16610 + }, + { + "epoch": 1.8241818581155282, + "grad_norm": 1.8549877405166626, + "learning_rate": 5e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7501741647720337, + "num_tokens": 429842823.0, + "step": 16611 + }, + { + "epoch": 1.824291675818142, + "grad_norm": 1.8728057146072388, + "learning_rate": 5e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7205408811569214, + "num_tokens": 429866379.0, + "step": 16612 + }, + { + "epoch": 1.8244014935207555, + "grad_norm": 1.6745975017547607, + "learning_rate": 5e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7208036184310913, + "num_tokens": 429894499.0, + "step": 16613 + }, + { + "epoch": 1.824511311223369, + "grad_norm": 1.7481179237365723, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7216053009033203, + "num_tokens": 429922713.0, + "step": 16614 + }, + { + "epoch": 1.8246211289259828, + "grad_norm": 1.7262070178985596, + "learning_rate": 5e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7257863283157349, + "num_tokens": 429948886.0, + "step": 16615 + }, + { + "epoch": 1.8247309466285966, + "grad_norm": 2.061882257461548, + "learning_rate": 5e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7432816028594971, + "num_tokens": 429970860.0, + "step": 16616 + }, + { + "epoch": 1.8248407643312103, + "grad_norm": 1.8838156461715698, + "learning_rate": 5e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7260868549346924, + "num_tokens": 429995263.0, + "step": 16617 + }, + { + "epoch": 1.8249505820338239, + "grad_norm": 1.7380279302597046, + "learning_rate": 5e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7395232915878296, + "num_tokens": 430022499.0, + "step": 16618 + }, + { + "epoch": 1.8250603997364374, + "grad_norm": 1.7746343612670898, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7408608198165894, + "num_tokens": 430046846.0, + "step": 16619 + }, + { + "epoch": 1.8251702174390512, + "grad_norm": 1.8680715560913086, + "learning_rate": 5e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.728362500667572, + "num_tokens": 430071280.0, + "step": 16620 + }, + { + "epoch": 1.825280035141665, + "grad_norm": 1.793679118156433, + "learning_rate": 5e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7165858745574951, + "num_tokens": 430096773.0, + "step": 16621 + }, + { + "epoch": 1.8253898528442785, + "grad_norm": 1.7529367208480835, + "learning_rate": 5e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7199934720993042, + "num_tokens": 430125407.0, + "step": 16622 + }, + { + "epoch": 1.825499670546892, + "grad_norm": 1.8097237348556519, + "learning_rate": 5e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7304919362068176, + "num_tokens": 430148597.0, + "step": 16623 + }, + { + "epoch": 1.8256094882495058, + "grad_norm": 1.9079632759094238, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7541340589523315, + "num_tokens": 430171820.0, + "step": 16624 + }, + { + "epoch": 1.8257193059521195, + "grad_norm": 1.8743282556533813, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7153985500335693, + "num_tokens": 430197653.0, + "step": 16625 + }, + { + "epoch": 1.8258291236547333, + "grad_norm": 1.668077826499939, + "learning_rate": 5e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7426488399505615, + "num_tokens": 430225176.0, + "step": 16626 + }, + { + "epoch": 1.8259389413573468, + "grad_norm": 1.5953537225723267, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.730873703956604, + "num_tokens": 430256137.0, + "step": 16627 + }, + { + "epoch": 1.8260487590599603, + "grad_norm": 1.7872666120529175, + "learning_rate": 5e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7150715589523315, + "num_tokens": 430280919.0, + "step": 16628 + }, + { + "epoch": 1.826158576762574, + "grad_norm": 1.763269305229187, + "learning_rate": 5e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7335101366043091, + "num_tokens": 430307431.0, + "step": 16629 + }, + { + "epoch": 1.8262683944651878, + "grad_norm": 1.5991448163986206, + "learning_rate": 5e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7379653453826904, + "num_tokens": 430336546.0, + "step": 16630 + }, + { + "epoch": 1.8263782121678016, + "grad_norm": 1.7040042877197266, + "learning_rate": 5e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7324721813201904, + "num_tokens": 430365704.0, + "step": 16631 + }, + { + "epoch": 1.8264880298704151, + "grad_norm": 2.2682433128356934, + "learning_rate": 5e-06, + "loss": 0.8067, + "mean_token_accuracy": 0.7425040602684021, + "num_tokens": 430384636.0, + "step": 16632 + }, + { + "epoch": 1.8265978475730287, + "grad_norm": 1.8617838621139526, + "learning_rate": 5e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.724073588848114, + "num_tokens": 430409237.0, + "step": 16633 + }, + { + "epoch": 1.8267076652756424, + "grad_norm": 1.6598564386367798, + "learning_rate": 5e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7160539627075195, + "num_tokens": 430437845.0, + "step": 16634 + }, + { + "epoch": 1.8268174829782562, + "grad_norm": 1.795301079750061, + "learning_rate": 5e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.724942684173584, + "num_tokens": 430462739.0, + "step": 16635 + }, + { + "epoch": 1.8269273006808697, + "grad_norm": 1.8099137544631958, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.726943850517273, + "num_tokens": 430489084.0, + "step": 16636 + }, + { + "epoch": 1.8270371183834833, + "grad_norm": 1.9697086811065674, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7264915704727173, + "num_tokens": 430511043.0, + "step": 16637 + }, + { + "epoch": 1.827146936086097, + "grad_norm": 1.9836702346801758, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7345641851425171, + "num_tokens": 430533066.0, + "step": 16638 + }, + { + "epoch": 1.8272567537887108, + "grad_norm": 1.8767937421798706, + "learning_rate": 5e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7545891404151917, + "num_tokens": 430554834.0, + "step": 16639 + }, + { + "epoch": 1.8273665714913245, + "grad_norm": 2.11275577545166, + "learning_rate": 5e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7630877494812012, + "num_tokens": 430572723.0, + "step": 16640 + }, + { + "epoch": 1.827476389193938, + "grad_norm": 1.7478939294815063, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.747400164604187, + "num_tokens": 430600453.0, + "step": 16641 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 2.0134787559509277, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7368297576904297, + "num_tokens": 430624813.0, + "step": 16642 + }, + { + "epoch": 1.8276960245991654, + "grad_norm": 1.6576703786849976, + "learning_rate": 5e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.731924295425415, + "num_tokens": 430655494.0, + "step": 16643 + }, + { + "epoch": 1.8278058423017791, + "grad_norm": 1.6582978963851929, + "learning_rate": 5e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7138599157333374, + "num_tokens": 430686403.0, + "step": 16644 + }, + { + "epoch": 1.8279156600043929, + "grad_norm": 2.0015361309051514, + "learning_rate": 5e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7156479954719543, + "num_tokens": 430708658.0, + "step": 16645 + }, + { + "epoch": 1.8280254777070064, + "grad_norm": 1.9223276376724243, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7321921586990356, + "num_tokens": 430730858.0, + "step": 16646 + }, + { + "epoch": 1.82813529540962, + "grad_norm": 1.7212110757827759, + "learning_rate": 5e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.712944746017456, + "num_tokens": 430760207.0, + "step": 16647 + }, + { + "epoch": 1.8282451131122337, + "grad_norm": 1.8494858741760254, + "learning_rate": 5e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7105106711387634, + "num_tokens": 430785639.0, + "step": 16648 + }, + { + "epoch": 1.8283549308148475, + "grad_norm": 1.527240514755249, + "learning_rate": 5e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7376108169555664, + "num_tokens": 430817409.0, + "step": 16649 + }, + { + "epoch": 1.828464748517461, + "grad_norm": 1.6446353197097778, + "learning_rate": 5e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7195888161659241, + "num_tokens": 430848048.0, + "step": 16650 + }, + { + "epoch": 1.8285745662200745, + "grad_norm": 1.7272371053695679, + "learning_rate": 5e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7253973484039307, + "num_tokens": 430875179.0, + "step": 16651 + }, + { + "epoch": 1.8286843839226883, + "grad_norm": 1.6529500484466553, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7124783992767334, + "num_tokens": 430904489.0, + "step": 16652 + }, + { + "epoch": 1.828794201625302, + "grad_norm": 1.6492637395858765, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.72123783826828, + "num_tokens": 430935872.0, + "step": 16653 + }, + { + "epoch": 1.8289040193279158, + "grad_norm": 1.8669328689575195, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7412213087081909, + "num_tokens": 430959482.0, + "step": 16654 + }, + { + "epoch": 1.8290138370305293, + "grad_norm": 1.61329185962677, + "learning_rate": 5e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7503508925437927, + "num_tokens": 430987566.0, + "step": 16655 + }, + { + "epoch": 1.8291236547331429, + "grad_norm": 2.182687520980835, + "learning_rate": 5e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7497780323028564, + "num_tokens": 431004910.0, + "step": 16656 + }, + { + "epoch": 1.8292334724357566, + "grad_norm": 1.6576710939407349, + "learning_rate": 5e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7080556750297546, + "num_tokens": 431037948.0, + "step": 16657 + }, + { + "epoch": 1.8293432901383704, + "grad_norm": 2.135033369064331, + "learning_rate": 5e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7225111722946167, + "num_tokens": 431061444.0, + "step": 16658 + }, + { + "epoch": 1.829453107840984, + "grad_norm": 1.8298099040985107, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7245192527770996, + "num_tokens": 431087608.0, + "step": 16659 + }, + { + "epoch": 1.8295629255435977, + "grad_norm": 1.9595067501068115, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7452273964881897, + "num_tokens": 431109741.0, + "step": 16660 + }, + { + "epoch": 1.8296727432462112, + "grad_norm": 1.6570974588394165, + "learning_rate": 5e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7383460998535156, + "num_tokens": 431136842.0, + "step": 16661 + }, + { + "epoch": 1.829782560948825, + "grad_norm": 1.8213435411453247, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7259417176246643, + "num_tokens": 431161767.0, + "step": 16662 + }, + { + "epoch": 1.8298923786514387, + "grad_norm": 1.6462104320526123, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7296266555786133, + "num_tokens": 431192878.0, + "step": 16663 + }, + { + "epoch": 1.8300021963540523, + "grad_norm": 1.9698559045791626, + "learning_rate": 5e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7537863254547119, + "num_tokens": 431213680.0, + "step": 16664 + }, + { + "epoch": 1.8301120140566658, + "grad_norm": 1.8103349208831787, + "learning_rate": 5e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7376574277877808, + "num_tokens": 431237800.0, + "step": 16665 + }, + { + "epoch": 1.8302218317592795, + "grad_norm": 1.6610376834869385, + "learning_rate": 5e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7198212146759033, + "num_tokens": 431267374.0, + "step": 16666 + }, + { + "epoch": 1.8303316494618933, + "grad_norm": 1.9076290130615234, + "learning_rate": 5e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7237181663513184, + "num_tokens": 431293127.0, + "step": 16667 + }, + { + "epoch": 1.830441467164507, + "grad_norm": 1.7052407264709473, + "learning_rate": 5e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7369977235794067, + "num_tokens": 431319601.0, + "step": 16668 + }, + { + "epoch": 1.8305512848671206, + "grad_norm": 1.9054574966430664, + "learning_rate": 5e-06, + "loss": 0.814, + "mean_token_accuracy": 0.7395950555801392, + "num_tokens": 431342372.0, + "step": 16669 + }, + { + "epoch": 1.8306611025697341, + "grad_norm": 1.8335036039352417, + "learning_rate": 5e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.728792667388916, + "num_tokens": 431368622.0, + "step": 16670 + }, + { + "epoch": 1.830770920272348, + "grad_norm": 1.8242199420928955, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.74309903383255, + "num_tokens": 431392865.0, + "step": 16671 + }, + { + "epoch": 1.8308807379749616, + "grad_norm": 1.7097302675247192, + "learning_rate": 5e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.739155113697052, + "num_tokens": 431417995.0, + "step": 16672 + }, + { + "epoch": 1.8309905556775752, + "grad_norm": 1.6376395225524902, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7384812235832214, + "num_tokens": 431447260.0, + "step": 16673 + }, + { + "epoch": 1.831100373380189, + "grad_norm": 1.9395759105682373, + "learning_rate": 5e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7168616056442261, + "num_tokens": 431472335.0, + "step": 16674 + }, + { + "epoch": 1.8312101910828025, + "grad_norm": 1.7998485565185547, + "learning_rate": 5e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7143983244895935, + "num_tokens": 431497642.0, + "step": 16675 + }, + { + "epoch": 1.8313200087854162, + "grad_norm": 2.154937505722046, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7609843611717224, + "num_tokens": 431516324.0, + "step": 16676 + }, + { + "epoch": 1.83142982648803, + "grad_norm": 1.880804419517517, + "learning_rate": 5e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7261639833450317, + "num_tokens": 431541458.0, + "step": 16677 + }, + { + "epoch": 1.8315396441906435, + "grad_norm": 1.8413490056991577, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.72003573179245, + "num_tokens": 431566880.0, + "step": 16678 + }, + { + "epoch": 1.831649461893257, + "grad_norm": 1.9911818504333496, + "learning_rate": 5e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7505702972412109, + "num_tokens": 431588276.0, + "step": 16679 + }, + { + "epoch": 1.8317592795958708, + "grad_norm": 1.9389644861221313, + "learning_rate": 5e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7271535396575928, + "num_tokens": 431611173.0, + "step": 16680 + }, + { + "epoch": 1.8318690972984846, + "grad_norm": 1.9402250051498413, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7571316957473755, + "num_tokens": 431631105.0, + "step": 16681 + }, + { + "epoch": 1.8319789150010983, + "grad_norm": 2.1968741416931152, + "learning_rate": 5e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7358411550521851, + "num_tokens": 431649782.0, + "step": 16682 + }, + { + "epoch": 1.8320887327037119, + "grad_norm": 1.892215371131897, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7324891686439514, + "num_tokens": 431674761.0, + "step": 16683 + }, + { + "epoch": 1.8321985504063254, + "grad_norm": 1.8894245624542236, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.728108286857605, + "num_tokens": 431701212.0, + "step": 16684 + }, + { + "epoch": 1.8323083681089392, + "grad_norm": 1.8823704719543457, + "learning_rate": 5e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7349961400032043, + "num_tokens": 431725169.0, + "step": 16685 + }, + { + "epoch": 1.832418185811553, + "grad_norm": 1.6367924213409424, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7223285436630249, + "num_tokens": 431753611.0, + "step": 16686 + }, + { + "epoch": 1.8325280035141664, + "grad_norm": 1.8480161428451538, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7360858917236328, + "num_tokens": 431778359.0, + "step": 16687 + }, + { + "epoch": 1.83263782121678, + "grad_norm": 2.1007468700408936, + "learning_rate": 5e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7388474941253662, + "num_tokens": 431797198.0, + "step": 16688 + }, + { + "epoch": 1.8327476389193937, + "grad_norm": 1.8225661516189575, + "learning_rate": 5e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7143621444702148, + "num_tokens": 431824969.0, + "step": 16689 + }, + { + "epoch": 1.8328574566220075, + "grad_norm": 1.92228102684021, + "learning_rate": 5e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7127728462219238, + "num_tokens": 431852901.0, + "step": 16690 + }, + { + "epoch": 1.8329672743246213, + "grad_norm": 1.790714979171753, + "learning_rate": 5e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7173089981079102, + "num_tokens": 431880059.0, + "step": 16691 + }, + { + "epoch": 1.8330770920272348, + "grad_norm": 1.6786205768585205, + "learning_rate": 5e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7262771129608154, + "num_tokens": 431908971.0, + "step": 16692 + }, + { + "epoch": 1.8331869097298483, + "grad_norm": 1.9956895112991333, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7187176942825317, + "num_tokens": 431936321.0, + "step": 16693 + }, + { + "epoch": 1.833296727432462, + "grad_norm": 1.7115271091461182, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7334373593330383, + "num_tokens": 431965876.0, + "step": 16694 + }, + { + "epoch": 1.8334065451350758, + "grad_norm": 1.8849058151245117, + "learning_rate": 5e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7376112937927246, + "num_tokens": 431988600.0, + "step": 16695 + }, + { + "epoch": 1.8335163628376896, + "grad_norm": 1.6516259908676147, + "learning_rate": 5e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7027921676635742, + "num_tokens": 432019149.0, + "step": 16696 + }, + { + "epoch": 1.8336261805403031, + "grad_norm": 1.949445366859436, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7411477565765381, + "num_tokens": 432042081.0, + "step": 16697 + }, + { + "epoch": 1.8337359982429167, + "grad_norm": 1.957836627960205, + "learning_rate": 5e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7534563541412354, + "num_tokens": 432061544.0, + "step": 16698 + }, + { + "epoch": 1.8338458159455304, + "grad_norm": 1.6315826177597046, + "learning_rate": 5e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7092225551605225, + "num_tokens": 432095507.0, + "step": 16699 + }, + { + "epoch": 1.8339556336481442, + "grad_norm": 1.749245524406433, + "learning_rate": 5e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7669917345046997, + "num_tokens": 432118785.0, + "step": 16700 + }, + { + "epoch": 1.8340654513507577, + "grad_norm": 1.9316328763961792, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7313032150268555, + "num_tokens": 432141570.0, + "step": 16701 + }, + { + "epoch": 1.8341752690533712, + "grad_norm": 1.9320460557937622, + "learning_rate": 5e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7354497909545898, + "num_tokens": 432164257.0, + "step": 16702 + }, + { + "epoch": 1.834285086755985, + "grad_norm": 1.6960718631744385, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7312220335006714, + "num_tokens": 432191182.0, + "step": 16703 + }, + { + "epoch": 1.8343949044585988, + "grad_norm": 1.6498222351074219, + "learning_rate": 5e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7374019622802734, + "num_tokens": 432219712.0, + "step": 16704 + }, + { + "epoch": 1.8345047221612125, + "grad_norm": 1.9409091472625732, + "learning_rate": 5e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7343574166297913, + "num_tokens": 432243154.0, + "step": 16705 + }, + { + "epoch": 1.834614539863826, + "grad_norm": 1.7175610065460205, + "learning_rate": 5e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7434796094894409, + "num_tokens": 432269512.0, + "step": 16706 + }, + { + "epoch": 1.8347243575664396, + "grad_norm": 1.614172339439392, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7309447526931763, + "num_tokens": 432300000.0, + "step": 16707 + }, + { + "epoch": 1.8348341752690533, + "grad_norm": 1.8953735828399658, + "learning_rate": 5e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7467398643493652, + "num_tokens": 432324439.0, + "step": 16708 + }, + { + "epoch": 1.834943992971667, + "grad_norm": 1.9076770544052124, + "learning_rate": 5e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.706027090549469, + "num_tokens": 432351931.0, + "step": 16709 + }, + { + "epoch": 1.8350538106742806, + "grad_norm": 1.881934642791748, + "learning_rate": 5e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7163089513778687, + "num_tokens": 432375349.0, + "step": 16710 + }, + { + "epoch": 1.8351636283768944, + "grad_norm": 1.7024273872375488, + "learning_rate": 5e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6935552954673767, + "num_tokens": 432406533.0, + "step": 16711 + }, + { + "epoch": 1.835273446079508, + "grad_norm": 1.6752780675888062, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7109159231185913, + "num_tokens": 432436844.0, + "step": 16712 + }, + { + "epoch": 1.8353832637821217, + "grad_norm": 1.8332831859588623, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.733277440071106, + "num_tokens": 432460525.0, + "step": 16713 + }, + { + "epoch": 1.8354930814847354, + "grad_norm": 1.8666553497314453, + "learning_rate": 5e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7194064259529114, + "num_tokens": 432484570.0, + "step": 16714 + }, + { + "epoch": 1.835602899187349, + "grad_norm": 1.9332417249679565, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7222025394439697, + "num_tokens": 432510601.0, + "step": 16715 + }, + { + "epoch": 1.8357127168899625, + "grad_norm": 1.8705600500106812, + "learning_rate": 5e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7451145052909851, + "num_tokens": 432534104.0, + "step": 16716 + }, + { + "epoch": 1.8358225345925763, + "grad_norm": 1.609736680984497, + "learning_rate": 5e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7504854202270508, + "num_tokens": 432561870.0, + "step": 16717 + }, + { + "epoch": 1.83593235229519, + "grad_norm": 1.7018051147460938, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7375012636184692, + "num_tokens": 432587252.0, + "step": 16718 + }, + { + "epoch": 1.8360421699978038, + "grad_norm": 1.882150411605835, + "learning_rate": 5e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7263138294219971, + "num_tokens": 432613584.0, + "step": 16719 + }, + { + "epoch": 1.8361519877004173, + "grad_norm": 1.9073406457901, + "learning_rate": 5e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7413051724433899, + "num_tokens": 432635165.0, + "step": 16720 + }, + { + "epoch": 1.8362618054030309, + "grad_norm": 1.9235055446624756, + "learning_rate": 5e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7276197671890259, + "num_tokens": 432656861.0, + "step": 16721 + }, + { + "epoch": 1.8363716231056446, + "grad_norm": 2.019383192062378, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.73757004737854, + "num_tokens": 432677352.0, + "step": 16722 + }, + { + "epoch": 1.8364814408082584, + "grad_norm": 1.7620974779129028, + "learning_rate": 5e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7443697452545166, + "num_tokens": 432702368.0, + "step": 16723 + }, + { + "epoch": 1.836591258510872, + "grad_norm": 1.6580345630645752, + "learning_rate": 5e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.7393507361412048, + "num_tokens": 432729165.0, + "step": 16724 + }, + { + "epoch": 1.8367010762134857, + "grad_norm": 1.836560606956482, + "learning_rate": 5e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7320799827575684, + "num_tokens": 432754181.0, + "step": 16725 + }, + { + "epoch": 1.8368108939160992, + "grad_norm": 1.870355248451233, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7457891702651978, + "num_tokens": 432776064.0, + "step": 16726 + }, + { + "epoch": 1.836920711618713, + "grad_norm": 1.854077696800232, + "learning_rate": 5e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7201614379882812, + "num_tokens": 432801457.0, + "step": 16727 + }, + { + "epoch": 1.8370305293213267, + "grad_norm": 1.8761154413223267, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7298385500907898, + "num_tokens": 432824651.0, + "step": 16728 + }, + { + "epoch": 1.8371403470239402, + "grad_norm": 1.8386355638504028, + "learning_rate": 5e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.71998530626297, + "num_tokens": 432851064.0, + "step": 16729 + }, + { + "epoch": 1.8372501647265538, + "grad_norm": 1.439719319343567, + "learning_rate": 5e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7114200592041016, + "num_tokens": 432886394.0, + "step": 16730 + }, + { + "epoch": 1.8373599824291675, + "grad_norm": 1.7752562761306763, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7417460680007935, + "num_tokens": 432910452.0, + "step": 16731 + }, + { + "epoch": 1.8374698001317813, + "grad_norm": 1.5918477773666382, + "learning_rate": 5e-06, + "loss": 0.898, + "mean_token_accuracy": 0.718933641910553, + "num_tokens": 432941753.0, + "step": 16732 + }, + { + "epoch": 1.837579617834395, + "grad_norm": 1.7148123979568481, + "learning_rate": 5e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7293529510498047, + "num_tokens": 432968925.0, + "step": 16733 + }, + { + "epoch": 1.8376894355370086, + "grad_norm": 1.9661744832992554, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.725933849811554, + "num_tokens": 432990173.0, + "step": 16734 + }, + { + "epoch": 1.8377992532396221, + "grad_norm": 1.7726644277572632, + "learning_rate": 5e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7283877730369568, + "num_tokens": 433017633.0, + "step": 16735 + }, + { + "epoch": 1.8379090709422359, + "grad_norm": 1.7639135122299194, + "learning_rate": 5e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7407076954841614, + "num_tokens": 433044754.0, + "step": 16736 + }, + { + "epoch": 1.8380188886448496, + "grad_norm": 1.930590033531189, + "learning_rate": 5e-06, + "loss": 0.83, + "mean_token_accuracy": 0.72978276014328, + "num_tokens": 433067843.0, + "step": 16737 + }, + { + "epoch": 1.8381287063474632, + "grad_norm": 1.9514262676239014, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7386483550071716, + "num_tokens": 433088968.0, + "step": 16738 + }, + { + "epoch": 1.838238524050077, + "grad_norm": 1.7029753923416138, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.714028537273407, + "num_tokens": 433117777.0, + "step": 16739 + }, + { + "epoch": 1.8383483417526905, + "grad_norm": 1.904136300086975, + "learning_rate": 5e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7467522621154785, + "num_tokens": 433139659.0, + "step": 16740 + }, + { + "epoch": 1.8384581594553042, + "grad_norm": 1.9474928379058838, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.747008204460144, + "num_tokens": 433162460.0, + "step": 16741 + }, + { + "epoch": 1.838567977157918, + "grad_norm": 2.1760263442993164, + "learning_rate": 5e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.758637547492981, + "num_tokens": 433180412.0, + "step": 16742 + }, + { + "epoch": 1.8386777948605315, + "grad_norm": 1.8620669841766357, + "learning_rate": 5e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.7359648942947388, + "num_tokens": 433204429.0, + "step": 16743 + }, + { + "epoch": 1.838787612563145, + "grad_norm": 1.8184459209442139, + "learning_rate": 5e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7269765734672546, + "num_tokens": 433230597.0, + "step": 16744 + }, + { + "epoch": 1.8388974302657588, + "grad_norm": 1.9553935527801514, + "learning_rate": 5e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7373679280281067, + "num_tokens": 433252055.0, + "step": 16745 + }, + { + "epoch": 1.8390072479683726, + "grad_norm": 1.766379475593567, + "learning_rate": 5e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7386240363121033, + "num_tokens": 433279801.0, + "step": 16746 + }, + { + "epoch": 1.8391170656709863, + "grad_norm": 1.9342467784881592, + "learning_rate": 5e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7284553647041321, + "num_tokens": 433302389.0, + "step": 16747 + }, + { + "epoch": 1.8392268833735999, + "grad_norm": 1.8540931940078735, + "learning_rate": 5e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7210469245910645, + "num_tokens": 433330496.0, + "step": 16748 + }, + { + "epoch": 1.8393367010762134, + "grad_norm": 1.8186531066894531, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.730316698551178, + "num_tokens": 433357055.0, + "step": 16749 + }, + { + "epoch": 1.8394465187788271, + "grad_norm": 1.9338879585266113, + "learning_rate": 5e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7424229979515076, + "num_tokens": 433380221.0, + "step": 16750 + }, + { + "epoch": 1.839556336481441, + "grad_norm": 1.9758549928665161, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7277987003326416, + "num_tokens": 433402128.0, + "step": 16751 + }, + { + "epoch": 1.8396661541840544, + "grad_norm": 2.0745339393615723, + "learning_rate": 5e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7485190033912659, + "num_tokens": 433422039.0, + "step": 16752 + }, + { + "epoch": 1.839775971886668, + "grad_norm": 2.0112175941467285, + "learning_rate": 5e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7223333716392517, + "num_tokens": 433443800.0, + "step": 16753 + }, + { + "epoch": 1.8398857895892817, + "grad_norm": 1.7400811910629272, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7214338779449463, + "num_tokens": 433472556.0, + "step": 16754 + }, + { + "epoch": 1.8399956072918955, + "grad_norm": 2.134991407394409, + "learning_rate": 5e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7413021326065063, + "num_tokens": 433491998.0, + "step": 16755 + }, + { + "epoch": 1.8401054249945092, + "grad_norm": 1.5756040811538696, + "learning_rate": 5e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7219065427780151, + "num_tokens": 433521668.0, + "step": 16756 + }, + { + "epoch": 1.8402152426971228, + "grad_norm": 2.054582118988037, + "learning_rate": 5e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7554003596305847, + "num_tokens": 433542027.0, + "step": 16757 + }, + { + "epoch": 1.8403250603997363, + "grad_norm": 1.9615455865859985, + "learning_rate": 5e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7401962280273438, + "num_tokens": 433565567.0, + "step": 16758 + }, + { + "epoch": 1.84043487810235, + "grad_norm": 1.713492512702942, + "learning_rate": 5e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7423768043518066, + "num_tokens": 433594289.0, + "step": 16759 + }, + { + "epoch": 1.8405446958049638, + "grad_norm": 1.921475887298584, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7539163827896118, + "num_tokens": 433617444.0, + "step": 16760 + }, + { + "epoch": 1.8406545135075776, + "grad_norm": 1.9558148384094238, + "learning_rate": 5e-06, + "loss": 0.895, + "mean_token_accuracy": 0.718795120716095, + "num_tokens": 433645079.0, + "step": 16761 + }, + { + "epoch": 1.8407643312101911, + "grad_norm": 1.8247747421264648, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7269313335418701, + "num_tokens": 433669828.0, + "step": 16762 + }, + { + "epoch": 1.8408741489128047, + "grad_norm": 2.1041035652160645, + "learning_rate": 5e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7342441082000732, + "num_tokens": 433691159.0, + "step": 16763 + }, + { + "epoch": 1.8409839666154184, + "grad_norm": 1.7129212617874146, + "learning_rate": 5e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7562889456748962, + "num_tokens": 433720878.0, + "step": 16764 + }, + { + "epoch": 1.8410937843180322, + "grad_norm": 1.821748971939087, + "learning_rate": 5e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7425014972686768, + "num_tokens": 433744934.0, + "step": 16765 + }, + { + "epoch": 1.8412036020206457, + "grad_norm": 1.8820080757141113, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7402523159980774, + "num_tokens": 433769856.0, + "step": 16766 + }, + { + "epoch": 1.8413134197232592, + "grad_norm": 1.5714884996414185, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7360052466392517, + "num_tokens": 433799299.0, + "step": 16767 + }, + { + "epoch": 1.841423237425873, + "grad_norm": 2.0334532260894775, + "learning_rate": 5e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.735016942024231, + "num_tokens": 433819513.0, + "step": 16768 + }, + { + "epoch": 1.8415330551284868, + "grad_norm": 1.7368971109390259, + "learning_rate": 5e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7463464140892029, + "num_tokens": 433844507.0, + "step": 16769 + }, + { + "epoch": 1.8416428728311005, + "grad_norm": 1.9005591869354248, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7264670133590698, + "num_tokens": 433867997.0, + "step": 16770 + }, + { + "epoch": 1.841752690533714, + "grad_norm": 1.695198893547058, + "learning_rate": 5e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7094146013259888, + "num_tokens": 433898101.0, + "step": 16771 + }, + { + "epoch": 1.8418625082363276, + "grad_norm": 2.051438331604004, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7456583380699158, + "num_tokens": 433918129.0, + "step": 16772 + }, + { + "epoch": 1.8419723259389413, + "grad_norm": 1.4267035722732544, + "learning_rate": 5e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.717437744140625, + "num_tokens": 433959291.0, + "step": 16773 + }, + { + "epoch": 1.842082143641555, + "grad_norm": 1.8888052701950073, + "learning_rate": 5e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7495965957641602, + "num_tokens": 433982717.0, + "step": 16774 + }, + { + "epoch": 1.8421919613441686, + "grad_norm": 1.6220160722732544, + "learning_rate": 5e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7425978183746338, + "num_tokens": 434011582.0, + "step": 16775 + }, + { + "epoch": 1.8423017790467824, + "grad_norm": 1.9308640956878662, + "learning_rate": 5e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7433139085769653, + "num_tokens": 434033487.0, + "step": 16776 + }, + { + "epoch": 1.842411596749396, + "grad_norm": 1.849685549736023, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7185690402984619, + "num_tokens": 434059604.0, + "step": 16777 + }, + { + "epoch": 1.8425214144520097, + "grad_norm": 1.78752601146698, + "learning_rate": 5e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7231484651565552, + "num_tokens": 434086451.0, + "step": 16778 + }, + { + "epoch": 1.8426312321546234, + "grad_norm": 1.765141487121582, + "learning_rate": 5e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7375946640968323, + "num_tokens": 434115876.0, + "step": 16779 + }, + { + "epoch": 1.842741049857237, + "grad_norm": 1.9579834938049316, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7500501871109009, + "num_tokens": 434137908.0, + "step": 16780 + }, + { + "epoch": 1.8428508675598505, + "grad_norm": 1.7889546155929565, + "learning_rate": 5e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7124178409576416, + "num_tokens": 434166756.0, + "step": 16781 + }, + { + "epoch": 1.8429606852624643, + "grad_norm": 1.8434253931045532, + "learning_rate": 5e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7495793700218201, + "num_tokens": 434190823.0, + "step": 16782 + }, + { + "epoch": 1.843070502965078, + "grad_norm": 2.0573296546936035, + "learning_rate": 5e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7489728927612305, + "num_tokens": 434211123.0, + "step": 16783 + }, + { + "epoch": 1.8431803206676918, + "grad_norm": 1.6550517082214355, + "learning_rate": 5e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7114010453224182, + "num_tokens": 434241442.0, + "step": 16784 + }, + { + "epoch": 1.8432901383703053, + "grad_norm": 1.8995420932769775, + "learning_rate": 5e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7419342398643494, + "num_tokens": 434266970.0, + "step": 16785 + }, + { + "epoch": 1.8433999560729188, + "grad_norm": 1.6909511089324951, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7335559129714966, + "num_tokens": 434294789.0, + "step": 16786 + }, + { + "epoch": 1.8435097737755326, + "grad_norm": 1.818561315536499, + "learning_rate": 5e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7424831986427307, + "num_tokens": 434320558.0, + "step": 16787 + }, + { + "epoch": 1.8436195914781464, + "grad_norm": 1.802992582321167, + "learning_rate": 5e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7367224097251892, + "num_tokens": 434346342.0, + "step": 16788 + }, + { + "epoch": 1.84372940918076, + "grad_norm": 1.7488797903060913, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7303310632705688, + "num_tokens": 434372630.0, + "step": 16789 + }, + { + "epoch": 1.8438392268833736, + "grad_norm": 1.75067138671875, + "learning_rate": 5e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7506479024887085, + "num_tokens": 434399100.0, + "step": 16790 + }, + { + "epoch": 1.8439490445859872, + "grad_norm": 2.062100887298584, + "learning_rate": 5e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7422976493835449, + "num_tokens": 434420555.0, + "step": 16791 + }, + { + "epoch": 1.844058862288601, + "grad_norm": 1.8288220167160034, + "learning_rate": 5e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7034393548965454, + "num_tokens": 434447480.0, + "step": 16792 + }, + { + "epoch": 1.8441686799912147, + "grad_norm": 1.8974500894546509, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7277775406837463, + "num_tokens": 434471001.0, + "step": 16793 + }, + { + "epoch": 1.8442784976938282, + "grad_norm": 1.9168192148208618, + "learning_rate": 5e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.713463306427002, + "num_tokens": 434495375.0, + "step": 16794 + }, + { + "epoch": 1.8443883153964418, + "grad_norm": 1.7714284658432007, + "learning_rate": 5e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7558655142784119, + "num_tokens": 434521308.0, + "step": 16795 + }, + { + "epoch": 1.8444981330990555, + "grad_norm": 1.8903381824493408, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7475942969322205, + "num_tokens": 434544937.0, + "step": 16796 + }, + { + "epoch": 1.8446079508016693, + "grad_norm": 1.8141919374465942, + "learning_rate": 5e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7279908061027527, + "num_tokens": 434569337.0, + "step": 16797 + }, + { + "epoch": 1.844717768504283, + "grad_norm": 1.9631704092025757, + "learning_rate": 5e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7452201247215271, + "num_tokens": 434591710.0, + "step": 16798 + }, + { + "epoch": 1.8448275862068966, + "grad_norm": 1.8562309741973877, + "learning_rate": 5e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7270434498786926, + "num_tokens": 434617315.0, + "step": 16799 + }, + { + "epoch": 1.84493740390951, + "grad_norm": 1.9839287996292114, + "learning_rate": 5e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.741478681564331, + "num_tokens": 434639313.0, + "step": 16800 + }, + { + "epoch": 1.8450472216121239, + "grad_norm": 1.8403890132904053, + "learning_rate": 5e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7117799520492554, + "num_tokens": 434664639.0, + "step": 16801 + }, + { + "epoch": 1.8451570393147376, + "grad_norm": 1.659361720085144, + "learning_rate": 5e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7315531373023987, + "num_tokens": 434696495.0, + "step": 16802 + }, + { + "epoch": 1.8452668570173512, + "grad_norm": 1.796766757965088, + "learning_rate": 5e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7225674390792847, + "num_tokens": 434722930.0, + "step": 16803 + }, + { + "epoch": 1.8453766747199647, + "grad_norm": 1.8372207880020142, + "learning_rate": 5e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.719366192817688, + "num_tokens": 434750093.0, + "step": 16804 + }, + { + "epoch": 1.8454864924225785, + "grad_norm": 1.673333764076233, + "learning_rate": 5e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7106175422668457, + "num_tokens": 434780961.0, + "step": 16805 + }, + { + "epoch": 1.8455963101251922, + "grad_norm": 1.8298805952072144, + "learning_rate": 5e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7370772361755371, + "num_tokens": 434805768.0, + "step": 16806 + }, + { + "epoch": 1.845706127827806, + "grad_norm": 2.0865180492401123, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7408284544944763, + "num_tokens": 434824343.0, + "step": 16807 + }, + { + "epoch": 1.8458159455304195, + "grad_norm": 2.003520965576172, + "learning_rate": 5e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7393895387649536, + "num_tokens": 434844877.0, + "step": 16808 + }, + { + "epoch": 1.845925763233033, + "grad_norm": 1.6847889423370361, + "learning_rate": 5e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7689212560653687, + "num_tokens": 434871003.0, + "step": 16809 + }, + { + "epoch": 1.8460355809356468, + "grad_norm": 1.7557038068771362, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7251434326171875, + "num_tokens": 434898959.0, + "step": 16810 + }, + { + "epoch": 1.8461453986382605, + "grad_norm": 1.561529517173767, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.739470899105072, + "num_tokens": 434932405.0, + "step": 16811 + }, + { + "epoch": 1.8462552163408743, + "grad_norm": 1.6481457948684692, + "learning_rate": 5e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7362697720527649, + "num_tokens": 434962584.0, + "step": 16812 + }, + { + "epoch": 1.8463650340434878, + "grad_norm": 1.7957792282104492, + "learning_rate": 5e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7443037033081055, + "num_tokens": 434987679.0, + "step": 16813 + }, + { + "epoch": 1.8464748517461014, + "grad_norm": 1.7266597747802734, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.726996898651123, + "num_tokens": 435016436.0, + "step": 16814 + }, + { + "epoch": 1.8465846694487151, + "grad_norm": 1.9251165390014648, + "learning_rate": 5e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7032719254493713, + "num_tokens": 435041645.0, + "step": 16815 + }, + { + "epoch": 1.846694487151329, + "grad_norm": 1.737294316291809, + "learning_rate": 5e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7244738340377808, + "num_tokens": 435070540.0, + "step": 16816 + }, + { + "epoch": 1.8468043048539424, + "grad_norm": 1.6392343044281006, + "learning_rate": 5e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7108567953109741, + "num_tokens": 435104586.0, + "step": 16817 + }, + { + "epoch": 1.846914122556556, + "grad_norm": 1.9909889698028564, + "learning_rate": 5e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.714879035949707, + "num_tokens": 435128405.0, + "step": 16818 + }, + { + "epoch": 1.8470239402591697, + "grad_norm": 1.6054774522781372, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7254018783569336, + "num_tokens": 435157456.0, + "step": 16819 + }, + { + "epoch": 1.8471337579617835, + "grad_norm": 1.6950759887695312, + "learning_rate": 5e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7233335971832275, + "num_tokens": 435185560.0, + "step": 16820 + }, + { + "epoch": 1.8472435756643972, + "grad_norm": 2.219735622406006, + "learning_rate": 5e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7297332286834717, + "num_tokens": 435204811.0, + "step": 16821 + }, + { + "epoch": 1.8473533933670108, + "grad_norm": 1.9179086685180664, + "learning_rate": 5e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7454712390899658, + "num_tokens": 435226106.0, + "step": 16822 + }, + { + "epoch": 1.8474632110696243, + "grad_norm": 1.618591547012329, + "learning_rate": 5e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7266374826431274, + "num_tokens": 435257669.0, + "step": 16823 + }, + { + "epoch": 1.847573028772238, + "grad_norm": 1.7260798215866089, + "learning_rate": 5e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7391448616981506, + "num_tokens": 435284109.0, + "step": 16824 + }, + { + "epoch": 1.8476828464748518, + "grad_norm": 2.0036048889160156, + "learning_rate": 5e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7054363489151001, + "num_tokens": 435308092.0, + "step": 16825 + }, + { + "epoch": 1.8477926641774656, + "grad_norm": 1.7727237939834595, + "learning_rate": 5e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7473623752593994, + "num_tokens": 435334188.0, + "step": 16826 + }, + { + "epoch": 1.847902481880079, + "grad_norm": 1.7598861455917358, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7328137159347534, + "num_tokens": 435359457.0, + "step": 16827 + }, + { + "epoch": 1.8480122995826926, + "grad_norm": 2.1183347702026367, + "learning_rate": 5e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7439597249031067, + "num_tokens": 435378527.0, + "step": 16828 + }, + { + "epoch": 1.8481221172853064, + "grad_norm": 1.873966097831726, + "learning_rate": 5e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7207081317901611, + "num_tokens": 435402348.0, + "step": 16829 + }, + { + "epoch": 1.8482319349879202, + "grad_norm": 1.6108897924423218, + "learning_rate": 5e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7237355709075928, + "num_tokens": 435434747.0, + "step": 16830 + }, + { + "epoch": 1.8483417526905337, + "grad_norm": 1.8627821207046509, + "learning_rate": 5e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7266098260879517, + "num_tokens": 435458624.0, + "step": 16831 + }, + { + "epoch": 1.8484515703931472, + "grad_norm": 1.8228724002838135, + "learning_rate": 5e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7428337335586548, + "num_tokens": 435482142.0, + "step": 16832 + }, + { + "epoch": 1.848561388095761, + "grad_norm": 1.739885926246643, + "learning_rate": 5e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7333328723907471, + "num_tokens": 435507727.0, + "step": 16833 + }, + { + "epoch": 1.8486712057983747, + "grad_norm": 1.7381945848464966, + "learning_rate": 5e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7180522680282593, + "num_tokens": 435535526.0, + "step": 16834 + }, + { + "epoch": 1.8487810235009885, + "grad_norm": 1.6698640584945679, + "learning_rate": 5e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7263854742050171, + "num_tokens": 435563177.0, + "step": 16835 + }, + { + "epoch": 1.848890841203602, + "grad_norm": 1.748295545578003, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7291914224624634, + "num_tokens": 435588034.0, + "step": 16836 + }, + { + "epoch": 1.8490006589062156, + "grad_norm": 1.8726871013641357, + "learning_rate": 5e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7363734245300293, + "num_tokens": 435611120.0, + "step": 16837 + }, + { + "epoch": 1.8491104766088293, + "grad_norm": 1.985300064086914, + "learning_rate": 5e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7186684608459473, + "num_tokens": 435634821.0, + "step": 16838 + }, + { + "epoch": 1.849220294311443, + "grad_norm": 1.920174241065979, + "learning_rate": 5e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7147823572158813, + "num_tokens": 435658697.0, + "step": 16839 + }, + { + "epoch": 1.8493301120140566, + "grad_norm": 1.6003506183624268, + "learning_rate": 5e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.6986979246139526, + "num_tokens": 435691491.0, + "step": 16840 + }, + { + "epoch": 1.8494399297166704, + "grad_norm": 1.8228230476379395, + "learning_rate": 5e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.725090503692627, + "num_tokens": 435714957.0, + "step": 16841 + }, + { + "epoch": 1.849549747419284, + "grad_norm": 1.7144334316253662, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7298674583435059, + "num_tokens": 435742182.0, + "step": 16842 + }, + { + "epoch": 1.8496595651218977, + "grad_norm": 1.715825080871582, + "learning_rate": 5e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7324371337890625, + "num_tokens": 435768718.0, + "step": 16843 + }, + { + "epoch": 1.8497693828245114, + "grad_norm": 1.8965810537338257, + "learning_rate": 5e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7600489854812622, + "num_tokens": 435791124.0, + "step": 16844 + }, + { + "epoch": 1.849879200527125, + "grad_norm": 1.808377981185913, + "learning_rate": 5e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7184800505638123, + "num_tokens": 435820351.0, + "step": 16845 + }, + { + "epoch": 1.8499890182297385, + "grad_norm": 2.009941339492798, + "learning_rate": 5e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7420473694801331, + "num_tokens": 435843070.0, + "step": 16846 + }, + { + "epoch": 1.8500988359323522, + "grad_norm": 1.6101489067077637, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7344173192977905, + "num_tokens": 435875434.0, + "step": 16847 + }, + { + "epoch": 1.850208653634966, + "grad_norm": 1.81736421585083, + "learning_rate": 5e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.727074146270752, + "num_tokens": 435902472.0, + "step": 16848 + }, + { + "epoch": 1.8503184713375798, + "grad_norm": 1.8199139833450317, + "learning_rate": 5e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7180876731872559, + "num_tokens": 435931119.0, + "step": 16849 + }, + { + "epoch": 1.8504282890401933, + "grad_norm": 1.7656587362289429, + "learning_rate": 5e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7576816082000732, + "num_tokens": 435956251.0, + "step": 16850 + }, + { + "epoch": 1.8505381067428068, + "grad_norm": 1.760362982749939, + "learning_rate": 5e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7181429862976074, + "num_tokens": 435983597.0, + "step": 16851 + }, + { + "epoch": 1.8506479244454206, + "grad_norm": 1.9442627429962158, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7324235439300537, + "num_tokens": 436005320.0, + "step": 16852 + }, + { + "epoch": 1.8507577421480343, + "grad_norm": 1.8477905988693237, + "learning_rate": 5e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7286567091941833, + "num_tokens": 436029103.0, + "step": 16853 + }, + { + "epoch": 1.8508675598506479, + "grad_norm": 1.606367588043213, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7391331791877747, + "num_tokens": 436056841.0, + "step": 16854 + }, + { + "epoch": 1.8509773775532616, + "grad_norm": 1.9268836975097656, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7295141816139221, + "num_tokens": 436079322.0, + "step": 16855 + }, + { + "epoch": 1.8510871952558752, + "grad_norm": 1.7510936260223389, + "learning_rate": 5e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.704896092414856, + "num_tokens": 436109126.0, + "step": 16856 + }, + { + "epoch": 1.851197012958489, + "grad_norm": 1.7108527421951294, + "learning_rate": 5e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7214401960372925, + "num_tokens": 436138130.0, + "step": 16857 + }, + { + "epoch": 1.8513068306611027, + "grad_norm": 1.7758985757827759, + "learning_rate": 5e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7405633926391602, + "num_tokens": 436165344.0, + "step": 16858 + }, + { + "epoch": 1.8514166483637162, + "grad_norm": 1.8543779850006104, + "learning_rate": 5e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7352911829948425, + "num_tokens": 436190477.0, + "step": 16859 + }, + { + "epoch": 1.8515264660663298, + "grad_norm": 1.657090187072754, + "learning_rate": 5e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7430269718170166, + "num_tokens": 436218491.0, + "step": 16860 + }, + { + "epoch": 1.8516362837689435, + "grad_norm": 1.7567987442016602, + "learning_rate": 5e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.712114155292511, + "num_tokens": 436246170.0, + "step": 16861 + }, + { + "epoch": 1.8517461014715573, + "grad_norm": 1.829278826713562, + "learning_rate": 5e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7332492470741272, + "num_tokens": 436272163.0, + "step": 16862 + }, + { + "epoch": 1.851855919174171, + "grad_norm": 1.8176316022872925, + "learning_rate": 5e-06, + "loss": 0.7974, + "mean_token_accuracy": 0.7481411695480347, + "num_tokens": 436299519.0, + "step": 16863 + }, + { + "epoch": 1.8519657368767846, + "grad_norm": 1.6021296977996826, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.724826991558075, + "num_tokens": 436331504.0, + "step": 16864 + }, + { + "epoch": 1.852075554579398, + "grad_norm": 1.757634162902832, + "learning_rate": 5e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7202485799789429, + "num_tokens": 436359386.0, + "step": 16865 + }, + { + "epoch": 1.8521853722820119, + "grad_norm": 1.7842018604278564, + "learning_rate": 5e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7373538017272949, + "num_tokens": 436387618.0, + "step": 16866 + }, + { + "epoch": 1.8522951899846256, + "grad_norm": 1.856319546699524, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7505338788032532, + "num_tokens": 436409220.0, + "step": 16867 + }, + { + "epoch": 1.8524050076872391, + "grad_norm": 1.9535365104675293, + "learning_rate": 5e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7227256894111633, + "num_tokens": 436434394.0, + "step": 16868 + }, + { + "epoch": 1.8525148253898527, + "grad_norm": 1.7119646072387695, + "learning_rate": 5e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7380719184875488, + "num_tokens": 436463243.0, + "step": 16869 + }, + { + "epoch": 1.8526246430924664, + "grad_norm": 1.7425391674041748, + "learning_rate": 5e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7027802467346191, + "num_tokens": 436492485.0, + "step": 16870 + }, + { + "epoch": 1.8527344607950802, + "grad_norm": 1.690749168395996, + "learning_rate": 5e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7470041513442993, + "num_tokens": 436522698.0, + "step": 16871 + }, + { + "epoch": 1.852844278497694, + "grad_norm": 1.681548833847046, + "learning_rate": 5e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7026946544647217, + "num_tokens": 436554920.0, + "step": 16872 + }, + { + "epoch": 1.8529540962003075, + "grad_norm": 1.7740020751953125, + "learning_rate": 5e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7376455664634705, + "num_tokens": 436579753.0, + "step": 16873 + }, + { + "epoch": 1.853063913902921, + "grad_norm": 1.8907955884933472, + "learning_rate": 5e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7183164358139038, + "num_tokens": 436605050.0, + "step": 16874 + }, + { + "epoch": 1.8531737316055348, + "grad_norm": 1.9268146753311157, + "learning_rate": 5e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7445653676986694, + "num_tokens": 436626745.0, + "step": 16875 + }, + { + "epoch": 1.8532835493081485, + "grad_norm": 1.8358404636383057, + "learning_rate": 5e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7136397361755371, + "num_tokens": 436654781.0, + "step": 16876 + }, + { + "epoch": 1.8533933670107623, + "grad_norm": 2.118744373321533, + "learning_rate": 5e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7219986915588379, + "num_tokens": 436674640.0, + "step": 16877 + }, + { + "epoch": 1.8535031847133758, + "grad_norm": 1.8726869821548462, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7539912462234497, + "num_tokens": 436700242.0, + "step": 16878 + }, + { + "epoch": 1.8536130024159894, + "grad_norm": 1.7163153886795044, + "learning_rate": 5e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7262628078460693, + "num_tokens": 436729418.0, + "step": 16879 + }, + { + "epoch": 1.8537228201186031, + "grad_norm": 1.9399434328079224, + "learning_rate": 5e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7121384143829346, + "num_tokens": 436754490.0, + "step": 16880 + }, + { + "epoch": 1.8538326378212169, + "grad_norm": 1.9967540502548218, + "learning_rate": 5e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7235096096992493, + "num_tokens": 436779424.0, + "step": 16881 + }, + { + "epoch": 1.8539424555238304, + "grad_norm": 1.875784158706665, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7487025856971741, + "num_tokens": 436801929.0, + "step": 16882 + }, + { + "epoch": 1.854052273226444, + "grad_norm": 1.7390222549438477, + "learning_rate": 5e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7211329936981201, + "num_tokens": 436830575.0, + "step": 16883 + }, + { + "epoch": 1.8541620909290577, + "grad_norm": 1.8260291814804077, + "learning_rate": 5e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7307249307632446, + "num_tokens": 436858313.0, + "step": 16884 + }, + { + "epoch": 1.8542719086316715, + "grad_norm": 1.6470075845718384, + "learning_rate": 5e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7251749634742737, + "num_tokens": 436888680.0, + "step": 16885 + }, + { + "epoch": 1.8543817263342852, + "grad_norm": 1.8800911903381348, + "learning_rate": 5e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7231918573379517, + "num_tokens": 436914425.0, + "step": 16886 + }, + { + "epoch": 1.8544915440368988, + "grad_norm": 1.9635283946990967, + "learning_rate": 5e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.762162983417511, + "num_tokens": 436935169.0, + "step": 16887 + }, + { + "epoch": 1.8546013617395123, + "grad_norm": 1.8460718393325806, + "learning_rate": 5e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7255761623382568, + "num_tokens": 436961055.0, + "step": 16888 + }, + { + "epoch": 1.854711179442126, + "grad_norm": 1.7256522178649902, + "learning_rate": 5e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7280274629592896, + "num_tokens": 436986752.0, + "step": 16889 + }, + { + "epoch": 1.8548209971447398, + "grad_norm": 1.7020957469940186, + "learning_rate": 5e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7276363372802734, + "num_tokens": 437013906.0, + "step": 16890 + }, + { + "epoch": 1.8549308148473536, + "grad_norm": 1.8246792554855347, + "learning_rate": 5e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7252264022827148, + "num_tokens": 437036774.0, + "step": 16891 + }, + { + "epoch": 1.855040632549967, + "grad_norm": 2.047360897064209, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.728979766368866, + "num_tokens": 437056787.0, + "step": 16892 + }, + { + "epoch": 1.8551504502525806, + "grad_norm": 1.7923296689987183, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7334184646606445, + "num_tokens": 437080190.0, + "step": 16893 + }, + { + "epoch": 1.8552602679551944, + "grad_norm": 1.7354772090911865, + "learning_rate": 5e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7306270599365234, + "num_tokens": 437108915.0, + "step": 16894 + }, + { + "epoch": 1.8553700856578081, + "grad_norm": 1.7982124090194702, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7217158079147339, + "num_tokens": 437134686.0, + "step": 16895 + }, + { + "epoch": 1.8554799033604217, + "grad_norm": 1.991551160812378, + "learning_rate": 5e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7399552464485168, + "num_tokens": 437156052.0, + "step": 16896 + }, + { + "epoch": 1.8555897210630352, + "grad_norm": 1.662706971168518, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7246769666671753, + "num_tokens": 437183219.0, + "step": 16897 + }, + { + "epoch": 1.855699538765649, + "grad_norm": 2.1148290634155273, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7212544083595276, + "num_tokens": 437204518.0, + "step": 16898 + }, + { + "epoch": 1.8558093564682627, + "grad_norm": 1.3906002044677734, + "learning_rate": 5e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7246794104576111, + "num_tokens": 437242582.0, + "step": 16899 + }, + { + "epoch": 1.8559191741708765, + "grad_norm": 2.0186479091644287, + "learning_rate": 5e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7325019240379333, + "num_tokens": 437263858.0, + "step": 16900 + }, + { + "epoch": 1.85602899187349, + "grad_norm": 1.6367024183273315, + "learning_rate": 5e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7326674461364746, + "num_tokens": 437293762.0, + "step": 16901 + }, + { + "epoch": 1.8561388095761036, + "grad_norm": 1.712123155593872, + "learning_rate": 5e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7151921391487122, + "num_tokens": 437321528.0, + "step": 16902 + }, + { + "epoch": 1.8562486272787173, + "grad_norm": 1.6937530040740967, + "learning_rate": 5e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7317548990249634, + "num_tokens": 437348821.0, + "step": 16903 + }, + { + "epoch": 1.856358444981331, + "grad_norm": 1.7175837755203247, + "learning_rate": 5e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7518632411956787, + "num_tokens": 437374769.0, + "step": 16904 + }, + { + "epoch": 1.8564682626839446, + "grad_norm": 1.769768238067627, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.743913471698761, + "num_tokens": 437400299.0, + "step": 16905 + }, + { + "epoch": 1.8565780803865584, + "grad_norm": 1.8508121967315674, + "learning_rate": 5e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7540692687034607, + "num_tokens": 437424358.0, + "step": 16906 + }, + { + "epoch": 1.856687898089172, + "grad_norm": 1.7357189655303955, + "learning_rate": 5e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7406714558601379, + "num_tokens": 437449099.0, + "step": 16907 + }, + { + "epoch": 1.8567977157917857, + "grad_norm": 1.642883062362671, + "learning_rate": 5e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7501280903816223, + "num_tokens": 437477626.0, + "step": 16908 + }, + { + "epoch": 1.8569075334943994, + "grad_norm": 1.9961479902267456, + "learning_rate": 5e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7232011556625366, + "num_tokens": 437498086.0, + "step": 16909 + }, + { + "epoch": 1.857017351197013, + "grad_norm": 1.7311335802078247, + "learning_rate": 5e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.7481870651245117, + "num_tokens": 437525289.0, + "step": 16910 + }, + { + "epoch": 1.8571271688996265, + "grad_norm": 1.8708699941635132, + "learning_rate": 5e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7193819284439087, + "num_tokens": 437550760.0, + "step": 16911 + }, + { + "epoch": 1.8572369866022402, + "grad_norm": 1.8750616312026978, + "learning_rate": 5e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7739485502243042, + "num_tokens": 437571196.0, + "step": 16912 + }, + { + "epoch": 1.857346804304854, + "grad_norm": 1.8849619626998901, + "learning_rate": 5e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7333303689956665, + "num_tokens": 437594855.0, + "step": 16913 + }, + { + "epoch": 1.8574566220074678, + "grad_norm": 1.8613333702087402, + "learning_rate": 5e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7312890291213989, + "num_tokens": 437620298.0, + "step": 16914 + }, + { + "epoch": 1.8575664397100813, + "grad_norm": 1.7389295101165771, + "learning_rate": 5e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7174847722053528, + "num_tokens": 437647871.0, + "step": 16915 + }, + { + "epoch": 1.8576762574126948, + "grad_norm": 1.89152193069458, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7655822038650513, + "num_tokens": 437669426.0, + "step": 16916 + }, + { + "epoch": 1.8577860751153086, + "grad_norm": 1.9944242238998413, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7230153679847717, + "num_tokens": 437692737.0, + "step": 16917 + }, + { + "epoch": 1.8578958928179223, + "grad_norm": 1.6999967098236084, + "learning_rate": 5e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7316389083862305, + "num_tokens": 437719331.0, + "step": 16918 + }, + { + "epoch": 1.8580057105205359, + "grad_norm": 1.9118194580078125, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7396215200424194, + "num_tokens": 437740949.0, + "step": 16919 + }, + { + "epoch": 1.8581155282231496, + "grad_norm": 1.757678508758545, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7215876579284668, + "num_tokens": 437768746.0, + "step": 16920 + }, + { + "epoch": 1.8582253459257632, + "grad_norm": 1.6027698516845703, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7542829513549805, + "num_tokens": 437796211.0, + "step": 16921 + }, + { + "epoch": 1.858335163628377, + "grad_norm": 1.8075658082962036, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7325354814529419, + "num_tokens": 437819388.0, + "step": 16922 + }, + { + "epoch": 1.8584449813309907, + "grad_norm": 1.6676595211029053, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7152138948440552, + "num_tokens": 437846965.0, + "step": 16923 + }, + { + "epoch": 1.8585547990336042, + "grad_norm": 1.713974118232727, + "learning_rate": 5e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7295613884925842, + "num_tokens": 437872802.0, + "step": 16924 + }, + { + "epoch": 1.8586646167362177, + "grad_norm": 1.6263257265090942, + "learning_rate": 5e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7368131875991821, + "num_tokens": 437903519.0, + "step": 16925 + }, + { + "epoch": 1.8587744344388315, + "grad_norm": 1.8471384048461914, + "learning_rate": 5e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7276941537857056, + "num_tokens": 437927653.0, + "step": 16926 + }, + { + "epoch": 1.8588842521414453, + "grad_norm": 1.8756422996520996, + "learning_rate": 5e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7332903742790222, + "num_tokens": 437951695.0, + "step": 16927 + }, + { + "epoch": 1.858994069844059, + "grad_norm": 2.037083387374878, + "learning_rate": 5e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7247776389122009, + "num_tokens": 437976754.0, + "step": 16928 + }, + { + "epoch": 1.8591038875466726, + "grad_norm": 1.6777045726776123, + "learning_rate": 5e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7127796411514282, + "num_tokens": 438005381.0, + "step": 16929 + }, + { + "epoch": 1.859213705249286, + "grad_norm": 1.7109726667404175, + "learning_rate": 5e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7058466672897339, + "num_tokens": 438033551.0, + "step": 16930 + }, + { + "epoch": 1.8593235229518998, + "grad_norm": 1.7208919525146484, + "learning_rate": 5e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7398208975791931, + "num_tokens": 438060476.0, + "step": 16931 + }, + { + "epoch": 1.8594333406545136, + "grad_norm": 1.6233799457550049, + "learning_rate": 5e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7515196800231934, + "num_tokens": 438087855.0, + "step": 16932 + }, + { + "epoch": 1.8595431583571271, + "grad_norm": 1.8364001512527466, + "learning_rate": 5e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7348043322563171, + "num_tokens": 438112869.0, + "step": 16933 + }, + { + "epoch": 1.8596529760597407, + "grad_norm": 1.6060203313827515, + "learning_rate": 5e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7565404176712036, + "num_tokens": 438140727.0, + "step": 16934 + }, + { + "epoch": 1.8597627937623544, + "grad_norm": 1.9086641073226929, + "learning_rate": 5e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.74965500831604, + "num_tokens": 438162105.0, + "step": 16935 + }, + { + "epoch": 1.8598726114649682, + "grad_norm": 1.7010104656219482, + "learning_rate": 5e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7250161170959473, + "num_tokens": 438191157.0, + "step": 16936 + }, + { + "epoch": 1.859982429167582, + "grad_norm": 1.8860723972320557, + "learning_rate": 5e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7273838520050049, + "num_tokens": 438214368.0, + "step": 16937 + }, + { + "epoch": 1.8600922468701955, + "grad_norm": 2.058361291885376, + "learning_rate": 5e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7261803150177002, + "num_tokens": 438236422.0, + "step": 16938 + }, + { + "epoch": 1.860202064572809, + "grad_norm": 1.5975852012634277, + "learning_rate": 5e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7321265935897827, + "num_tokens": 438267430.0, + "step": 16939 + }, + { + "epoch": 1.8603118822754228, + "grad_norm": 1.929997444152832, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7315048575401306, + "num_tokens": 438289928.0, + "step": 16940 + }, + { + "epoch": 1.8604216999780365, + "grad_norm": 1.665919542312622, + "learning_rate": 5e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7265924215316772, + "num_tokens": 438318833.0, + "step": 16941 + }, + { + "epoch": 1.8605315176806503, + "grad_norm": 1.9958295822143555, + "learning_rate": 5e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7239463925361633, + "num_tokens": 438343227.0, + "step": 16942 + }, + { + "epoch": 1.8606413353832638, + "grad_norm": 1.735657811164856, + "learning_rate": 5e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7406314611434937, + "num_tokens": 438369358.0, + "step": 16943 + }, + { + "epoch": 1.8607511530858774, + "grad_norm": 1.7926652431488037, + "learning_rate": 5e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7267577052116394, + "num_tokens": 438393054.0, + "step": 16944 + }, + { + "epoch": 1.860860970788491, + "grad_norm": 1.9192094802856445, + "learning_rate": 5e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7373077869415283, + "num_tokens": 438415371.0, + "step": 16945 + }, + { + "epoch": 1.8609707884911049, + "grad_norm": 1.8050481081008911, + "learning_rate": 5e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7309110164642334, + "num_tokens": 438440800.0, + "step": 16946 + }, + { + "epoch": 1.8610806061937184, + "grad_norm": 1.7897192239761353, + "learning_rate": 5e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7149702310562134, + "num_tokens": 438468873.0, + "step": 16947 + }, + { + "epoch": 1.861190423896332, + "grad_norm": 1.8345611095428467, + "learning_rate": 5e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7363563776016235, + "num_tokens": 438493731.0, + "step": 16948 + }, + { + "epoch": 1.8613002415989457, + "grad_norm": 1.8393491506576538, + "learning_rate": 5e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7426227331161499, + "num_tokens": 438519066.0, + "step": 16949 + }, + { + "epoch": 1.8614100593015594, + "grad_norm": 1.6620514392852783, + "learning_rate": 5e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7397300004959106, + "num_tokens": 438548406.0, + "step": 16950 + }, + { + "epoch": 1.8615198770041732, + "grad_norm": 2.2793221473693848, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7550767660140991, + "num_tokens": 438564474.0, + "step": 16951 + }, + { + "epoch": 1.8616296947067867, + "grad_norm": 1.8608577251434326, + "learning_rate": 5e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7187378406524658, + "num_tokens": 438589370.0, + "step": 16952 + }, + { + "epoch": 1.8617395124094003, + "grad_norm": 1.743562936782837, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7309524416923523, + "num_tokens": 438616678.0, + "step": 16953 + }, + { + "epoch": 1.861849330112014, + "grad_norm": 1.6982003450393677, + "learning_rate": 5e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7140897512435913, + "num_tokens": 438645531.0, + "step": 16954 + }, + { + "epoch": 1.8619591478146278, + "grad_norm": 1.6382038593292236, + "learning_rate": 5e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6820027828216553, + "num_tokens": 438676726.0, + "step": 16955 + }, + { + "epoch": 1.8620689655172413, + "grad_norm": 1.8962318897247314, + "learning_rate": 5e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7681846022605896, + "num_tokens": 438698685.0, + "step": 16956 + }, + { + "epoch": 1.862178783219855, + "grad_norm": 1.5089948177337646, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7280049324035645, + "num_tokens": 438735511.0, + "step": 16957 + }, + { + "epoch": 1.8622886009224686, + "grad_norm": 1.5634726285934448, + "learning_rate": 5e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7184838652610779, + "num_tokens": 438769739.0, + "step": 16958 + }, + { + "epoch": 1.8623984186250824, + "grad_norm": 1.9282983541488647, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7557780146598816, + "num_tokens": 438791671.0, + "step": 16959 + }, + { + "epoch": 1.8625082363276961, + "grad_norm": 1.8909462690353394, + "learning_rate": 5e-06, + "loss": 0.734, + "mean_token_accuracy": 0.769318699836731, + "num_tokens": 438812932.0, + "step": 16960 + }, + { + "epoch": 1.8626180540303097, + "grad_norm": 1.6522133350372314, + "learning_rate": 5e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7367627620697021, + "num_tokens": 438841317.0, + "step": 16961 + }, + { + "epoch": 1.8627278717329232, + "grad_norm": 1.661399483680725, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7288132309913635, + "num_tokens": 438871398.0, + "step": 16962 + }, + { + "epoch": 1.862837689435537, + "grad_norm": 1.8410049676895142, + "learning_rate": 5e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7252079844474792, + "num_tokens": 438896105.0, + "step": 16963 + }, + { + "epoch": 1.8629475071381507, + "grad_norm": 2.1389105319976807, + "learning_rate": 5e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.7349179983139038, + "num_tokens": 438916539.0, + "step": 16964 + }, + { + "epoch": 1.8630573248407645, + "grad_norm": 1.8174562454223633, + "learning_rate": 5e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7158173322677612, + "num_tokens": 438943632.0, + "step": 16965 + }, + { + "epoch": 1.863167142543378, + "grad_norm": 1.8196548223495483, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7372243404388428, + "num_tokens": 438969819.0, + "step": 16966 + }, + { + "epoch": 1.8632769602459915, + "grad_norm": 1.7492663860321045, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7372177839279175, + "num_tokens": 438997119.0, + "step": 16967 + }, + { + "epoch": 1.8633867779486053, + "grad_norm": 2.024263858795166, + "learning_rate": 5e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7339798212051392, + "num_tokens": 439020525.0, + "step": 16968 + }, + { + "epoch": 1.863496595651219, + "grad_norm": 1.699048638343811, + "learning_rate": 5e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7086629271507263, + "num_tokens": 439052138.0, + "step": 16969 + }, + { + "epoch": 1.8636064133538326, + "grad_norm": 1.6474789381027222, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7413747906684875, + "num_tokens": 439081779.0, + "step": 16970 + }, + { + "epoch": 1.8637162310564463, + "grad_norm": 1.761579990386963, + "learning_rate": 5e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.6987058520317078, + "num_tokens": 439109186.0, + "step": 16971 + }, + { + "epoch": 1.8638260487590599, + "grad_norm": 1.7237472534179688, + "learning_rate": 5e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.717682957649231, + "num_tokens": 439138389.0, + "step": 16972 + }, + { + "epoch": 1.8639358664616736, + "grad_norm": 1.8691927194595337, + "learning_rate": 5e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7382038235664368, + "num_tokens": 439164349.0, + "step": 16973 + }, + { + "epoch": 1.8640456841642874, + "grad_norm": 1.9253947734832764, + "learning_rate": 5e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7418817281723022, + "num_tokens": 439187695.0, + "step": 16974 + }, + { + "epoch": 1.864155501866901, + "grad_norm": 1.7422312498092651, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.748290479183197, + "num_tokens": 439212611.0, + "step": 16975 + }, + { + "epoch": 1.8642653195695145, + "grad_norm": 1.7649906873703003, + "learning_rate": 5e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7391129732131958, + "num_tokens": 439239137.0, + "step": 16976 + }, + { + "epoch": 1.8643751372721282, + "grad_norm": 1.7166463136672974, + "learning_rate": 5e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7083932161331177, + "num_tokens": 439268110.0, + "step": 16977 + }, + { + "epoch": 1.864484954974742, + "grad_norm": 1.9808624982833862, + "learning_rate": 5e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7269517183303833, + "num_tokens": 439288702.0, + "step": 16978 + }, + { + "epoch": 1.8645947726773557, + "grad_norm": 1.6920714378356934, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7309824824333191, + "num_tokens": 439315492.0, + "step": 16979 + }, + { + "epoch": 1.8647045903799693, + "grad_norm": 1.8909658193588257, + "learning_rate": 5e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7485221028327942, + "num_tokens": 439335838.0, + "step": 16980 + }, + { + "epoch": 1.8648144080825828, + "grad_norm": 1.9818768501281738, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7702768445014954, + "num_tokens": 439357114.0, + "step": 16981 + }, + { + "epoch": 1.8649242257851966, + "grad_norm": 1.6177583932876587, + "learning_rate": 5e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7492849230766296, + "num_tokens": 439387294.0, + "step": 16982 + }, + { + "epoch": 1.8650340434878103, + "grad_norm": 1.6505017280578613, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7289361953735352, + "num_tokens": 439417754.0, + "step": 16983 + }, + { + "epoch": 1.8651438611904239, + "grad_norm": 2.2092955112457275, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.743376612663269, + "num_tokens": 439437268.0, + "step": 16984 + }, + { + "epoch": 1.8652536788930374, + "grad_norm": 1.7943840026855469, + "learning_rate": 5e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.7377587556838989, + "num_tokens": 439463692.0, + "step": 16985 + }, + { + "epoch": 1.8653634965956511, + "grad_norm": 1.5798358917236328, + "learning_rate": 5e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7269440293312073, + "num_tokens": 439496604.0, + "step": 16986 + }, + { + "epoch": 1.865473314298265, + "grad_norm": 1.5750750303268433, + "learning_rate": 5e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.7723779678344727, + "num_tokens": 439523536.0, + "step": 16987 + }, + { + "epoch": 1.8655831320008787, + "grad_norm": 1.8450124263763428, + "learning_rate": 5e-06, + "loss": 0.946, + "mean_token_accuracy": 0.700826108455658, + "num_tokens": 439549766.0, + "step": 16988 + }, + { + "epoch": 1.8656929497034922, + "grad_norm": 1.803228735923767, + "learning_rate": 5e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7479011416435242, + "num_tokens": 439576030.0, + "step": 16989 + }, + { + "epoch": 1.8658027674061057, + "grad_norm": 1.5907782316207886, + "learning_rate": 5e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7191259860992432, + "num_tokens": 439608877.0, + "step": 16990 + }, + { + "epoch": 1.8659125851087195, + "grad_norm": 1.7235188484191895, + "learning_rate": 5e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7130993604660034, + "num_tokens": 439635924.0, + "step": 16991 + }, + { + "epoch": 1.8660224028113332, + "grad_norm": 1.8031737804412842, + "learning_rate": 5e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7097763419151306, + "num_tokens": 439663453.0, + "step": 16992 + }, + { + "epoch": 1.866132220513947, + "grad_norm": 1.6986886262893677, + "learning_rate": 5e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7371257543563843, + "num_tokens": 439691912.0, + "step": 16993 + }, + { + "epoch": 1.8662420382165605, + "grad_norm": 1.862870216369629, + "learning_rate": 5e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7139948606491089, + "num_tokens": 439717167.0, + "step": 16994 + }, + { + "epoch": 1.866351855919174, + "grad_norm": 1.9502646923065186, + "learning_rate": 5e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7367139458656311, + "num_tokens": 439738899.0, + "step": 16995 + }, + { + "epoch": 1.8664616736217878, + "grad_norm": 1.762269377708435, + "learning_rate": 5e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7467024326324463, + "num_tokens": 439762282.0, + "step": 16996 + }, + { + "epoch": 1.8665714913244016, + "grad_norm": 1.6393249034881592, + "learning_rate": 5e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7440604567527771, + "num_tokens": 439790672.0, + "step": 16997 + }, + { + "epoch": 1.8666813090270151, + "grad_norm": 1.8268482685089111, + "learning_rate": 5e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7235679626464844, + "num_tokens": 439817974.0, + "step": 16998 + }, + { + "epoch": 1.8667911267296287, + "grad_norm": 1.6542881727218628, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7347987294197083, + "num_tokens": 439847258.0, + "step": 16999 + }, + { + "epoch": 1.8669009444322424, + "grad_norm": 2.053576946258545, + "learning_rate": 5e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7253168225288391, + "num_tokens": 439870260.0, + "step": 17000 + }, + { + "epoch": 1.8670107621348562, + "grad_norm": 2.006870746612549, + "learning_rate": 5e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7384239435195923, + "num_tokens": 439890184.0, + "step": 17001 + }, + { + "epoch": 1.86712057983747, + "grad_norm": 2.0453178882598877, + "learning_rate": 5e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7606053352355957, + "num_tokens": 439910453.0, + "step": 17002 + }, + { + "epoch": 1.8672303975400835, + "grad_norm": 1.8067774772644043, + "learning_rate": 5e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7474403381347656, + "num_tokens": 439934541.0, + "step": 17003 + }, + { + "epoch": 1.867340215242697, + "grad_norm": 1.6538634300231934, + "learning_rate": 5e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7327414751052856, + "num_tokens": 439963120.0, + "step": 17004 + }, + { + "epoch": 1.8674500329453108, + "grad_norm": 1.830112338066101, + "learning_rate": 5e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7290798425674438, + "num_tokens": 439986106.0, + "step": 17005 + }, + { + "epoch": 1.8675598506479245, + "grad_norm": 1.8572190999984741, + "learning_rate": 5e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7353357076644897, + "num_tokens": 440011346.0, + "step": 17006 + }, + { + "epoch": 1.8676696683505383, + "grad_norm": 1.5521901845932007, + "learning_rate": 5e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.727091372013092, + "num_tokens": 440043272.0, + "step": 17007 + }, + { + "epoch": 1.8677794860531518, + "grad_norm": 1.7497406005859375, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7217273712158203, + "num_tokens": 440071431.0, + "step": 17008 + }, + { + "epoch": 1.8678893037557653, + "grad_norm": 1.871473789215088, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7290993332862854, + "num_tokens": 440095578.0, + "step": 17009 + }, + { + "epoch": 1.867999121458379, + "grad_norm": 1.7647854089736938, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7200145721435547, + "num_tokens": 440123715.0, + "step": 17010 + }, + { + "epoch": 1.8681089391609929, + "grad_norm": 1.6043256521224976, + "learning_rate": 5e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7014039158821106, + "num_tokens": 440157162.0, + "step": 17011 + }, + { + "epoch": 1.8682187568636064, + "grad_norm": 1.771056056022644, + "learning_rate": 5e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7611414790153503, + "num_tokens": 440181054.0, + "step": 17012 + }, + { + "epoch": 1.86832857456622, + "grad_norm": 1.7827844619750977, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7343825101852417, + "num_tokens": 440206197.0, + "step": 17013 + }, + { + "epoch": 1.8684383922688337, + "grad_norm": 1.6619435548782349, + "learning_rate": 5e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7335649728775024, + "num_tokens": 440234912.0, + "step": 17014 + }, + { + "epoch": 1.8685482099714474, + "grad_norm": 1.6087721586227417, + "learning_rate": 5e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7256438732147217, + "num_tokens": 440268140.0, + "step": 17015 + }, + { + "epoch": 1.8686580276740612, + "grad_norm": 2.3694067001342773, + "learning_rate": 5e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7549896240234375, + "num_tokens": 440284851.0, + "step": 17016 + }, + { + "epoch": 1.8687678453766747, + "grad_norm": 1.78484046459198, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7226758003234863, + "num_tokens": 440313373.0, + "step": 17017 + }, + { + "epoch": 1.8688776630792883, + "grad_norm": 1.837563395500183, + "learning_rate": 5e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7279918193817139, + "num_tokens": 440336158.0, + "step": 17018 + }, + { + "epoch": 1.868987480781902, + "grad_norm": 2.0159778594970703, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7324898838996887, + "num_tokens": 440356184.0, + "step": 17019 + }, + { + "epoch": 1.8690972984845158, + "grad_norm": 1.68937087059021, + "learning_rate": 5e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7248486876487732, + "num_tokens": 440386685.0, + "step": 17020 + }, + { + "epoch": 1.8692071161871293, + "grad_norm": 1.8535929918289185, + "learning_rate": 5e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7352921962738037, + "num_tokens": 440412936.0, + "step": 17021 + }, + { + "epoch": 1.869316933889743, + "grad_norm": 1.760574460029602, + "learning_rate": 5e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7267991900444031, + "num_tokens": 440442413.0, + "step": 17022 + }, + { + "epoch": 1.8694267515923566, + "grad_norm": 1.8824973106384277, + "learning_rate": 5e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7263286709785461, + "num_tokens": 440466104.0, + "step": 17023 + }, + { + "epoch": 1.8695365692949704, + "grad_norm": 1.9520829916000366, + "learning_rate": 5e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7324864268302917, + "num_tokens": 440487859.0, + "step": 17024 + }, + { + "epoch": 1.8696463869975841, + "grad_norm": 1.8961029052734375, + "learning_rate": 5e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7541409134864807, + "num_tokens": 440511623.0, + "step": 17025 + }, + { + "epoch": 1.8697562047001977, + "grad_norm": 1.6931487321853638, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7343206405639648, + "num_tokens": 440539873.0, + "step": 17026 + }, + { + "epoch": 1.8698660224028112, + "grad_norm": 1.8364241123199463, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7219089269638062, + "num_tokens": 440565233.0, + "step": 17027 + }, + { + "epoch": 1.869975840105425, + "grad_norm": 1.6420879364013672, + "learning_rate": 5e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7332496643066406, + "num_tokens": 440597331.0, + "step": 17028 + }, + { + "epoch": 1.8700856578080387, + "grad_norm": 1.8440446853637695, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7426232695579529, + "num_tokens": 440620953.0, + "step": 17029 + }, + { + "epoch": 1.8701954755106525, + "grad_norm": 1.7551624774932861, + "learning_rate": 5e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7197569608688354, + "num_tokens": 440649312.0, + "step": 17030 + }, + { + "epoch": 1.870305293213266, + "grad_norm": 1.8641555309295654, + "learning_rate": 5e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7337567210197449, + "num_tokens": 440674800.0, + "step": 17031 + }, + { + "epoch": 1.8704151109158795, + "grad_norm": 1.8329395055770874, + "learning_rate": 5e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.743865430355072, + "num_tokens": 440698306.0, + "step": 17032 + }, + { + "epoch": 1.8705249286184933, + "grad_norm": 1.6195242404937744, + "learning_rate": 5e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7084642648696899, + "num_tokens": 440731415.0, + "step": 17033 + }, + { + "epoch": 1.870634746321107, + "grad_norm": 1.7280197143554688, + "learning_rate": 5e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7162387371063232, + "num_tokens": 440760591.0, + "step": 17034 + }, + { + "epoch": 1.8707445640237206, + "grad_norm": 1.8065882921218872, + "learning_rate": 5e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7426047325134277, + "num_tokens": 440787042.0, + "step": 17035 + }, + { + "epoch": 1.8708543817263343, + "grad_norm": 1.6242128610610962, + "learning_rate": 5e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7369144558906555, + "num_tokens": 440816618.0, + "step": 17036 + }, + { + "epoch": 1.8709641994289479, + "grad_norm": 1.5527297258377075, + "learning_rate": 5e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7036241292953491, + "num_tokens": 440852564.0, + "step": 17037 + }, + { + "epoch": 1.8710740171315616, + "grad_norm": 1.6328321695327759, + "learning_rate": 5e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7261154651641846, + "num_tokens": 440883719.0, + "step": 17038 + }, + { + "epoch": 1.8711838348341754, + "grad_norm": 1.782771110534668, + "learning_rate": 5e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7229735851287842, + "num_tokens": 440910304.0, + "step": 17039 + }, + { + "epoch": 1.871293652536789, + "grad_norm": 1.7424163818359375, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7352504134178162, + "num_tokens": 440936594.0, + "step": 17040 + }, + { + "epoch": 1.8714034702394025, + "grad_norm": 1.607475996017456, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7133310437202454, + "num_tokens": 440965985.0, + "step": 17041 + }, + { + "epoch": 1.8715132879420162, + "grad_norm": 2.0782957077026367, + "learning_rate": 5e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.7695063948631287, + "num_tokens": 440984037.0, + "step": 17042 + }, + { + "epoch": 1.87162310564463, + "grad_norm": 2.0069456100463867, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7354666590690613, + "num_tokens": 441005202.0, + "step": 17043 + }, + { + "epoch": 1.8717329233472437, + "grad_norm": 1.881855845451355, + "learning_rate": 5e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7309422492980957, + "num_tokens": 441028024.0, + "step": 17044 + }, + { + "epoch": 1.8718427410498573, + "grad_norm": 1.85448157787323, + "learning_rate": 5e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7168154716491699, + "num_tokens": 441053970.0, + "step": 17045 + }, + { + "epoch": 1.8719525587524708, + "grad_norm": 1.8193658590316772, + "learning_rate": 5e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7108418345451355, + "num_tokens": 441080255.0, + "step": 17046 + }, + { + "epoch": 1.8720623764550846, + "grad_norm": 1.854213833808899, + "learning_rate": 5e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7125861048698425, + "num_tokens": 441104133.0, + "step": 17047 + }, + { + "epoch": 1.8721721941576983, + "grad_norm": 1.6553137302398682, + "learning_rate": 5e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.735418438911438, + "num_tokens": 441132665.0, + "step": 17048 + }, + { + "epoch": 1.8722820118603118, + "grad_norm": 1.8766671419143677, + "learning_rate": 5e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7458569407463074, + "num_tokens": 441153148.0, + "step": 17049 + }, + { + "epoch": 1.8723918295629254, + "grad_norm": 1.6866673231124878, + "learning_rate": 5e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7417513132095337, + "num_tokens": 441182615.0, + "step": 17050 + }, + { + "epoch": 1.8725016472655391, + "grad_norm": 1.8035274744033813, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7285451292991638, + "num_tokens": 441208216.0, + "step": 17051 + }, + { + "epoch": 1.872611464968153, + "grad_norm": 1.8129661083221436, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7336347103118896, + "num_tokens": 441235432.0, + "step": 17052 + }, + { + "epoch": 1.8727212826707667, + "grad_norm": 1.9782553911209106, + "learning_rate": 5e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7464776635169983, + "num_tokens": 441257276.0, + "step": 17053 + }, + { + "epoch": 1.8728311003733802, + "grad_norm": 2.14079213142395, + "learning_rate": 5e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7375353574752808, + "num_tokens": 441276615.0, + "step": 17054 + }, + { + "epoch": 1.8729409180759937, + "grad_norm": 1.795002818107605, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.727628231048584, + "num_tokens": 441302481.0, + "step": 17055 + }, + { + "epoch": 1.8730507357786075, + "grad_norm": 2.0158445835113525, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7437435388565063, + "num_tokens": 441324053.0, + "step": 17056 + }, + { + "epoch": 1.8731605534812212, + "grad_norm": 1.8799028396606445, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7306879162788391, + "num_tokens": 441349387.0, + "step": 17057 + }, + { + "epoch": 1.873270371183835, + "grad_norm": 1.9757637977600098, + "learning_rate": 5e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.723288893699646, + "num_tokens": 441374730.0, + "step": 17058 + }, + { + "epoch": 1.8733801888864485, + "grad_norm": 1.944783329963684, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7373183965682983, + "num_tokens": 441396045.0, + "step": 17059 + }, + { + "epoch": 1.873490006589062, + "grad_norm": 1.9267174005508423, + "learning_rate": 5e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7170748114585876, + "num_tokens": 441421088.0, + "step": 17060 + }, + { + "epoch": 1.8735998242916758, + "grad_norm": 1.9700692892074585, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7263169288635254, + "num_tokens": 441444720.0, + "step": 17061 + }, + { + "epoch": 1.8737096419942896, + "grad_norm": 1.919763445854187, + "learning_rate": 5e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7102985382080078, + "num_tokens": 441469808.0, + "step": 17062 + }, + { + "epoch": 1.8738194596969031, + "grad_norm": 1.8337647914886475, + "learning_rate": 5e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7212693691253662, + "num_tokens": 441496258.0, + "step": 17063 + }, + { + "epoch": 1.8739292773995166, + "grad_norm": 1.8260973691940308, + "learning_rate": 5e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7323227524757385, + "num_tokens": 441522688.0, + "step": 17064 + }, + { + "epoch": 1.8740390951021304, + "grad_norm": 1.7728787660598755, + "learning_rate": 5e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7361573576927185, + "num_tokens": 441548605.0, + "step": 17065 + }, + { + "epoch": 1.8741489128047442, + "grad_norm": 2.103555917739868, + "learning_rate": 5e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.753129243850708, + "num_tokens": 441568662.0, + "step": 17066 + }, + { + "epoch": 1.874258730507358, + "grad_norm": 1.9023586511611938, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7222607135772705, + "num_tokens": 441593383.0, + "step": 17067 + }, + { + "epoch": 1.8743685482099715, + "grad_norm": 2.1140003204345703, + "learning_rate": 5e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7329504489898682, + "num_tokens": 441614648.0, + "step": 17068 + }, + { + "epoch": 1.874478365912585, + "grad_norm": 1.9207466840744019, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7380546927452087, + "num_tokens": 441636857.0, + "step": 17069 + }, + { + "epoch": 1.8745881836151987, + "grad_norm": 1.7272207736968994, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.724862813949585, + "num_tokens": 441665057.0, + "step": 17070 + }, + { + "epoch": 1.8746980013178125, + "grad_norm": 1.6579090356826782, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7274824976921082, + "num_tokens": 441693487.0, + "step": 17071 + }, + { + "epoch": 1.8748078190204263, + "grad_norm": 1.7751120328903198, + "learning_rate": 5e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7509424686431885, + "num_tokens": 441720700.0, + "step": 17072 + }, + { + "epoch": 1.8749176367230398, + "grad_norm": 1.9171769618988037, + "learning_rate": 5e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7420413494110107, + "num_tokens": 441744145.0, + "step": 17073 + }, + { + "epoch": 1.8750274544256533, + "grad_norm": 1.5923300981521606, + "learning_rate": 5e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7184363603591919, + "num_tokens": 441779217.0, + "step": 17074 + }, + { + "epoch": 1.875137272128267, + "grad_norm": 1.6293085813522339, + "learning_rate": 5e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7286080121994019, + "num_tokens": 441808466.0, + "step": 17075 + }, + { + "epoch": 1.8752470898308808, + "grad_norm": 2.274139165878296, + "learning_rate": 5e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7432588338851929, + "num_tokens": 441827077.0, + "step": 17076 + }, + { + "epoch": 1.8753569075334944, + "grad_norm": 1.7271642684936523, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7237156629562378, + "num_tokens": 441853172.0, + "step": 17077 + }, + { + "epoch": 1.875466725236108, + "grad_norm": 1.9893503189086914, + "learning_rate": 5e-06, + "loss": 0.862, + "mean_token_accuracy": 0.73279869556427, + "num_tokens": 441876874.0, + "step": 17078 + }, + { + "epoch": 1.8755765429387217, + "grad_norm": 1.9138187170028687, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.729634702205658, + "num_tokens": 441902443.0, + "step": 17079 + }, + { + "epoch": 1.8756863606413354, + "grad_norm": 1.8385215997695923, + "learning_rate": 5e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.726856529712677, + "num_tokens": 441926757.0, + "step": 17080 + }, + { + "epoch": 1.8757961783439492, + "grad_norm": 2.088158130645752, + "learning_rate": 5e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7165392637252808, + "num_tokens": 441948220.0, + "step": 17081 + }, + { + "epoch": 1.8759059960465627, + "grad_norm": 1.8541195392608643, + "learning_rate": 5e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7379379868507385, + "num_tokens": 441972436.0, + "step": 17082 + }, + { + "epoch": 1.8760158137491763, + "grad_norm": 1.8791751861572266, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7199760675430298, + "num_tokens": 441997583.0, + "step": 17083 + }, + { + "epoch": 1.87612563145179, + "grad_norm": 1.6204049587249756, + "learning_rate": 5e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7406874895095825, + "num_tokens": 442027803.0, + "step": 17084 + }, + { + "epoch": 1.8762354491544038, + "grad_norm": 1.5969010591506958, + "learning_rate": 5e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7069133520126343, + "num_tokens": 442059293.0, + "step": 17085 + }, + { + "epoch": 1.8763452668570173, + "grad_norm": 1.611406683921814, + "learning_rate": 5e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7250460982322693, + "num_tokens": 442088854.0, + "step": 17086 + }, + { + "epoch": 1.876455084559631, + "grad_norm": 1.8532720804214478, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7433416247367859, + "num_tokens": 442111377.0, + "step": 17087 + }, + { + "epoch": 1.8765649022622446, + "grad_norm": 2.0691514015197754, + "learning_rate": 5e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7322632074356079, + "num_tokens": 442132399.0, + "step": 17088 + }, + { + "epoch": 1.8766747199648584, + "grad_norm": 1.823233723640442, + "learning_rate": 5e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.7433410882949829, + "num_tokens": 442156013.0, + "step": 17089 + }, + { + "epoch": 1.876784537667472, + "grad_norm": 1.8540446758270264, + "learning_rate": 5e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7215625643730164, + "num_tokens": 442185154.0, + "step": 17090 + }, + { + "epoch": 1.8768943553700856, + "grad_norm": 1.5805792808532715, + "learning_rate": 5e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7340711951255798, + "num_tokens": 442217130.0, + "step": 17091 + }, + { + "epoch": 1.8770041730726992, + "grad_norm": 1.7038013935089111, + "learning_rate": 5e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7439111471176147, + "num_tokens": 442244154.0, + "step": 17092 + }, + { + "epoch": 1.877113990775313, + "grad_norm": 1.795660138130188, + "learning_rate": 5e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7406758069992065, + "num_tokens": 442267915.0, + "step": 17093 + }, + { + "epoch": 1.8772238084779267, + "grad_norm": 1.7617546319961548, + "learning_rate": 5e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7387408018112183, + "num_tokens": 442295304.0, + "step": 17094 + }, + { + "epoch": 1.8773336261805404, + "grad_norm": 1.7281635999679565, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7441509962081909, + "num_tokens": 442320605.0, + "step": 17095 + }, + { + "epoch": 1.877443443883154, + "grad_norm": 1.6027350425720215, + "learning_rate": 5e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7293165922164917, + "num_tokens": 442349452.0, + "step": 17096 + }, + { + "epoch": 1.8775532615857675, + "grad_norm": 1.9310548305511475, + "learning_rate": 5e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7528953552246094, + "num_tokens": 442371642.0, + "step": 17097 + }, + { + "epoch": 1.8776630792883813, + "grad_norm": 1.8265202045440674, + "learning_rate": 5e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7185210585594177, + "num_tokens": 442396420.0, + "step": 17098 + }, + { + "epoch": 1.877772896990995, + "grad_norm": 2.1297054290771484, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7229174375534058, + "num_tokens": 442417210.0, + "step": 17099 + }, + { + "epoch": 1.8778827146936086, + "grad_norm": 1.8197816610336304, + "learning_rate": 5e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7213249206542969, + "num_tokens": 442444003.0, + "step": 17100 + }, + { + "epoch": 1.8779925323962223, + "grad_norm": 1.6485072374343872, + "learning_rate": 5e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7481429576873779, + "num_tokens": 442473175.0, + "step": 17101 + }, + { + "epoch": 1.8781023500988359, + "grad_norm": 1.6205775737762451, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.717643141746521, + "num_tokens": 442503521.0, + "step": 17102 + }, + { + "epoch": 1.8782121678014496, + "grad_norm": 1.6747976541519165, + "learning_rate": 5e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7281754612922668, + "num_tokens": 442530912.0, + "step": 17103 + }, + { + "epoch": 1.8783219855040634, + "grad_norm": 2.1751606464385986, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7690852880477905, + "num_tokens": 442548131.0, + "step": 17104 + }, + { + "epoch": 1.878431803206677, + "grad_norm": 1.9451029300689697, + "learning_rate": 5e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7555249333381653, + "num_tokens": 442569195.0, + "step": 17105 + }, + { + "epoch": 1.8785416209092904, + "grad_norm": 2.005685806274414, + "learning_rate": 5e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7488926649093628, + "num_tokens": 442589997.0, + "step": 17106 + }, + { + "epoch": 1.8786514386119042, + "grad_norm": 1.6404640674591064, + "learning_rate": 5e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7205429077148438, + "num_tokens": 442620178.0, + "step": 17107 + }, + { + "epoch": 1.878761256314518, + "grad_norm": 1.6387022733688354, + "learning_rate": 5e-06, + "loss": 0.8067, + "mean_token_accuracy": 0.7446912527084351, + "num_tokens": 442650974.0, + "step": 17108 + }, + { + "epoch": 1.8788710740171317, + "grad_norm": 1.8629696369171143, + "learning_rate": 5e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7187225222587585, + "num_tokens": 442677138.0, + "step": 17109 + }, + { + "epoch": 1.8789808917197452, + "grad_norm": 1.6225334405899048, + "learning_rate": 5e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.7365493178367615, + "num_tokens": 442708559.0, + "step": 17110 + }, + { + "epoch": 1.8790907094223588, + "grad_norm": 1.8614295721054077, + "learning_rate": 5e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7364638447761536, + "num_tokens": 442732848.0, + "step": 17111 + }, + { + "epoch": 1.8792005271249725, + "grad_norm": 1.759809970855713, + "learning_rate": 5e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7348697185516357, + "num_tokens": 442758195.0, + "step": 17112 + }, + { + "epoch": 1.8793103448275863, + "grad_norm": 1.7913331985473633, + "learning_rate": 5e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7317670583724976, + "num_tokens": 442782935.0, + "step": 17113 + }, + { + "epoch": 1.8794201625301998, + "grad_norm": 1.7961084842681885, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7154208421707153, + "num_tokens": 442809244.0, + "step": 17114 + }, + { + "epoch": 1.8795299802328134, + "grad_norm": 1.6370002031326294, + "learning_rate": 5e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.721726655960083, + "num_tokens": 442840247.0, + "step": 17115 + }, + { + "epoch": 1.8796397979354271, + "grad_norm": 1.8560500144958496, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7356702089309692, + "num_tokens": 442867359.0, + "step": 17116 + }, + { + "epoch": 1.8797496156380409, + "grad_norm": 1.8113040924072266, + "learning_rate": 5e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.725824236869812, + "num_tokens": 442892619.0, + "step": 17117 + }, + { + "epoch": 1.8798594333406546, + "grad_norm": 1.7059813737869263, + "learning_rate": 5e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7247294783592224, + "num_tokens": 442919395.0, + "step": 17118 + }, + { + "epoch": 1.8799692510432682, + "grad_norm": 1.9000462293624878, + "learning_rate": 5e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7396396398544312, + "num_tokens": 442943806.0, + "step": 17119 + }, + { + "epoch": 1.8800790687458817, + "grad_norm": 1.6927168369293213, + "learning_rate": 5e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7414829730987549, + "num_tokens": 442968606.0, + "step": 17120 + }, + { + "epoch": 1.8801888864484955, + "grad_norm": 1.8255226612091064, + "learning_rate": 5e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7183648943901062, + "num_tokens": 442994917.0, + "step": 17121 + }, + { + "epoch": 1.8802987041511092, + "grad_norm": 2.0025577545166016, + "learning_rate": 5e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7448409199714661, + "num_tokens": 443019170.0, + "step": 17122 + }, + { + "epoch": 1.880408521853723, + "grad_norm": 1.4944415092468262, + "learning_rate": 5e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7115892767906189, + "num_tokens": 443055192.0, + "step": 17123 + }, + { + "epoch": 1.8805183395563365, + "grad_norm": 1.8005439043045044, + "learning_rate": 5e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7439815402030945, + "num_tokens": 443083457.0, + "step": 17124 + }, + { + "epoch": 1.88062815725895, + "grad_norm": 1.5611964464187622, + "learning_rate": 5e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7499246597290039, + "num_tokens": 443113631.0, + "step": 17125 + }, + { + "epoch": 1.8807379749615638, + "grad_norm": 2.2719249725341797, + "learning_rate": 5e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7620803713798523, + "num_tokens": 443130932.0, + "step": 17126 + }, + { + "epoch": 1.8808477926641776, + "grad_norm": 1.9624519348144531, + "learning_rate": 5e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7233654260635376, + "num_tokens": 443156683.0, + "step": 17127 + }, + { + "epoch": 1.880957610366791, + "grad_norm": 2.101959466934204, + "learning_rate": 5e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.732361912727356, + "num_tokens": 443176665.0, + "step": 17128 + }, + { + "epoch": 1.8810674280694046, + "grad_norm": 1.9299285411834717, + "learning_rate": 5e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7092519998550415, + "num_tokens": 443202172.0, + "step": 17129 + }, + { + "epoch": 1.8811772457720184, + "grad_norm": 1.8836034536361694, + "learning_rate": 5e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7301173210144043, + "num_tokens": 443227921.0, + "step": 17130 + }, + { + "epoch": 1.8812870634746321, + "grad_norm": 1.8198128938674927, + "learning_rate": 5e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.76190584897995, + "num_tokens": 443252879.0, + "step": 17131 + }, + { + "epoch": 1.881396881177246, + "grad_norm": 2.061922073364258, + "learning_rate": 5e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7561045289039612, + "num_tokens": 443272654.0, + "step": 17132 + }, + { + "epoch": 1.8815066988798594, + "grad_norm": 1.7895796298980713, + "learning_rate": 5e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7239134311676025, + "num_tokens": 443299380.0, + "step": 17133 + }, + { + "epoch": 1.881616516582473, + "grad_norm": 1.7341587543487549, + "learning_rate": 5e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7331788539886475, + "num_tokens": 443328407.0, + "step": 17134 + }, + { + "epoch": 1.8817263342850867, + "grad_norm": 1.633960247039795, + "learning_rate": 5e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.737739622592926, + "num_tokens": 443360326.0, + "step": 17135 + }, + { + "epoch": 1.8818361519877005, + "grad_norm": 1.7051246166229248, + "learning_rate": 5e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.736451268196106, + "num_tokens": 443388531.0, + "step": 17136 + }, + { + "epoch": 1.881945969690314, + "grad_norm": 1.7250510454177856, + "learning_rate": 5e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7311204671859741, + "num_tokens": 443413785.0, + "step": 17137 + }, + { + "epoch": 1.8820557873929278, + "grad_norm": 1.8665671348571777, + "learning_rate": 5e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.757176399230957, + "num_tokens": 443436471.0, + "step": 17138 + }, + { + "epoch": 1.8821656050955413, + "grad_norm": 1.6647226810455322, + "learning_rate": 5e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7319023609161377, + "num_tokens": 443464345.0, + "step": 17139 + }, + { + "epoch": 1.882275422798155, + "grad_norm": 1.7601298093795776, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7262272834777832, + "num_tokens": 443491450.0, + "step": 17140 + }, + { + "epoch": 1.8823852405007688, + "grad_norm": 2.0694222450256348, + "learning_rate": 5e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.756255030632019, + "num_tokens": 443510390.0, + "step": 17141 + }, + { + "epoch": 1.8824950582033824, + "grad_norm": 1.6432710886001587, + "learning_rate": 5e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7125654220581055, + "num_tokens": 443540162.0, + "step": 17142 + }, + { + "epoch": 1.882604875905996, + "grad_norm": 1.747872233390808, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7612340450286865, + "num_tokens": 443564929.0, + "step": 17143 + }, + { + "epoch": 1.8827146936086097, + "grad_norm": 1.827043056488037, + "learning_rate": 5e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7392733097076416, + "num_tokens": 443588428.0, + "step": 17144 + }, + { + "epoch": 1.8828245113112234, + "grad_norm": 2.0104241371154785, + "learning_rate": 5e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7327162027359009, + "num_tokens": 443614060.0, + "step": 17145 + }, + { + "epoch": 1.8829343290138372, + "grad_norm": 1.8550668954849243, + "learning_rate": 5e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7043596506118774, + "num_tokens": 443639875.0, + "step": 17146 + }, + { + "epoch": 1.8830441467164507, + "grad_norm": 1.6996583938598633, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.723257303237915, + "num_tokens": 443667064.0, + "step": 17147 + }, + { + "epoch": 1.8831539644190642, + "grad_norm": 1.8469483852386475, + "learning_rate": 5e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7214974761009216, + "num_tokens": 443689988.0, + "step": 17148 + }, + { + "epoch": 1.883263782121678, + "grad_norm": 1.6764335632324219, + "learning_rate": 5e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7626340389251709, + "num_tokens": 443716792.0, + "step": 17149 + }, + { + "epoch": 1.8833735998242918, + "grad_norm": 1.6351573467254639, + "learning_rate": 5e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7331289052963257, + "num_tokens": 443746114.0, + "step": 17150 + }, + { + "epoch": 1.8834834175269053, + "grad_norm": 1.9595727920532227, + "learning_rate": 5e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7316542863845825, + "num_tokens": 443769138.0, + "step": 17151 + }, + { + "epoch": 1.883593235229519, + "grad_norm": 1.6140354871749878, + "learning_rate": 5e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7612640857696533, + "num_tokens": 443798358.0, + "step": 17152 + }, + { + "epoch": 1.8837030529321326, + "grad_norm": 1.6732006072998047, + "learning_rate": 5e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7480909824371338, + "num_tokens": 443823725.0, + "step": 17153 + }, + { + "epoch": 1.8838128706347463, + "grad_norm": 1.9641858339309692, + "learning_rate": 5e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7450616359710693, + "num_tokens": 443846062.0, + "step": 17154 + }, + { + "epoch": 1.88392268833736, + "grad_norm": 1.838283658027649, + "learning_rate": 5e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7336665391921997, + "num_tokens": 443870247.0, + "step": 17155 + }, + { + "epoch": 1.8840325060399736, + "grad_norm": 1.6881016492843628, + "learning_rate": 5e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.707853376865387, + "num_tokens": 443900027.0, + "step": 17156 + }, + { + "epoch": 1.8841423237425872, + "grad_norm": 1.8302748203277588, + "learning_rate": 5e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7451491355895996, + "num_tokens": 443925761.0, + "step": 17157 + }, + { + "epoch": 1.884252141445201, + "grad_norm": 1.7591209411621094, + "learning_rate": 5e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.725767970085144, + "num_tokens": 443954555.0, + "step": 17158 + }, + { + "epoch": 1.8843619591478147, + "grad_norm": 1.8225816488265991, + "learning_rate": 5e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7285268902778625, + "num_tokens": 443980869.0, + "step": 17159 + }, + { + "epoch": 1.8844717768504284, + "grad_norm": 1.9826338291168213, + "learning_rate": 5e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7261090278625488, + "num_tokens": 444003685.0, + "step": 17160 + }, + { + "epoch": 1.884581594553042, + "grad_norm": 1.7457644939422607, + "learning_rate": 5e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7526265382766724, + "num_tokens": 444027673.0, + "step": 17161 + }, + { + "epoch": 1.8846914122556555, + "grad_norm": 1.9106719493865967, + "learning_rate": 5e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7476300597190857, + "num_tokens": 444049264.0, + "step": 17162 + }, + { + "epoch": 1.8848012299582693, + "grad_norm": 1.7538981437683105, + "learning_rate": 5e-06, + "loss": 0.893, + "mean_token_accuracy": 0.721619725227356, + "num_tokens": 444075920.0, + "step": 17163 + }, + { + "epoch": 1.884911047660883, + "grad_norm": 1.6600720882415771, + "learning_rate": 5e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.738792896270752, + "num_tokens": 444103511.0, + "step": 17164 + }, + { + "epoch": 1.8850208653634966, + "grad_norm": 1.8965948820114136, + "learning_rate": 5e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7095233201980591, + "num_tokens": 444130229.0, + "step": 17165 + }, + { + "epoch": 1.88513068306611, + "grad_norm": 1.7827069759368896, + "learning_rate": 5e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7526937127113342, + "num_tokens": 444153285.0, + "step": 17166 + }, + { + "epoch": 1.8852405007687238, + "grad_norm": 1.7515509128570557, + "learning_rate": 5e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7354601621627808, + "num_tokens": 444180885.0, + "step": 17167 + }, + { + "epoch": 1.8853503184713376, + "grad_norm": 1.7310017347335815, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.717519998550415, + "num_tokens": 444209230.0, + "step": 17168 + }, + { + "epoch": 1.8854601361739514, + "grad_norm": 1.8454177379608154, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7460728883743286, + "num_tokens": 444233545.0, + "step": 17169 + }, + { + "epoch": 1.885569953876565, + "grad_norm": 2.0036466121673584, + "learning_rate": 5e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7114576101303101, + "num_tokens": 444257870.0, + "step": 17170 + }, + { + "epoch": 1.8856797715791784, + "grad_norm": 1.6473850011825562, + "learning_rate": 5e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7314899563789368, + "num_tokens": 444286106.0, + "step": 17171 + }, + { + "epoch": 1.8857895892817922, + "grad_norm": 1.7964469194412231, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7460654377937317, + "num_tokens": 444311545.0, + "step": 17172 + }, + { + "epoch": 1.885899406984406, + "grad_norm": 1.7678710222244263, + "learning_rate": 5e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7407238483428955, + "num_tokens": 444336837.0, + "step": 17173 + }, + { + "epoch": 1.8860092246870197, + "grad_norm": 2.034384250640869, + "learning_rate": 5e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7271742820739746, + "num_tokens": 444359621.0, + "step": 17174 + }, + { + "epoch": 1.8861190423896332, + "grad_norm": 1.7739723920822144, + "learning_rate": 5e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7185269594192505, + "num_tokens": 444383869.0, + "step": 17175 + }, + { + "epoch": 1.8862288600922468, + "grad_norm": 1.4560843706130981, + "learning_rate": 5e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7383948564529419, + "num_tokens": 444417718.0, + "step": 17176 + }, + { + "epoch": 1.8863386777948605, + "grad_norm": 1.6360163688659668, + "learning_rate": 5e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7500901818275452, + "num_tokens": 444446374.0, + "step": 17177 + }, + { + "epoch": 1.8864484954974743, + "grad_norm": 1.7227805852890015, + "learning_rate": 5e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7200635075569153, + "num_tokens": 444473230.0, + "step": 17178 + }, + { + "epoch": 1.8865583132000878, + "grad_norm": 1.7299412488937378, + "learning_rate": 5e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7256475687026978, + "num_tokens": 444502675.0, + "step": 17179 + }, + { + "epoch": 1.8866681309027014, + "grad_norm": 1.6218990087509155, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7392227649688721, + "num_tokens": 444531403.0, + "step": 17180 + }, + { + "epoch": 1.8867779486053151, + "grad_norm": 1.8710025548934937, + "learning_rate": 5e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7245597839355469, + "num_tokens": 444555327.0, + "step": 17181 + }, + { + "epoch": 1.8868877663079289, + "grad_norm": 1.8141297101974487, + "learning_rate": 5e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7334374189376831, + "num_tokens": 444579926.0, + "step": 17182 + }, + { + "epoch": 1.8869975840105426, + "grad_norm": 1.6397064924240112, + "learning_rate": 5e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7257410287857056, + "num_tokens": 444610387.0, + "step": 17183 + }, + { + "epoch": 1.8871074017131562, + "grad_norm": 1.705748200416565, + "learning_rate": 5e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7505798935890198, + "num_tokens": 444636956.0, + "step": 17184 + }, + { + "epoch": 1.8872172194157697, + "grad_norm": 1.647788643836975, + "learning_rate": 5e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7322080135345459, + "num_tokens": 444668861.0, + "step": 17185 + }, + { + "epoch": 1.8873270371183835, + "grad_norm": 1.8418316841125488, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7386179566383362, + "num_tokens": 444692330.0, + "step": 17186 + }, + { + "epoch": 1.8874368548209972, + "grad_norm": 1.8794643878936768, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7364176511764526, + "num_tokens": 444715898.0, + "step": 17187 + }, + { + "epoch": 1.887546672523611, + "grad_norm": 1.9575875997543335, + "learning_rate": 5e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7253193259239197, + "num_tokens": 444737489.0, + "step": 17188 + }, + { + "epoch": 1.8876564902262245, + "grad_norm": 1.77987802028656, + "learning_rate": 5e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7537909746170044, + "num_tokens": 444761698.0, + "step": 17189 + }, + { + "epoch": 1.887766307928838, + "grad_norm": 2.0312294960021973, + "learning_rate": 5e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7639621496200562, + "num_tokens": 444782080.0, + "step": 17190 + }, + { + "epoch": 1.8878761256314518, + "grad_norm": 1.8225277662277222, + "learning_rate": 5e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7255213856697083, + "num_tokens": 444808751.0, + "step": 17191 + }, + { + "epoch": 1.8879859433340656, + "grad_norm": 1.7836992740631104, + "learning_rate": 5e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7432820200920105, + "num_tokens": 444833511.0, + "step": 17192 + }, + { + "epoch": 1.888095761036679, + "grad_norm": 1.9899530410766602, + "learning_rate": 5e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7449542284011841, + "num_tokens": 444855570.0, + "step": 17193 + }, + { + "epoch": 1.8882055787392926, + "grad_norm": 1.8053914308547974, + "learning_rate": 5e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.721075713634491, + "num_tokens": 444880608.0, + "step": 17194 + }, + { + "epoch": 1.8883153964419064, + "grad_norm": 1.9389560222625732, + "learning_rate": 5e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7422640919685364, + "num_tokens": 444902044.0, + "step": 17195 + }, + { + "epoch": 1.8884252141445201, + "grad_norm": 1.8355406522750854, + "learning_rate": 5e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7277175188064575, + "num_tokens": 444927919.0, + "step": 17196 + }, + { + "epoch": 1.888535031847134, + "grad_norm": 1.7257146835327148, + "learning_rate": 5e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.732429563999176, + "num_tokens": 444957552.0, + "step": 17197 + }, + { + "epoch": 1.8886448495497474, + "grad_norm": 1.9018898010253906, + "learning_rate": 5e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7327759861946106, + "num_tokens": 444980503.0, + "step": 17198 + }, + { + "epoch": 1.888754667252361, + "grad_norm": 2.0301244258880615, + "learning_rate": 5e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.734434962272644, + "num_tokens": 445003841.0, + "step": 17199 + }, + { + "epoch": 1.8888644849549747, + "grad_norm": 2.0585880279541016, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7384828329086304, + "num_tokens": 445025427.0, + "step": 17200 + }, + { + "epoch": 1.8889743026575885, + "grad_norm": 1.937964677810669, + "learning_rate": 5e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7386624813079834, + "num_tokens": 445048247.0, + "step": 17201 + }, + { + "epoch": 1.889084120360202, + "grad_norm": 1.7911368608474731, + "learning_rate": 5e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7165088653564453, + "num_tokens": 445077532.0, + "step": 17202 + }, + { + "epoch": 1.8891939380628158, + "grad_norm": 2.0709540843963623, + "learning_rate": 5e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7504069805145264, + "num_tokens": 445098064.0, + "step": 17203 + }, + { + "epoch": 1.8893037557654293, + "grad_norm": 1.832844853401184, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7399507761001587, + "num_tokens": 445125019.0, + "step": 17204 + }, + { + "epoch": 1.889413573468043, + "grad_norm": 1.8258966207504272, + "learning_rate": 5e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7329092025756836, + "num_tokens": 445151209.0, + "step": 17205 + }, + { + "epoch": 1.8895233911706568, + "grad_norm": 1.587598443031311, + "learning_rate": 5e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.7677616477012634, + "num_tokens": 445178851.0, + "step": 17206 + }, + { + "epoch": 1.8896332088732704, + "grad_norm": 1.6527912616729736, + "learning_rate": 5e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7117506265640259, + "num_tokens": 445211552.0, + "step": 17207 + }, + { + "epoch": 1.8897430265758839, + "grad_norm": 1.8917995691299438, + "learning_rate": 5e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7368403673171997, + "num_tokens": 445237031.0, + "step": 17208 + }, + { + "epoch": 1.8898528442784976, + "grad_norm": 2.0819921493530273, + "learning_rate": 5e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7473065257072449, + "num_tokens": 445256517.0, + "step": 17209 + }, + { + "epoch": 1.8899626619811114, + "grad_norm": 1.7744687795639038, + "learning_rate": 5e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.713634192943573, + "num_tokens": 445286159.0, + "step": 17210 + }, + { + "epoch": 1.8900724796837252, + "grad_norm": 1.596329689025879, + "learning_rate": 5e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7169211506843567, + "num_tokens": 445317048.0, + "step": 17211 + }, + { + "epoch": 1.8901822973863387, + "grad_norm": 1.7188448905944824, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7194305658340454, + "num_tokens": 445345292.0, + "step": 17212 + }, + { + "epoch": 1.8902921150889522, + "grad_norm": 1.835376501083374, + "learning_rate": 5e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7468389272689819, + "num_tokens": 445369451.0, + "step": 17213 + }, + { + "epoch": 1.890401932791566, + "grad_norm": 1.9358192682266235, + "learning_rate": 5e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7498851418495178, + "num_tokens": 445390776.0, + "step": 17214 + }, + { + "epoch": 1.8905117504941797, + "grad_norm": 1.724295735359192, + "learning_rate": 5e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7213034629821777, + "num_tokens": 445418306.0, + "step": 17215 + }, + { + "epoch": 1.8906215681967933, + "grad_norm": 1.8918899297714233, + "learning_rate": 5e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7336748838424683, + "num_tokens": 445440828.0, + "step": 17216 + }, + { + "epoch": 1.890731385899407, + "grad_norm": 1.6820231676101685, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7426417469978333, + "num_tokens": 445466737.0, + "step": 17217 + }, + { + "epoch": 1.8908412036020206, + "grad_norm": 1.9769935607910156, + "learning_rate": 5e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7515840530395508, + "num_tokens": 445487345.0, + "step": 17218 + }, + { + "epoch": 1.8909510213046343, + "grad_norm": 2.101475715637207, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7168165445327759, + "num_tokens": 445508297.0, + "step": 17219 + }, + { + "epoch": 1.891060839007248, + "grad_norm": 1.8617316484451294, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7420578002929688, + "num_tokens": 445531920.0, + "step": 17220 + }, + { + "epoch": 1.8911706567098616, + "grad_norm": 1.8328044414520264, + "learning_rate": 5e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7382236123085022, + "num_tokens": 445556584.0, + "step": 17221 + }, + { + "epoch": 1.8912804744124752, + "grad_norm": 1.7150462865829468, + "learning_rate": 5e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7128855586051941, + "num_tokens": 445586296.0, + "step": 17222 + }, + { + "epoch": 1.891390292115089, + "grad_norm": 1.5542302131652832, + "learning_rate": 5e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7260857224464417, + "num_tokens": 445618019.0, + "step": 17223 + }, + { + "epoch": 1.8915001098177027, + "grad_norm": 1.7647476196289062, + "learning_rate": 5e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7427324056625366, + "num_tokens": 445641926.0, + "step": 17224 + }, + { + "epoch": 1.8916099275203164, + "grad_norm": 1.8771486282348633, + "learning_rate": 5e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.724099338054657, + "num_tokens": 445666304.0, + "step": 17225 + }, + { + "epoch": 1.89171974522293, + "grad_norm": 1.7811613082885742, + "learning_rate": 5e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7294843792915344, + "num_tokens": 445693302.0, + "step": 17226 + }, + { + "epoch": 1.8918295629255435, + "grad_norm": 1.9170275926589966, + "learning_rate": 5e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7454798221588135, + "num_tokens": 445715043.0, + "step": 17227 + }, + { + "epoch": 1.8919393806281573, + "grad_norm": 1.9406696557998657, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7524890303611755, + "num_tokens": 445736124.0, + "step": 17228 + }, + { + "epoch": 1.892049198330771, + "grad_norm": 1.7848926782608032, + "learning_rate": 5e-06, + "loss": 0.835, + "mean_token_accuracy": 0.735684335231781, + "num_tokens": 445763991.0, + "step": 17229 + }, + { + "epoch": 1.8921590160333845, + "grad_norm": 1.6957865953445435, + "learning_rate": 5e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7308202981948853, + "num_tokens": 445790919.0, + "step": 17230 + }, + { + "epoch": 1.892268833735998, + "grad_norm": 1.6973543167114258, + "learning_rate": 5e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7100263833999634, + "num_tokens": 445817301.0, + "step": 17231 + }, + { + "epoch": 1.8923786514386118, + "grad_norm": 2.0050718784332275, + "learning_rate": 5e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7331749200820923, + "num_tokens": 445840494.0, + "step": 17232 + }, + { + "epoch": 1.8924884691412256, + "grad_norm": 2.0632236003875732, + "learning_rate": 5e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.722712516784668, + "num_tokens": 445864998.0, + "step": 17233 + }, + { + "epoch": 1.8925982868438394, + "grad_norm": 1.6847549676895142, + "learning_rate": 5e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7292944192886353, + "num_tokens": 445892034.0, + "step": 17234 + }, + { + "epoch": 1.8927081045464529, + "grad_norm": 1.8881813287734985, + "learning_rate": 5e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7245566248893738, + "num_tokens": 445919102.0, + "step": 17235 + }, + { + "epoch": 1.8928179222490664, + "grad_norm": 1.9975522756576538, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7260679006576538, + "num_tokens": 445943699.0, + "step": 17236 + }, + { + "epoch": 1.8929277399516802, + "grad_norm": 1.7738322019577026, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7356013059616089, + "num_tokens": 445967979.0, + "step": 17237 + }, + { + "epoch": 1.893037557654294, + "grad_norm": 1.7689826488494873, + "learning_rate": 5e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7341304421424866, + "num_tokens": 445995287.0, + "step": 17238 + }, + { + "epoch": 1.8931473753569077, + "grad_norm": 1.9467912912368774, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7371189594268799, + "num_tokens": 446017814.0, + "step": 17239 + }, + { + "epoch": 1.8932571930595212, + "grad_norm": 1.6603083610534668, + "learning_rate": 5e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7133467793464661, + "num_tokens": 446047130.0, + "step": 17240 + }, + { + "epoch": 1.8933670107621348, + "grad_norm": 1.643007755279541, + "learning_rate": 5e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7205802202224731, + "num_tokens": 446077991.0, + "step": 17241 + }, + { + "epoch": 1.8934768284647485, + "grad_norm": 1.8431847095489502, + "learning_rate": 5e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.754774272441864, + "num_tokens": 446101476.0, + "step": 17242 + }, + { + "epoch": 1.8935866461673623, + "grad_norm": 1.8256686925888062, + "learning_rate": 5e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7137417793273926, + "num_tokens": 446128580.0, + "step": 17243 + }, + { + "epoch": 1.8936964638699758, + "grad_norm": 1.545047640800476, + "learning_rate": 5e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7352553606033325, + "num_tokens": 446161742.0, + "step": 17244 + }, + { + "epoch": 1.8938062815725893, + "grad_norm": 1.7677346467971802, + "learning_rate": 5e-06, + "loss": 0.893, + "mean_token_accuracy": 0.715329110622406, + "num_tokens": 446187036.0, + "step": 17245 + }, + { + "epoch": 1.893916099275203, + "grad_norm": 1.955057144165039, + "learning_rate": 5e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7243046164512634, + "num_tokens": 446210116.0, + "step": 17246 + }, + { + "epoch": 1.8940259169778169, + "grad_norm": 1.9418100118637085, + "learning_rate": 5e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7468776702880859, + "num_tokens": 446231726.0, + "step": 17247 + }, + { + "epoch": 1.8941357346804306, + "grad_norm": 1.8737406730651855, + "learning_rate": 5e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7154553532600403, + "num_tokens": 446256795.0, + "step": 17248 + }, + { + "epoch": 1.8942455523830442, + "grad_norm": 1.9149597883224487, + "learning_rate": 5e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7113152146339417, + "num_tokens": 446280775.0, + "step": 17249 + }, + { + "epoch": 1.8943553700856577, + "grad_norm": 1.780314326286316, + "learning_rate": 5e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7280237674713135, + "num_tokens": 446308973.0, + "step": 17250 + }, + { + "epoch": 1.8944651877882714, + "grad_norm": 1.7013996839523315, + "learning_rate": 5e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7274285554885864, + "num_tokens": 446338026.0, + "step": 17251 + }, + { + "epoch": 1.8945750054908852, + "grad_norm": 1.7976816892623901, + "learning_rate": 5e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7488534450531006, + "num_tokens": 446362319.0, + "step": 17252 + }, + { + "epoch": 1.894684823193499, + "grad_norm": 1.755617380142212, + "learning_rate": 5e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7447414398193359, + "num_tokens": 446391842.0, + "step": 17253 + }, + { + "epoch": 1.8947946408961125, + "grad_norm": 1.8424384593963623, + "learning_rate": 5e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7333859205245972, + "num_tokens": 446416757.0, + "step": 17254 + }, + { + "epoch": 1.894904458598726, + "grad_norm": 1.6560933589935303, + "learning_rate": 5e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7186247110366821, + "num_tokens": 446447863.0, + "step": 17255 + }, + { + "epoch": 1.8950142763013398, + "grad_norm": 1.9658427238464355, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7537510395050049, + "num_tokens": 446469320.0, + "step": 17256 + }, + { + "epoch": 1.8951240940039535, + "grad_norm": 1.7763488292694092, + "learning_rate": 5e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7361506223678589, + "num_tokens": 446492871.0, + "step": 17257 + }, + { + "epoch": 1.895233911706567, + "grad_norm": 2.028102159500122, + "learning_rate": 5e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7473143339157104, + "num_tokens": 446512111.0, + "step": 17258 + }, + { + "epoch": 1.8953437294091806, + "grad_norm": 1.8260667324066162, + "learning_rate": 5e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7222228050231934, + "num_tokens": 446536287.0, + "step": 17259 + }, + { + "epoch": 1.8954535471117944, + "grad_norm": 1.6055537462234497, + "learning_rate": 5e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7050159573554993, + "num_tokens": 446569386.0, + "step": 17260 + }, + { + "epoch": 1.8955633648144081, + "grad_norm": 1.9415405988693237, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7325494289398193, + "num_tokens": 446591082.0, + "step": 17261 + }, + { + "epoch": 1.8956731825170219, + "grad_norm": 1.9193052053451538, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7358711957931519, + "num_tokens": 446616387.0, + "step": 17262 + }, + { + "epoch": 1.8957830002196354, + "grad_norm": 1.8932793140411377, + "learning_rate": 5e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.748961329460144, + "num_tokens": 446638249.0, + "step": 17263 + }, + { + "epoch": 1.895892817922249, + "grad_norm": 1.7594364881515503, + "learning_rate": 5e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.722912073135376, + "num_tokens": 446667147.0, + "step": 17264 + }, + { + "epoch": 1.8960026356248627, + "grad_norm": 1.8840574026107788, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7444349527359009, + "num_tokens": 446690016.0, + "step": 17265 + }, + { + "epoch": 1.8961124533274765, + "grad_norm": 1.540522575378418, + "learning_rate": 5e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7425289154052734, + "num_tokens": 446721192.0, + "step": 17266 + }, + { + "epoch": 1.89622227103009, + "grad_norm": 1.9146898984909058, + "learning_rate": 5e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7047173380851746, + "num_tokens": 446745946.0, + "step": 17267 + }, + { + "epoch": 1.8963320887327038, + "grad_norm": 1.8946155309677124, + "learning_rate": 5e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7461419701576233, + "num_tokens": 446768911.0, + "step": 17268 + }, + { + "epoch": 1.8964419064353173, + "grad_norm": 2.058760643005371, + "learning_rate": 5e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7467260360717773, + "num_tokens": 446786777.0, + "step": 17269 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 1.8566668033599854, + "learning_rate": 5e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7409132719039917, + "num_tokens": 446809162.0, + "step": 17270 + }, + { + "epoch": 1.8966615418405448, + "grad_norm": 2.1331734657287598, + "learning_rate": 5e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7372704148292542, + "num_tokens": 446829343.0, + "step": 17271 + }, + { + "epoch": 1.8967713595431583, + "grad_norm": 1.5169198513031006, + "learning_rate": 5e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7142845392227173, + "num_tokens": 446862942.0, + "step": 17272 + }, + { + "epoch": 1.8968811772457719, + "grad_norm": 1.8188493251800537, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7457848191261292, + "num_tokens": 446886824.0, + "step": 17273 + }, + { + "epoch": 1.8969909949483856, + "grad_norm": 1.7152701616287231, + "learning_rate": 5e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.6992560029029846, + "num_tokens": 446914170.0, + "step": 17274 + }, + { + "epoch": 1.8971008126509994, + "grad_norm": 2.1362979412078857, + "learning_rate": 5e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.736402153968811, + "num_tokens": 446933643.0, + "step": 17275 + }, + { + "epoch": 1.8972106303536131, + "grad_norm": 2.0168111324310303, + "learning_rate": 5e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7436615228652954, + "num_tokens": 446953317.0, + "step": 17276 + }, + { + "epoch": 1.8973204480562267, + "grad_norm": 1.6149877309799194, + "learning_rate": 5e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7119189500808716, + "num_tokens": 446985265.0, + "step": 17277 + }, + { + "epoch": 1.8974302657588402, + "grad_norm": 1.8959954977035522, + "learning_rate": 5e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7419130802154541, + "num_tokens": 447005981.0, + "step": 17278 + }, + { + "epoch": 1.897540083461454, + "grad_norm": 1.6746710538864136, + "learning_rate": 5e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7333840131759644, + "num_tokens": 447034048.0, + "step": 17279 + }, + { + "epoch": 1.8976499011640677, + "grad_norm": 2.2080094814300537, + "learning_rate": 5e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.734764039516449, + "num_tokens": 447051616.0, + "step": 17280 + }, + { + "epoch": 1.8977597188666813, + "grad_norm": 1.7503446340560913, + "learning_rate": 5e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7061009407043457, + "num_tokens": 447080085.0, + "step": 17281 + }, + { + "epoch": 1.897869536569295, + "grad_norm": 1.770688533782959, + "learning_rate": 5e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7221241593360901, + "num_tokens": 447106451.0, + "step": 17282 + }, + { + "epoch": 1.8979793542719086, + "grad_norm": 1.6453338861465454, + "learning_rate": 5e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.75074702501297, + "num_tokens": 447135773.0, + "step": 17283 + }, + { + "epoch": 1.8980891719745223, + "grad_norm": 1.8224802017211914, + "learning_rate": 5e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7216979265213013, + "num_tokens": 447157547.0, + "step": 17284 + }, + { + "epoch": 1.898198989677136, + "grad_norm": 1.8789180517196655, + "learning_rate": 5e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7479282021522522, + "num_tokens": 447178550.0, + "step": 17285 + }, + { + "epoch": 1.8983088073797496, + "grad_norm": 1.7352977991104126, + "learning_rate": 5e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7556558847427368, + "num_tokens": 447203080.0, + "step": 17286 + }, + { + "epoch": 1.8984186250823631, + "grad_norm": 1.8640944957733154, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7369219660758972, + "num_tokens": 447225190.0, + "step": 17287 + }, + { + "epoch": 1.898528442784977, + "grad_norm": 1.9204137325286865, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7344939708709717, + "num_tokens": 447249503.0, + "step": 17288 + }, + { + "epoch": 1.8986382604875907, + "grad_norm": 1.7742786407470703, + "learning_rate": 5e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7192853689193726, + "num_tokens": 447275670.0, + "step": 17289 + }, + { + "epoch": 1.8987480781902044, + "grad_norm": 1.6339397430419922, + "learning_rate": 5e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.712807297706604, + "num_tokens": 447308331.0, + "step": 17290 + }, + { + "epoch": 1.898857895892818, + "grad_norm": 2.0914859771728516, + "learning_rate": 5e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7422392964363098, + "num_tokens": 447328592.0, + "step": 17291 + }, + { + "epoch": 1.8989677135954315, + "grad_norm": 1.6365162134170532, + "learning_rate": 5e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.6998405456542969, + "num_tokens": 447361564.0, + "step": 17292 + }, + { + "epoch": 1.8990775312980452, + "grad_norm": 1.872673511505127, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7518874406814575, + "num_tokens": 447387297.0, + "step": 17293 + }, + { + "epoch": 1.899187349000659, + "grad_norm": 1.7524334192276, + "learning_rate": 5e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7537133097648621, + "num_tokens": 447411647.0, + "step": 17294 + }, + { + "epoch": 1.8992971667032725, + "grad_norm": 1.6603597402572632, + "learning_rate": 5e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7338520884513855, + "num_tokens": 447439629.0, + "step": 17295 + }, + { + "epoch": 1.899406984405886, + "grad_norm": 1.8826922178268433, + "learning_rate": 5e-06, + "loss": 0.7569, + "mean_token_accuracy": 0.7501510381698608, + "num_tokens": 447462391.0, + "step": 17296 + }, + { + "epoch": 1.8995168021084998, + "grad_norm": 1.8214651346206665, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7293623685836792, + "num_tokens": 447487688.0, + "step": 17297 + }, + { + "epoch": 1.8996266198111136, + "grad_norm": 1.8014575242996216, + "learning_rate": 5e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7295277118682861, + "num_tokens": 447514000.0, + "step": 17298 + }, + { + "epoch": 1.8997364375137273, + "grad_norm": 1.7184852361679077, + "learning_rate": 5e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7379416823387146, + "num_tokens": 447542422.0, + "step": 17299 + }, + { + "epoch": 1.8998462552163409, + "grad_norm": 1.8743467330932617, + "learning_rate": 5e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7489438652992249, + "num_tokens": 447565103.0, + "step": 17300 + }, + { + "epoch": 1.8999560729189544, + "grad_norm": 1.8040748834609985, + "learning_rate": 5e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7332983016967773, + "num_tokens": 447590435.0, + "step": 17301 + }, + { + "epoch": 1.9000658906215682, + "grad_norm": 1.9021278619766235, + "learning_rate": 5e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7448412775993347, + "num_tokens": 447614250.0, + "step": 17302 + }, + { + "epoch": 1.900175708324182, + "grad_norm": 1.89555823802948, + "learning_rate": 5e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7348544597625732, + "num_tokens": 447636163.0, + "step": 17303 + }, + { + "epoch": 1.9002855260267957, + "grad_norm": 2.0137779712677, + "learning_rate": 5e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7364178895950317, + "num_tokens": 447657393.0, + "step": 17304 + }, + { + "epoch": 1.9003953437294092, + "grad_norm": 1.838613748550415, + "learning_rate": 5e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7364886999130249, + "num_tokens": 447681811.0, + "step": 17305 + }, + { + "epoch": 1.9005051614320227, + "grad_norm": 1.9092670679092407, + "learning_rate": 5e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7140257358551025, + "num_tokens": 447707265.0, + "step": 17306 + }, + { + "epoch": 1.9006149791346365, + "grad_norm": 1.8881946802139282, + "learning_rate": 5e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.715343713760376, + "num_tokens": 447733345.0, + "step": 17307 + }, + { + "epoch": 1.9007247968372503, + "grad_norm": 1.843899130821228, + "learning_rate": 5e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7250277400016785, + "num_tokens": 447757495.0, + "step": 17308 + }, + { + "epoch": 1.9008346145398638, + "grad_norm": 1.7250030040740967, + "learning_rate": 5e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7252970933914185, + "num_tokens": 447784395.0, + "step": 17309 + }, + { + "epoch": 1.9009444322424773, + "grad_norm": 1.728872537612915, + "learning_rate": 5e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7250539660453796, + "num_tokens": 447813041.0, + "step": 17310 + }, + { + "epoch": 1.901054249945091, + "grad_norm": 1.7699273824691772, + "learning_rate": 5e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.734340250492096, + "num_tokens": 447839023.0, + "step": 17311 + }, + { + "epoch": 1.9011640676477048, + "grad_norm": 1.7252459526062012, + "learning_rate": 5e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7154822945594788, + "num_tokens": 447867931.0, + "step": 17312 + }, + { + "epoch": 1.9012738853503186, + "grad_norm": 1.7914206981658936, + "learning_rate": 5e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7323853373527527, + "num_tokens": 447893509.0, + "step": 17313 + }, + { + "epoch": 1.9013837030529321, + "grad_norm": 1.6994045972824097, + "learning_rate": 5e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7364774346351624, + "num_tokens": 447920047.0, + "step": 17314 + }, + { + "epoch": 1.9014935207555457, + "grad_norm": 1.6871534585952759, + "learning_rate": 5e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7300546169281006, + "num_tokens": 447946884.0, + "step": 17315 + }, + { + "epoch": 1.9016033384581594, + "grad_norm": 2.1646878719329834, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7453877925872803, + "num_tokens": 447965115.0, + "step": 17316 + }, + { + "epoch": 1.9017131561607732, + "grad_norm": 1.6030352115631104, + "learning_rate": 5e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7508819103240967, + "num_tokens": 447991960.0, + "step": 17317 + }, + { + "epoch": 1.9018229738633867, + "grad_norm": 1.8709830045700073, + "learning_rate": 5e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7422646284103394, + "num_tokens": 448016206.0, + "step": 17318 + }, + { + "epoch": 1.9019327915660005, + "grad_norm": 1.7749435901641846, + "learning_rate": 5e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7280187606811523, + "num_tokens": 448044972.0, + "step": 17319 + }, + { + "epoch": 1.902042609268614, + "grad_norm": 1.8935978412628174, + "learning_rate": 5e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7399394512176514, + "num_tokens": 448068491.0, + "step": 17320 + }, + { + "epoch": 1.9021524269712278, + "grad_norm": 1.9333820343017578, + "learning_rate": 5e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7282819747924805, + "num_tokens": 448091542.0, + "step": 17321 + }, + { + "epoch": 1.9022622446738415, + "grad_norm": 1.8000874519348145, + "learning_rate": 5e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7357333898544312, + "num_tokens": 448116289.0, + "step": 17322 + }, + { + "epoch": 1.902372062376455, + "grad_norm": 1.7476781606674194, + "learning_rate": 5e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7261930704116821, + "num_tokens": 448144575.0, + "step": 17323 + }, + { + "epoch": 1.9024818800790686, + "grad_norm": 1.6248856782913208, + "learning_rate": 5e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7295289635658264, + "num_tokens": 448175682.0, + "step": 17324 + }, + { + "epoch": 1.9025916977816824, + "grad_norm": 1.8383742570877075, + "learning_rate": 5e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7509313821792603, + "num_tokens": 448199172.0, + "step": 17325 + }, + { + "epoch": 1.9027015154842961, + "grad_norm": 1.7835277318954468, + "learning_rate": 5e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7346706390380859, + "num_tokens": 448225266.0, + "step": 17326 + }, + { + "epoch": 1.9028113331869099, + "grad_norm": 1.9707533121109009, + "learning_rate": 5e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7410216927528381, + "num_tokens": 448245322.0, + "step": 17327 + }, + { + "epoch": 1.9029211508895234, + "grad_norm": 2.0177414417266846, + "learning_rate": 5e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7353024482727051, + "num_tokens": 448266976.0, + "step": 17328 + }, + { + "epoch": 1.903030968592137, + "grad_norm": 1.7581249475479126, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7513222694396973, + "num_tokens": 448293868.0, + "step": 17329 + }, + { + "epoch": 1.9031407862947507, + "grad_norm": 1.778549075126648, + "learning_rate": 5e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7258172035217285, + "num_tokens": 448320250.0, + "step": 17330 + }, + { + "epoch": 1.9032506039973645, + "grad_norm": 1.832100749015808, + "learning_rate": 5e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7407314777374268, + "num_tokens": 448344554.0, + "step": 17331 + }, + { + "epoch": 1.903360421699978, + "grad_norm": 2.1181859970092773, + "learning_rate": 5e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.712538480758667, + "num_tokens": 448375987.0, + "step": 17332 + }, + { + "epoch": 1.9034702394025917, + "grad_norm": 1.750828504562378, + "learning_rate": 5e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7121962308883667, + "num_tokens": 448404909.0, + "step": 17333 + }, + { + "epoch": 1.9035800571052053, + "grad_norm": 1.8384877443313599, + "learning_rate": 5e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7138783931732178, + "num_tokens": 448434109.0, + "step": 17334 + }, + { + "epoch": 1.903689874807819, + "grad_norm": 1.5239670276641846, + "learning_rate": 5e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7245861291885376, + "num_tokens": 448468126.0, + "step": 17335 + }, + { + "epoch": 1.9037996925104328, + "grad_norm": 1.9721546173095703, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7252671122550964, + "num_tokens": 448491358.0, + "step": 17336 + }, + { + "epoch": 1.9039095102130463, + "grad_norm": 1.9351874589920044, + "learning_rate": 5e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7264523506164551, + "num_tokens": 448515446.0, + "step": 17337 + }, + { + "epoch": 1.9040193279156599, + "grad_norm": 1.6746138334274292, + "learning_rate": 5e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7519480586051941, + "num_tokens": 448540900.0, + "step": 17338 + }, + { + "epoch": 1.9041291456182736, + "grad_norm": 1.9888999462127686, + "learning_rate": 5e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.7426519393920898, + "num_tokens": 448560992.0, + "step": 17339 + }, + { + "epoch": 1.9042389633208874, + "grad_norm": 1.7510042190551758, + "learning_rate": 5e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7395113706588745, + "num_tokens": 448588140.0, + "step": 17340 + }, + { + "epoch": 1.9043487810235011, + "grad_norm": 1.958162784576416, + "learning_rate": 5e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7348645329475403, + "num_tokens": 448610701.0, + "step": 17341 + }, + { + "epoch": 1.9044585987261147, + "grad_norm": 1.6797131299972534, + "learning_rate": 5e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7307987213134766, + "num_tokens": 448640444.0, + "step": 17342 + }, + { + "epoch": 1.9045684164287282, + "grad_norm": 1.862914800643921, + "learning_rate": 5e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7262865900993347, + "num_tokens": 448666896.0, + "step": 17343 + }, + { + "epoch": 1.904678234131342, + "grad_norm": 1.7576344013214111, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7319326400756836, + "num_tokens": 448693203.0, + "step": 17344 + }, + { + "epoch": 1.9047880518339557, + "grad_norm": 1.816226840019226, + "learning_rate": 5e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.730750322341919, + "num_tokens": 448718634.0, + "step": 17345 + }, + { + "epoch": 1.9048978695365693, + "grad_norm": 1.861789345741272, + "learning_rate": 5e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7451230883598328, + "num_tokens": 448741126.0, + "step": 17346 + }, + { + "epoch": 1.9050076872391828, + "grad_norm": 1.7512328624725342, + "learning_rate": 5e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7105275392532349, + "num_tokens": 448770685.0, + "step": 17347 + }, + { + "epoch": 1.9051175049417965, + "grad_norm": 1.8523739576339722, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7321789264678955, + "num_tokens": 448794337.0, + "step": 17348 + }, + { + "epoch": 1.9052273226444103, + "grad_norm": 1.6890010833740234, + "learning_rate": 5e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7413697242736816, + "num_tokens": 448823270.0, + "step": 17349 + }, + { + "epoch": 1.905337140347024, + "grad_norm": 1.9743337631225586, + "learning_rate": 5e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7547149658203125, + "num_tokens": 448845218.0, + "step": 17350 + }, + { + "epoch": 1.9054469580496376, + "grad_norm": 1.788206696510315, + "learning_rate": 5e-06, + "loss": 0.969, + "mean_token_accuracy": 0.6995961666107178, + "num_tokens": 448873805.0, + "step": 17351 + }, + { + "epoch": 1.9055567757522511, + "grad_norm": 1.8267568349838257, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7253130078315735, + "num_tokens": 448899105.0, + "step": 17352 + }, + { + "epoch": 1.9056665934548649, + "grad_norm": 1.8535497188568115, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7196652889251709, + "num_tokens": 448926449.0, + "step": 17353 + }, + { + "epoch": 1.9057764111574786, + "grad_norm": 1.7097125053405762, + "learning_rate": 5e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7343196868896484, + "num_tokens": 448955431.0, + "step": 17354 + }, + { + "epoch": 1.9058862288600924, + "grad_norm": 1.9043813943862915, + "learning_rate": 5e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7278821468353271, + "num_tokens": 448979534.0, + "step": 17355 + }, + { + "epoch": 1.905996046562706, + "grad_norm": 2.0057883262634277, + "learning_rate": 5e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7422953844070435, + "num_tokens": 449000032.0, + "step": 17356 + }, + { + "epoch": 1.9061058642653195, + "grad_norm": 1.9459779262542725, + "learning_rate": 5e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7361956834793091, + "num_tokens": 449020607.0, + "step": 17357 + }, + { + "epoch": 1.9062156819679332, + "grad_norm": 1.7341315746307373, + "learning_rate": 5e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7133885622024536, + "num_tokens": 449049545.0, + "step": 17358 + }, + { + "epoch": 1.906325499670547, + "grad_norm": 1.8536359071731567, + "learning_rate": 5e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7306707501411438, + "num_tokens": 449072819.0, + "step": 17359 + }, + { + "epoch": 1.9064353173731605, + "grad_norm": 1.7730849981307983, + "learning_rate": 5e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7155823111534119, + "num_tokens": 449099337.0, + "step": 17360 + }, + { + "epoch": 1.906545135075774, + "grad_norm": 1.482181191444397, + "learning_rate": 5e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7143241763114929, + "num_tokens": 449134660.0, + "step": 17361 + }, + { + "epoch": 1.9066549527783878, + "grad_norm": 1.9140119552612305, + "learning_rate": 5e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7145335674285889, + "num_tokens": 449159944.0, + "step": 17362 + }, + { + "epoch": 1.9067647704810016, + "grad_norm": 2.1128644943237305, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7362923622131348, + "num_tokens": 449178265.0, + "step": 17363 + }, + { + "epoch": 1.9068745881836153, + "grad_norm": 1.795424461364746, + "learning_rate": 5e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7566491365432739, + "num_tokens": 449200425.0, + "step": 17364 + }, + { + "epoch": 1.9069844058862289, + "grad_norm": 1.6554700136184692, + "learning_rate": 5e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7368319034576416, + "num_tokens": 449227557.0, + "step": 17365 + }, + { + "epoch": 1.9070942235888424, + "grad_norm": 1.8194167613983154, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7473452687263489, + "num_tokens": 449252145.0, + "step": 17366 + }, + { + "epoch": 1.9072040412914562, + "grad_norm": 1.7449959516525269, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.738226056098938, + "num_tokens": 449279987.0, + "step": 17367 + }, + { + "epoch": 1.90731385899407, + "grad_norm": 1.895358681678772, + "learning_rate": 5e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7438154220581055, + "num_tokens": 449301361.0, + "step": 17368 + }, + { + "epoch": 1.9074236766966837, + "grad_norm": 1.6057639122009277, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7101058959960938, + "num_tokens": 449333029.0, + "step": 17369 + }, + { + "epoch": 1.9075334943992972, + "grad_norm": 1.9908775091171265, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7468134164810181, + "num_tokens": 449355105.0, + "step": 17370 + }, + { + "epoch": 1.9076433121019107, + "grad_norm": 1.9112848043441772, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7198917865753174, + "num_tokens": 449379721.0, + "step": 17371 + }, + { + "epoch": 1.9077531298045245, + "grad_norm": 1.8235739469528198, + "learning_rate": 5e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7314948439598083, + "num_tokens": 449406311.0, + "step": 17372 + }, + { + "epoch": 1.9078629475071383, + "grad_norm": 2.0241832733154297, + "learning_rate": 5e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7627178430557251, + "num_tokens": 449426496.0, + "step": 17373 + }, + { + "epoch": 1.9079727652097518, + "grad_norm": 1.716698169708252, + "learning_rate": 5e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7405315637588501, + "num_tokens": 449452894.0, + "step": 17374 + }, + { + "epoch": 1.9080825829123653, + "grad_norm": 1.8937448263168335, + "learning_rate": 5e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7430391311645508, + "num_tokens": 449476139.0, + "step": 17375 + }, + { + "epoch": 1.908192400614979, + "grad_norm": 1.8306182622909546, + "learning_rate": 5e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7347273826599121, + "num_tokens": 449501158.0, + "step": 17376 + }, + { + "epoch": 1.9083022183175928, + "grad_norm": 1.5860484838485718, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7277700304985046, + "num_tokens": 449532502.0, + "step": 17377 + }, + { + "epoch": 1.9084120360202066, + "grad_norm": 1.6410382986068726, + "learning_rate": 5e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7369803190231323, + "num_tokens": 449560255.0, + "step": 17378 + }, + { + "epoch": 1.9085218537228201, + "grad_norm": 1.9046103954315186, + "learning_rate": 5e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7343850135803223, + "num_tokens": 449583850.0, + "step": 17379 + }, + { + "epoch": 1.9086316714254337, + "grad_norm": 1.8043467998504639, + "learning_rate": 5e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7115319967269897, + "num_tokens": 449609866.0, + "step": 17380 + }, + { + "epoch": 1.9087414891280474, + "grad_norm": 2.0069985389709473, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7229321002960205, + "num_tokens": 449634366.0, + "step": 17381 + }, + { + "epoch": 1.9088513068306612, + "grad_norm": 1.8619061708450317, + "learning_rate": 5e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7239745259284973, + "num_tokens": 449659815.0, + "step": 17382 + }, + { + "epoch": 1.9089611245332747, + "grad_norm": 1.5748093128204346, + "learning_rate": 5e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7204435467720032, + "num_tokens": 449691382.0, + "step": 17383 + }, + { + "epoch": 1.9090709422358885, + "grad_norm": 1.833677887916565, + "learning_rate": 5e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7241560220718384, + "num_tokens": 449717509.0, + "step": 17384 + }, + { + "epoch": 1.909180759938502, + "grad_norm": 1.6407229900360107, + "learning_rate": 5e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7173295021057129, + "num_tokens": 449745670.0, + "step": 17385 + }, + { + "epoch": 1.9092905776411158, + "grad_norm": 1.5670394897460938, + "learning_rate": 5e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7252751588821411, + "num_tokens": 449774071.0, + "step": 17386 + }, + { + "epoch": 1.9094003953437295, + "grad_norm": 1.811698079109192, + "learning_rate": 5e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7469824552536011, + "num_tokens": 449798057.0, + "step": 17387 + }, + { + "epoch": 1.909510213046343, + "grad_norm": 1.7987221479415894, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7241104245185852, + "num_tokens": 449823191.0, + "step": 17388 + }, + { + "epoch": 1.9096200307489566, + "grad_norm": 1.9663591384887695, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7317834496498108, + "num_tokens": 449846125.0, + "step": 17389 + }, + { + "epoch": 1.9097298484515703, + "grad_norm": 1.6707782745361328, + "learning_rate": 5e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7381911873817444, + "num_tokens": 449872377.0, + "step": 17390 + }, + { + "epoch": 1.909839666154184, + "grad_norm": 2.194838285446167, + "learning_rate": 5e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7456170320510864, + "num_tokens": 449890056.0, + "step": 17391 + }, + { + "epoch": 1.9099494838567979, + "grad_norm": 1.7942498922348022, + "learning_rate": 5e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7584820985794067, + "num_tokens": 449913025.0, + "step": 17392 + }, + { + "epoch": 1.9100593015594114, + "grad_norm": 1.7569434642791748, + "learning_rate": 5e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7355912923812866, + "num_tokens": 449941346.0, + "step": 17393 + }, + { + "epoch": 1.910169119262025, + "grad_norm": 1.9427070617675781, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7540165185928345, + "num_tokens": 449962421.0, + "step": 17394 + }, + { + "epoch": 1.9102789369646387, + "grad_norm": 2.0704457759857178, + "learning_rate": 5e-06, + "loss": 0.851, + "mean_token_accuracy": 0.724909782409668, + "num_tokens": 449982826.0, + "step": 17395 + }, + { + "epoch": 1.9103887546672524, + "grad_norm": 1.8772189617156982, + "learning_rate": 5e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7451282143592834, + "num_tokens": 450008487.0, + "step": 17396 + }, + { + "epoch": 1.910498572369866, + "grad_norm": 1.9579849243164062, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7496082782745361, + "num_tokens": 450030199.0, + "step": 17397 + }, + { + "epoch": 1.9106083900724797, + "grad_norm": 1.6187865734100342, + "learning_rate": 5e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7232779264450073, + "num_tokens": 450061507.0, + "step": 17398 + }, + { + "epoch": 1.9107182077750933, + "grad_norm": 1.650835394859314, + "learning_rate": 5e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7363162040710449, + "num_tokens": 450092423.0, + "step": 17399 + }, + { + "epoch": 1.910828025477707, + "grad_norm": 1.9774327278137207, + "learning_rate": 5e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.712226152420044, + "num_tokens": 450115497.0, + "step": 17400 + }, + { + "epoch": 1.9109378431803208, + "grad_norm": 1.866721749305725, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7407698631286621, + "num_tokens": 450138613.0, + "step": 17401 + }, + { + "epoch": 1.9110476608829343, + "grad_norm": 1.7804588079452515, + "learning_rate": 5e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7490090727806091, + "num_tokens": 450163578.0, + "step": 17402 + }, + { + "epoch": 1.9111574785855479, + "grad_norm": 1.7537630796432495, + "learning_rate": 5e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7405573129653931, + "num_tokens": 450189572.0, + "step": 17403 + }, + { + "epoch": 1.9112672962881616, + "grad_norm": 1.878229022026062, + "learning_rate": 5e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7380576729774475, + "num_tokens": 450210675.0, + "step": 17404 + }, + { + "epoch": 1.9113771139907754, + "grad_norm": 1.703792929649353, + "learning_rate": 5e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7491307258605957, + "num_tokens": 450237732.0, + "step": 17405 + }, + { + "epoch": 1.9114869316933891, + "grad_norm": 1.667617678642273, + "learning_rate": 5e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.721688449382782, + "num_tokens": 450268591.0, + "step": 17406 + }, + { + "epoch": 1.9115967493960027, + "grad_norm": 1.507468819618225, + "learning_rate": 5e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7522045373916626, + "num_tokens": 450298725.0, + "step": 17407 + }, + { + "epoch": 1.9117065670986162, + "grad_norm": 1.8002376556396484, + "learning_rate": 5e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7433559894561768, + "num_tokens": 450323781.0, + "step": 17408 + }, + { + "epoch": 1.91181638480123, + "grad_norm": 1.7891796827316284, + "learning_rate": 5e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7155078053474426, + "num_tokens": 450351826.0, + "step": 17409 + }, + { + "epoch": 1.9119262025038437, + "grad_norm": 1.9779057502746582, + "learning_rate": 5e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7537564635276794, + "num_tokens": 450374196.0, + "step": 17410 + }, + { + "epoch": 1.9120360202064572, + "grad_norm": 1.714423418045044, + "learning_rate": 5e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7141640782356262, + "num_tokens": 450404148.0, + "step": 17411 + }, + { + "epoch": 1.9121458379090708, + "grad_norm": 1.8702738285064697, + "learning_rate": 5e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7334402203559875, + "num_tokens": 450429009.0, + "step": 17412 + }, + { + "epoch": 1.9122556556116845, + "grad_norm": 1.8718425035476685, + "learning_rate": 5e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7406537532806396, + "num_tokens": 450451483.0, + "step": 17413 + }, + { + "epoch": 1.9123654733142983, + "grad_norm": 1.8658946752548218, + "learning_rate": 5e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7417545318603516, + "num_tokens": 450474995.0, + "step": 17414 + }, + { + "epoch": 1.912475291016912, + "grad_norm": 1.6135929822921753, + "learning_rate": 5e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7135663032531738, + "num_tokens": 450505450.0, + "step": 17415 + }, + { + "epoch": 1.9125851087195256, + "grad_norm": 1.709682583808899, + "learning_rate": 5e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7526128888130188, + "num_tokens": 450531550.0, + "step": 17416 + }, + { + "epoch": 1.9126949264221391, + "grad_norm": 1.7236568927764893, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7271692156791687, + "num_tokens": 450560288.0, + "step": 17417 + }, + { + "epoch": 1.9128047441247529, + "grad_norm": 1.9715793132781982, + "learning_rate": 5e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7375916242599487, + "num_tokens": 450581497.0, + "step": 17418 + }, + { + "epoch": 1.9129145618273666, + "grad_norm": 1.7412216663360596, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7384072542190552, + "num_tokens": 450607793.0, + "step": 17419 + }, + { + "epoch": 1.9130243795299804, + "grad_norm": 1.630895733833313, + "learning_rate": 5e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7398749589920044, + "num_tokens": 450636411.0, + "step": 17420 + }, + { + "epoch": 1.913134197232594, + "grad_norm": 1.8949185609817505, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7264348268508911, + "num_tokens": 450660724.0, + "step": 17421 + }, + { + "epoch": 1.9132440149352075, + "grad_norm": 1.644724726676941, + "learning_rate": 5e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7069753408432007, + "num_tokens": 450693609.0, + "step": 17422 + }, + { + "epoch": 1.9133538326378212, + "grad_norm": 2.065845489501953, + "learning_rate": 5e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.736420750617981, + "num_tokens": 450716043.0, + "step": 17423 + }, + { + "epoch": 1.913463650340435, + "grad_norm": 1.739206314086914, + "learning_rate": 5e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.716931939125061, + "num_tokens": 450746619.0, + "step": 17424 + }, + { + "epoch": 1.9135734680430485, + "grad_norm": 1.850972294807434, + "learning_rate": 5e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7417763471603394, + "num_tokens": 450768787.0, + "step": 17425 + }, + { + "epoch": 1.913683285745662, + "grad_norm": 1.7678985595703125, + "learning_rate": 5e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7470372319221497, + "num_tokens": 450794837.0, + "step": 17426 + }, + { + "epoch": 1.9137931034482758, + "grad_norm": 1.7946341037750244, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7395638227462769, + "num_tokens": 450819572.0, + "step": 17427 + }, + { + "epoch": 1.9139029211508896, + "grad_norm": 1.7353415489196777, + "learning_rate": 5e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7330040335655212, + "num_tokens": 450845070.0, + "step": 17428 + }, + { + "epoch": 1.9140127388535033, + "grad_norm": 1.8034312725067139, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7327290773391724, + "num_tokens": 450869751.0, + "step": 17429 + }, + { + "epoch": 1.9141225565561168, + "grad_norm": 1.6711785793304443, + "learning_rate": 5e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7308504581451416, + "num_tokens": 450898116.0, + "step": 17430 + }, + { + "epoch": 1.9142323742587304, + "grad_norm": 1.732992172241211, + "learning_rate": 5e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7181050777435303, + "num_tokens": 450927918.0, + "step": 17431 + }, + { + "epoch": 1.9143421919613441, + "grad_norm": 1.8670711517333984, + "learning_rate": 5e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7395893931388855, + "num_tokens": 450951357.0, + "step": 17432 + }, + { + "epoch": 1.914452009663958, + "grad_norm": 1.7649465799331665, + "learning_rate": 5e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7209600210189819, + "num_tokens": 450979292.0, + "step": 17433 + }, + { + "epoch": 1.9145618273665717, + "grad_norm": 1.8895784616470337, + "learning_rate": 5e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7289525866508484, + "num_tokens": 451003527.0, + "step": 17434 + }, + { + "epoch": 1.9146716450691852, + "grad_norm": 2.1062469482421875, + "learning_rate": 5e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7396107912063599, + "num_tokens": 451023580.0, + "step": 17435 + }, + { + "epoch": 1.9147814627717987, + "grad_norm": 1.8741295337677002, + "learning_rate": 5e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7301477193832397, + "num_tokens": 451047248.0, + "step": 17436 + }, + { + "epoch": 1.9148912804744125, + "grad_norm": 1.7611143589019775, + "learning_rate": 5e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7205452919006348, + "num_tokens": 451076814.0, + "step": 17437 + }, + { + "epoch": 1.9150010981770262, + "grad_norm": 1.8866980075836182, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.742422342300415, + "num_tokens": 451098887.0, + "step": 17438 + }, + { + "epoch": 1.9151109158796398, + "grad_norm": 1.9208253622055054, + "learning_rate": 5e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.72709721326828, + "num_tokens": 451124037.0, + "step": 17439 + }, + { + "epoch": 1.9152207335822533, + "grad_norm": 1.9935108423233032, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7422905564308167, + "num_tokens": 451144595.0, + "step": 17440 + }, + { + "epoch": 1.915330551284867, + "grad_norm": 1.903615951538086, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7420870065689087, + "num_tokens": 451167834.0, + "step": 17441 + }, + { + "epoch": 1.9154403689874808, + "grad_norm": 1.6217172145843506, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7147560119628906, + "num_tokens": 451198516.0, + "step": 17442 + }, + { + "epoch": 1.9155501866900946, + "grad_norm": 1.819555401802063, + "learning_rate": 5e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7300978899002075, + "num_tokens": 451223935.0, + "step": 17443 + }, + { + "epoch": 1.9156600043927081, + "grad_norm": 1.8910167217254639, + "learning_rate": 5e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7275360822677612, + "num_tokens": 451248113.0, + "step": 17444 + }, + { + "epoch": 1.9157698220953217, + "grad_norm": 1.984660267829895, + "learning_rate": 5e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7503257989883423, + "num_tokens": 451270434.0, + "step": 17445 + }, + { + "epoch": 1.9158796397979354, + "grad_norm": 1.9343024492263794, + "learning_rate": 5e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7205190658569336, + "num_tokens": 451294615.0, + "step": 17446 + }, + { + "epoch": 1.9159894575005492, + "grad_norm": 1.9356611967086792, + "learning_rate": 5e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7159556746482849, + "num_tokens": 451318177.0, + "step": 17447 + }, + { + "epoch": 1.9160992752031627, + "grad_norm": 1.8090335130691528, + "learning_rate": 5e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.713214635848999, + "num_tokens": 451345117.0, + "step": 17448 + }, + { + "epoch": 1.9162090929057765, + "grad_norm": 1.8714104890823364, + "learning_rate": 5e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7550815343856812, + "num_tokens": 451368880.0, + "step": 17449 + }, + { + "epoch": 1.91631891060839, + "grad_norm": 1.983302354812622, + "learning_rate": 5e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.714566171169281, + "num_tokens": 451390625.0, + "step": 17450 + }, + { + "epoch": 1.9164287283110037, + "grad_norm": 1.9120335578918457, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7262020707130432, + "num_tokens": 451413279.0, + "step": 17451 + }, + { + "epoch": 1.9165385460136175, + "grad_norm": 1.8575897216796875, + "learning_rate": 5e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7370196580886841, + "num_tokens": 451435634.0, + "step": 17452 + }, + { + "epoch": 1.916648363716231, + "grad_norm": 1.7249839305877686, + "learning_rate": 5e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.708854079246521, + "num_tokens": 451464690.0, + "step": 17453 + }, + { + "epoch": 1.9167581814188446, + "grad_norm": 1.6379882097244263, + "learning_rate": 5e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7335201501846313, + "num_tokens": 451492647.0, + "step": 17454 + }, + { + "epoch": 1.9168679991214583, + "grad_norm": 2.049295425415039, + "learning_rate": 5e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7320070266723633, + "num_tokens": 451514386.0, + "step": 17455 + }, + { + "epoch": 1.916977816824072, + "grad_norm": 1.7280850410461426, + "learning_rate": 5e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.731005072593689, + "num_tokens": 451539082.0, + "step": 17456 + }, + { + "epoch": 1.9170876345266858, + "grad_norm": 1.75678551197052, + "learning_rate": 5e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7345826625823975, + "num_tokens": 451567628.0, + "step": 17457 + }, + { + "epoch": 1.9171974522292994, + "grad_norm": 1.8055956363677979, + "learning_rate": 5e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7181518077850342, + "num_tokens": 451596485.0, + "step": 17458 + }, + { + "epoch": 1.917307269931913, + "grad_norm": 2.006009578704834, + "learning_rate": 5e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7339526414871216, + "num_tokens": 451621474.0, + "step": 17459 + }, + { + "epoch": 1.9174170876345267, + "grad_norm": 1.788099765777588, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7452757358551025, + "num_tokens": 451646166.0, + "step": 17460 + }, + { + "epoch": 1.9175269053371404, + "grad_norm": 2.1680667400360107, + "learning_rate": 5e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7349504828453064, + "num_tokens": 451664385.0, + "step": 17461 + }, + { + "epoch": 1.917636723039754, + "grad_norm": 1.709085464477539, + "learning_rate": 5e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7276972532272339, + "num_tokens": 451690915.0, + "step": 17462 + }, + { + "epoch": 1.9177465407423677, + "grad_norm": 2.065636157989502, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7360585927963257, + "num_tokens": 451712435.0, + "step": 17463 + }, + { + "epoch": 1.9178563584449813, + "grad_norm": 1.7827967405319214, + "learning_rate": 5e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7462078928947449, + "num_tokens": 451738040.0, + "step": 17464 + }, + { + "epoch": 1.917966176147595, + "grad_norm": 1.9753586053848267, + "learning_rate": 5e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7373645305633545, + "num_tokens": 451760379.0, + "step": 17465 + }, + { + "epoch": 1.9180759938502088, + "grad_norm": 1.6440993547439575, + "learning_rate": 5e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7283511757850647, + "num_tokens": 451790146.0, + "step": 17466 + }, + { + "epoch": 1.9181858115528223, + "grad_norm": 1.5767868757247925, + "learning_rate": 5e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7103856801986694, + "num_tokens": 451821864.0, + "step": 17467 + }, + { + "epoch": 1.9182956292554358, + "grad_norm": 1.7246325016021729, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.727792501449585, + "num_tokens": 451848551.0, + "step": 17468 + }, + { + "epoch": 1.9184054469580496, + "grad_norm": 1.4303083419799805, + "learning_rate": 5e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7314633131027222, + "num_tokens": 451882825.0, + "step": 17469 + }, + { + "epoch": 1.9185152646606634, + "grad_norm": 1.7479323148727417, + "learning_rate": 5e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7032092213630676, + "num_tokens": 451912681.0, + "step": 17470 + }, + { + "epoch": 1.9186250823632771, + "grad_norm": 1.5706373453140259, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7169527411460876, + "num_tokens": 451941881.0, + "step": 17471 + }, + { + "epoch": 1.9187349000658906, + "grad_norm": 2.2423253059387207, + "learning_rate": 5e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.7513322234153748, + "num_tokens": 451958239.0, + "step": 17472 + }, + { + "epoch": 1.9188447177685042, + "grad_norm": 1.6509824991226196, + "learning_rate": 5e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7358357310295105, + "num_tokens": 451988376.0, + "step": 17473 + }, + { + "epoch": 1.918954535471118, + "grad_norm": 1.7291854619979858, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7274132966995239, + "num_tokens": 452015172.0, + "step": 17474 + }, + { + "epoch": 1.9190643531737317, + "grad_norm": 1.6074362993240356, + "learning_rate": 5e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6921172738075256, + "num_tokens": 452050424.0, + "step": 17475 + }, + { + "epoch": 1.9191741708763452, + "grad_norm": 1.8262302875518799, + "learning_rate": 5e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7178837656974792, + "num_tokens": 452074310.0, + "step": 17476 + }, + { + "epoch": 1.9192839885789588, + "grad_norm": 1.7005059719085693, + "learning_rate": 5e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7137674689292908, + "num_tokens": 452106294.0, + "step": 17477 + }, + { + "epoch": 1.9193938062815725, + "grad_norm": 1.7891312837600708, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7350014448165894, + "num_tokens": 452131497.0, + "step": 17478 + }, + { + "epoch": 1.9195036239841863, + "grad_norm": 1.826900601387024, + "learning_rate": 5e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7391799688339233, + "num_tokens": 452155625.0, + "step": 17479 + }, + { + "epoch": 1.9196134416868, + "grad_norm": 1.7353748083114624, + "learning_rate": 5e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7239940166473389, + "num_tokens": 452183305.0, + "step": 17480 + }, + { + "epoch": 1.9197232593894136, + "grad_norm": 1.835089921951294, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7232299447059631, + "num_tokens": 452208657.0, + "step": 17481 + }, + { + "epoch": 1.919833077092027, + "grad_norm": 1.7217912673950195, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.725318431854248, + "num_tokens": 452236970.0, + "step": 17482 + }, + { + "epoch": 1.9199428947946409, + "grad_norm": 1.8803077936172485, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7369245290756226, + "num_tokens": 452261927.0, + "step": 17483 + }, + { + "epoch": 1.9200527124972546, + "grad_norm": 1.8113113641738892, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7495843172073364, + "num_tokens": 452287639.0, + "step": 17484 + }, + { + "epoch": 1.9201625301998684, + "grad_norm": 1.9684758186340332, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7245694398880005, + "num_tokens": 452310119.0, + "step": 17485 + }, + { + "epoch": 1.920272347902482, + "grad_norm": 1.5042144060134888, + "learning_rate": 5e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7153006792068481, + "num_tokens": 452345802.0, + "step": 17486 + }, + { + "epoch": 1.9203821656050954, + "grad_norm": 1.8679133653640747, + "learning_rate": 5e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7359732389450073, + "num_tokens": 452370375.0, + "step": 17487 + }, + { + "epoch": 1.9204919833077092, + "grad_norm": 1.6334048509597778, + "learning_rate": 5e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.730790376663208, + "num_tokens": 452398409.0, + "step": 17488 + }, + { + "epoch": 1.920601801010323, + "grad_norm": 1.904453158378601, + "learning_rate": 5e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7210524082183838, + "num_tokens": 452421635.0, + "step": 17489 + }, + { + "epoch": 1.9207116187129365, + "grad_norm": 1.756850004196167, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7153996229171753, + "num_tokens": 452448012.0, + "step": 17490 + }, + { + "epoch": 1.92082143641555, + "grad_norm": 1.8514031171798706, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7405287623405457, + "num_tokens": 452471008.0, + "step": 17491 + }, + { + "epoch": 1.9209312541181638, + "grad_norm": 1.9142194986343384, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7460334300994873, + "num_tokens": 452493293.0, + "step": 17492 + }, + { + "epoch": 1.9210410718207775, + "grad_norm": 1.6707342863082886, + "learning_rate": 5e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7256134152412415, + "num_tokens": 452523035.0, + "step": 17493 + }, + { + "epoch": 1.9211508895233913, + "grad_norm": 1.87190580368042, + "learning_rate": 5e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7211896181106567, + "num_tokens": 452548855.0, + "step": 17494 + }, + { + "epoch": 1.9212607072260048, + "grad_norm": 1.6351324319839478, + "learning_rate": 5e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7010173201560974, + "num_tokens": 452582715.0, + "step": 17495 + }, + { + "epoch": 1.9213705249286184, + "grad_norm": 1.8315635919570923, + "learning_rate": 5e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7246999740600586, + "num_tokens": 452606765.0, + "step": 17496 + }, + { + "epoch": 1.9214803426312321, + "grad_norm": 1.6429136991500854, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7334775328636169, + "num_tokens": 452634983.0, + "step": 17497 + }, + { + "epoch": 1.9215901603338459, + "grad_norm": 1.8894877433776855, + "learning_rate": 5e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7430591583251953, + "num_tokens": 452659008.0, + "step": 17498 + }, + { + "epoch": 1.9216999780364594, + "grad_norm": 1.7046875953674316, + "learning_rate": 5e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7522169351577759, + "num_tokens": 452685878.0, + "step": 17499 + }, + { + "epoch": 1.9218097957390732, + "grad_norm": 1.9550725221633911, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7223673462867737, + "num_tokens": 452707909.0, + "step": 17500 + }, + { + "epoch": 1.9219196134416867, + "grad_norm": 1.7057644128799438, + "learning_rate": 5e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7126767039299011, + "num_tokens": 452737867.0, + "step": 17501 + }, + { + "epoch": 1.9220294311443005, + "grad_norm": 1.9210014343261719, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7494427561759949, + "num_tokens": 452759078.0, + "step": 17502 + }, + { + "epoch": 1.9221392488469142, + "grad_norm": 1.8124865293502808, + "learning_rate": 5e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7625765800476074, + "num_tokens": 452781289.0, + "step": 17503 + }, + { + "epoch": 1.9222490665495278, + "grad_norm": 1.863808274269104, + "learning_rate": 5e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.729341447353363, + "num_tokens": 452807182.0, + "step": 17504 + }, + { + "epoch": 1.9223588842521413, + "grad_norm": 1.9181599617004395, + "learning_rate": 5e-06, + "loss": 0.866, + "mean_token_accuracy": 0.730523943901062, + "num_tokens": 452828911.0, + "step": 17505 + }, + { + "epoch": 1.922468701954755, + "grad_norm": 1.8894166946411133, + "learning_rate": 5e-06, + "loss": 0.782, + "mean_token_accuracy": 0.744103729724884, + "num_tokens": 452853610.0, + "step": 17506 + }, + { + "epoch": 1.9225785196573688, + "grad_norm": 1.9567210674285889, + "learning_rate": 5e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7444332838058472, + "num_tokens": 452875107.0, + "step": 17507 + }, + { + "epoch": 1.9226883373599826, + "grad_norm": 1.8113130331039429, + "learning_rate": 5e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.708149790763855, + "num_tokens": 452905745.0, + "step": 17508 + }, + { + "epoch": 1.922798155062596, + "grad_norm": 2.1232268810272217, + "learning_rate": 5e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7496756315231323, + "num_tokens": 452925499.0, + "step": 17509 + }, + { + "epoch": 1.9229079727652096, + "grad_norm": 1.959511160850525, + "learning_rate": 5e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7435474991798401, + "num_tokens": 452947604.0, + "step": 17510 + }, + { + "epoch": 1.9230177904678234, + "grad_norm": 2.233372211456299, + "learning_rate": 5e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7537779211997986, + "num_tokens": 452964519.0, + "step": 17511 + }, + { + "epoch": 1.9231276081704372, + "grad_norm": 1.8137946128845215, + "learning_rate": 5e-06, + "loss": 0.967, + "mean_token_accuracy": 0.6958852410316467, + "num_tokens": 452992732.0, + "step": 17512 + }, + { + "epoch": 1.9232374258730507, + "grad_norm": 1.8627344369888306, + "learning_rate": 5e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7321084141731262, + "num_tokens": 453017773.0, + "step": 17513 + }, + { + "epoch": 1.9233472435756644, + "grad_norm": 1.9257400035858154, + "learning_rate": 5e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7351528406143188, + "num_tokens": 453042371.0, + "step": 17514 + }, + { + "epoch": 1.923457061278278, + "grad_norm": 1.7792435884475708, + "learning_rate": 5e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7529977560043335, + "num_tokens": 453066487.0, + "step": 17515 + }, + { + "epoch": 1.9235668789808917, + "grad_norm": 1.807796835899353, + "learning_rate": 5e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7140852808952332, + "num_tokens": 453090895.0, + "step": 17516 + }, + { + "epoch": 1.9236766966835055, + "grad_norm": 1.7239794731140137, + "learning_rate": 5e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7477192878723145, + "num_tokens": 453115759.0, + "step": 17517 + }, + { + "epoch": 1.923786514386119, + "grad_norm": 1.7960354089736938, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7561813592910767, + "num_tokens": 453139911.0, + "step": 17518 + }, + { + "epoch": 1.9238963320887326, + "grad_norm": 1.983521819114685, + "learning_rate": 5e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7275184392929077, + "num_tokens": 453162770.0, + "step": 17519 + }, + { + "epoch": 1.9240061497913463, + "grad_norm": 1.8131970167160034, + "learning_rate": 5e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7599265575408936, + "num_tokens": 453186909.0, + "step": 17520 + }, + { + "epoch": 1.92411596749396, + "grad_norm": 1.646714210510254, + "learning_rate": 5e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7264631986618042, + "num_tokens": 453218054.0, + "step": 17521 + }, + { + "epoch": 1.9242257851965738, + "grad_norm": 1.8657357692718506, + "learning_rate": 5e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7365763187408447, + "num_tokens": 453244360.0, + "step": 17522 + }, + { + "epoch": 1.9243356028991874, + "grad_norm": 1.7787855863571167, + "learning_rate": 5e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7397737503051758, + "num_tokens": 453270702.0, + "step": 17523 + }, + { + "epoch": 1.924445420601801, + "grad_norm": 1.8956576585769653, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7433019280433655, + "num_tokens": 453292279.0, + "step": 17524 + }, + { + "epoch": 1.9245552383044147, + "grad_norm": 1.8104819059371948, + "learning_rate": 5e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7364636659622192, + "num_tokens": 453315528.0, + "step": 17525 + }, + { + "epoch": 1.9246650560070284, + "grad_norm": 2.097205877304077, + "learning_rate": 5e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7180854082107544, + "num_tokens": 453336922.0, + "step": 17526 + }, + { + "epoch": 1.924774873709642, + "grad_norm": 1.8407264947891235, + "learning_rate": 5e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7190483212471008, + "num_tokens": 453365005.0, + "step": 17527 + }, + { + "epoch": 1.9248846914122557, + "grad_norm": 1.983275055885315, + "learning_rate": 5e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7250781655311584, + "num_tokens": 453387441.0, + "step": 17528 + }, + { + "epoch": 1.9249945091148692, + "grad_norm": 1.7552194595336914, + "learning_rate": 5e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7243388891220093, + "num_tokens": 453415434.0, + "step": 17529 + }, + { + "epoch": 1.925104326817483, + "grad_norm": 1.8965831995010376, + "learning_rate": 5e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7295199036598206, + "num_tokens": 453438695.0, + "step": 17530 + }, + { + "epoch": 1.9252141445200968, + "grad_norm": 1.9147233963012695, + "learning_rate": 5e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.7391021847724915, + "num_tokens": 453462487.0, + "step": 17531 + }, + { + "epoch": 1.9253239622227103, + "grad_norm": 1.9400620460510254, + "learning_rate": 5e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7468615770339966, + "num_tokens": 453483032.0, + "step": 17532 + }, + { + "epoch": 1.9254337799253238, + "grad_norm": 2.199230432510376, + "learning_rate": 5e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7471153736114502, + "num_tokens": 453501509.0, + "step": 17533 + }, + { + "epoch": 1.9255435976279376, + "grad_norm": 1.7463688850402832, + "learning_rate": 5e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7570788860321045, + "num_tokens": 453525716.0, + "step": 17534 + }, + { + "epoch": 1.9256534153305513, + "grad_norm": 1.5894581079483032, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7261673212051392, + "num_tokens": 453555424.0, + "step": 17535 + }, + { + "epoch": 1.925763233033165, + "grad_norm": 1.5950106382369995, + "learning_rate": 5e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7289897203445435, + "num_tokens": 453586602.0, + "step": 17536 + }, + { + "epoch": 1.9258730507357786, + "grad_norm": 2.0950582027435303, + "learning_rate": 5e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7087645530700684, + "num_tokens": 453609593.0, + "step": 17537 + }, + { + "epoch": 1.9259828684383922, + "grad_norm": 1.6704787015914917, + "learning_rate": 5e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7044006586074829, + "num_tokens": 453643324.0, + "step": 17538 + }, + { + "epoch": 1.926092686141006, + "grad_norm": 1.6131097078323364, + "learning_rate": 5e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7033090591430664, + "num_tokens": 453674269.0, + "step": 17539 + }, + { + "epoch": 1.9262025038436197, + "grad_norm": 1.7104649543762207, + "learning_rate": 5e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7199418544769287, + "num_tokens": 453702377.0, + "step": 17540 + }, + { + "epoch": 1.9263123215462332, + "grad_norm": 1.7512097358703613, + "learning_rate": 5e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7388687133789062, + "num_tokens": 453726888.0, + "step": 17541 + }, + { + "epoch": 1.9264221392488468, + "grad_norm": 1.9000619649887085, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.749306321144104, + "num_tokens": 453750114.0, + "step": 17542 + }, + { + "epoch": 1.9265319569514605, + "grad_norm": 1.5606809854507446, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7114938497543335, + "num_tokens": 453784672.0, + "step": 17543 + }, + { + "epoch": 1.9266417746540743, + "grad_norm": 1.663169026374817, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7413064241409302, + "num_tokens": 453813549.0, + "step": 17544 + }, + { + "epoch": 1.926751592356688, + "grad_norm": 1.8018414974212646, + "learning_rate": 5e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7221118211746216, + "num_tokens": 453839233.0, + "step": 17545 + }, + { + "epoch": 1.9268614100593016, + "grad_norm": 1.5649518966674805, + "learning_rate": 5e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7225000858306885, + "num_tokens": 453869435.0, + "step": 17546 + }, + { + "epoch": 1.926971227761915, + "grad_norm": 1.557254433631897, + "learning_rate": 5e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7399944067001343, + "num_tokens": 453897794.0, + "step": 17547 + }, + { + "epoch": 1.9270810454645289, + "grad_norm": 1.736660122871399, + "learning_rate": 5e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7305366396903992, + "num_tokens": 453923921.0, + "step": 17548 + }, + { + "epoch": 1.9271908631671426, + "grad_norm": 1.833896517753601, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7281317710876465, + "num_tokens": 453949782.0, + "step": 17549 + }, + { + "epoch": 1.9273006808697564, + "grad_norm": 1.681841492652893, + "learning_rate": 5e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.684259831905365, + "num_tokens": 453982469.0, + "step": 17550 + }, + { + "epoch": 1.92741049857237, + "grad_norm": 1.9851665496826172, + "learning_rate": 5e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7321571707725525, + "num_tokens": 454004537.0, + "step": 17551 + }, + { + "epoch": 1.9275203162749834, + "grad_norm": 1.7174278497695923, + "learning_rate": 5e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7373825311660767, + "num_tokens": 454031146.0, + "step": 17552 + }, + { + "epoch": 1.9276301339775972, + "grad_norm": 1.5931161642074585, + "learning_rate": 5e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7491798400878906, + "num_tokens": 454059312.0, + "step": 17553 + }, + { + "epoch": 1.927739951680211, + "grad_norm": 1.8986917734146118, + "learning_rate": 5e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.729739248752594, + "num_tokens": 454081390.0, + "step": 17554 + }, + { + "epoch": 1.9278497693828245, + "grad_norm": 1.9296178817749023, + "learning_rate": 5e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7255281209945679, + "num_tokens": 454105938.0, + "step": 17555 + }, + { + "epoch": 1.927959587085438, + "grad_norm": 1.5552562475204468, + "learning_rate": 5e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7089115381240845, + "num_tokens": 454142655.0, + "step": 17556 + }, + { + "epoch": 1.9280694047880518, + "grad_norm": 1.7446867227554321, + "learning_rate": 5e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7322397232055664, + "num_tokens": 454171260.0, + "step": 17557 + }, + { + "epoch": 1.9281792224906655, + "grad_norm": 1.9255918264389038, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7219929695129395, + "num_tokens": 454194115.0, + "step": 17558 + }, + { + "epoch": 1.9282890401932793, + "grad_norm": 1.8818202018737793, + "learning_rate": 5e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7211253643035889, + "num_tokens": 454217532.0, + "step": 17559 + }, + { + "epoch": 1.9283988578958928, + "grad_norm": 1.8730453252792358, + "learning_rate": 5e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7131316661834717, + "num_tokens": 454240287.0, + "step": 17560 + }, + { + "epoch": 1.9285086755985064, + "grad_norm": 1.756299376487732, + "learning_rate": 5e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7095307111740112, + "num_tokens": 454270473.0, + "step": 17561 + }, + { + "epoch": 1.9286184933011201, + "grad_norm": 1.9334877729415894, + "learning_rate": 5e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7416149377822876, + "num_tokens": 454292380.0, + "step": 17562 + }, + { + "epoch": 1.9287283110037339, + "grad_norm": 2.0250489711761475, + "learning_rate": 5e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7288366556167603, + "num_tokens": 454313842.0, + "step": 17563 + }, + { + "epoch": 1.9288381287063474, + "grad_norm": 1.6893293857574463, + "learning_rate": 5e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7101790904998779, + "num_tokens": 454345359.0, + "step": 17564 + }, + { + "epoch": 1.9289479464089612, + "grad_norm": 1.8524727821350098, + "learning_rate": 5e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7268823981285095, + "num_tokens": 454370467.0, + "step": 17565 + }, + { + "epoch": 1.9290577641115747, + "grad_norm": 1.5837833881378174, + "learning_rate": 5e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7087277770042419, + "num_tokens": 454399666.0, + "step": 17566 + }, + { + "epoch": 1.9291675818141885, + "grad_norm": 1.80618155002594, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7582724094390869, + "num_tokens": 454422095.0, + "step": 17567 + }, + { + "epoch": 1.9292773995168022, + "grad_norm": 1.743051528930664, + "learning_rate": 5e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7324158549308777, + "num_tokens": 454449221.0, + "step": 17568 + }, + { + "epoch": 1.9293872172194158, + "grad_norm": 1.6842679977416992, + "learning_rate": 5e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7314082384109497, + "num_tokens": 454475271.0, + "step": 17569 + }, + { + "epoch": 1.9294970349220293, + "grad_norm": 1.8444762229919434, + "learning_rate": 5e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7382453083992004, + "num_tokens": 454500052.0, + "step": 17570 + }, + { + "epoch": 1.929606852624643, + "grad_norm": 1.7302212715148926, + "learning_rate": 5e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7269435524940491, + "num_tokens": 454528604.0, + "step": 17571 + }, + { + "epoch": 1.9297166703272568, + "grad_norm": 1.8890750408172607, + "learning_rate": 5e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.739892840385437, + "num_tokens": 454551395.0, + "step": 17572 + }, + { + "epoch": 1.9298264880298706, + "grad_norm": 1.7348588705062866, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7379278540611267, + "num_tokens": 454577323.0, + "step": 17573 + }, + { + "epoch": 1.929936305732484, + "grad_norm": 1.7418532371520996, + "learning_rate": 5e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7298181056976318, + "num_tokens": 454603063.0, + "step": 17574 + }, + { + "epoch": 1.9300461234350976, + "grad_norm": 1.8217968940734863, + "learning_rate": 5e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.715319037437439, + "num_tokens": 454626983.0, + "step": 17575 + }, + { + "epoch": 1.9301559411377114, + "grad_norm": 1.9311089515686035, + "learning_rate": 5e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7206263542175293, + "num_tokens": 454649786.0, + "step": 17576 + }, + { + "epoch": 1.9302657588403251, + "grad_norm": 1.7248964309692383, + "learning_rate": 5e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7113844752311707, + "num_tokens": 454677321.0, + "step": 17577 + }, + { + "epoch": 1.9303755765429387, + "grad_norm": 1.4789241552352905, + "learning_rate": 5e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7168716192245483, + "num_tokens": 454715100.0, + "step": 17578 + }, + { + "epoch": 1.9304853942455524, + "grad_norm": 1.6847031116485596, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7400417327880859, + "num_tokens": 454744630.0, + "step": 17579 + }, + { + "epoch": 1.930595211948166, + "grad_norm": 1.7251120805740356, + "learning_rate": 5e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7417094707489014, + "num_tokens": 454772845.0, + "step": 17580 + }, + { + "epoch": 1.9307050296507797, + "grad_norm": 1.7055951356887817, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7287131547927856, + "num_tokens": 454801126.0, + "step": 17581 + }, + { + "epoch": 1.9308148473533935, + "grad_norm": 1.7855397462844849, + "learning_rate": 5e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7541471123695374, + "num_tokens": 454825326.0, + "step": 17582 + }, + { + "epoch": 1.930924665056007, + "grad_norm": 1.5162463188171387, + "learning_rate": 5e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7174459099769592, + "num_tokens": 454859007.0, + "step": 17583 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 1.5457453727722168, + "learning_rate": 5e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7131290435791016, + "num_tokens": 454892326.0, + "step": 17584 + }, + { + "epoch": 1.9311443004612343, + "grad_norm": 1.7522372007369995, + "learning_rate": 5e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7343810200691223, + "num_tokens": 454917379.0, + "step": 17585 + }, + { + "epoch": 1.931254118163848, + "grad_norm": 1.8113577365875244, + "learning_rate": 5e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7432316541671753, + "num_tokens": 454940407.0, + "step": 17586 + }, + { + "epoch": 1.9313639358664618, + "grad_norm": 1.8166085481643677, + "learning_rate": 5e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7190468907356262, + "num_tokens": 454965882.0, + "step": 17587 + }, + { + "epoch": 1.9314737535690754, + "grad_norm": 1.680357575416565, + "learning_rate": 5e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7040934562683105, + "num_tokens": 454995071.0, + "step": 17588 + }, + { + "epoch": 1.931583571271689, + "grad_norm": 1.6264636516571045, + "learning_rate": 5e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7502877116203308, + "num_tokens": 455024708.0, + "step": 17589 + }, + { + "epoch": 1.9316933889743026, + "grad_norm": 1.8231346607208252, + "learning_rate": 5e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7119585275650024, + "num_tokens": 455051114.0, + "step": 17590 + }, + { + "epoch": 1.9318032066769164, + "grad_norm": 1.903985857963562, + "learning_rate": 5e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.715623140335083, + "num_tokens": 455074360.0, + "step": 17591 + }, + { + "epoch": 1.93191302437953, + "grad_norm": 1.6714609861373901, + "learning_rate": 5e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7646472454071045, + "num_tokens": 455099039.0, + "step": 17592 + }, + { + "epoch": 1.9320228420821435, + "grad_norm": 1.5365122556686401, + "learning_rate": 5e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7302775382995605, + "num_tokens": 455130219.0, + "step": 17593 + }, + { + "epoch": 1.9321326597847572, + "grad_norm": 1.848434329032898, + "learning_rate": 5e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.721107006072998, + "num_tokens": 455154313.0, + "step": 17594 + }, + { + "epoch": 1.932242477487371, + "grad_norm": 1.7603422403335571, + "learning_rate": 5e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7301334142684937, + "num_tokens": 455182951.0, + "step": 17595 + }, + { + "epoch": 1.9323522951899847, + "grad_norm": 1.8298513889312744, + "learning_rate": 5e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.728480339050293, + "num_tokens": 455208154.0, + "step": 17596 + }, + { + "epoch": 1.9324621128925983, + "grad_norm": 1.769021987915039, + "learning_rate": 5e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.729913592338562, + "num_tokens": 455233148.0, + "step": 17597 + }, + { + "epoch": 1.9325719305952118, + "grad_norm": 1.7852269411087036, + "learning_rate": 5e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7368165254592896, + "num_tokens": 455258551.0, + "step": 17598 + }, + { + "epoch": 1.9326817482978256, + "grad_norm": 1.6615062952041626, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7633668184280396, + "num_tokens": 455286545.0, + "step": 17599 + }, + { + "epoch": 1.9327915660004393, + "grad_norm": 1.7583189010620117, + "learning_rate": 5e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7221858501434326, + "num_tokens": 455313193.0, + "step": 17600 + }, + { + "epoch": 1.932901383703053, + "grad_norm": 1.9316229820251465, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7372567057609558, + "num_tokens": 455333670.0, + "step": 17601 + }, + { + "epoch": 1.9330112014056666, + "grad_norm": 1.680383324623108, + "learning_rate": 5e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.750324010848999, + "num_tokens": 455359237.0, + "step": 17602 + }, + { + "epoch": 1.9331210191082802, + "grad_norm": 1.9248254299163818, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7259809970855713, + "num_tokens": 455383118.0, + "step": 17603 + }, + { + "epoch": 1.933230836810894, + "grad_norm": 1.7119032144546509, + "learning_rate": 5e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7137157917022705, + "num_tokens": 455412978.0, + "step": 17604 + }, + { + "epoch": 1.9333406545135077, + "grad_norm": 1.7910170555114746, + "learning_rate": 5e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7490037083625793, + "num_tokens": 455436454.0, + "step": 17605 + }, + { + "epoch": 1.9334504722161212, + "grad_norm": 1.8654582500457764, + "learning_rate": 5e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7189536094665527, + "num_tokens": 455461343.0, + "step": 17606 + }, + { + "epoch": 1.9335602899187347, + "grad_norm": 1.645266056060791, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7285782694816589, + "num_tokens": 455491116.0, + "step": 17607 + }, + { + "epoch": 1.9336701076213485, + "grad_norm": 1.9820948839187622, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7301836013793945, + "num_tokens": 455512485.0, + "step": 17608 + }, + { + "epoch": 1.9337799253239623, + "grad_norm": 1.9552282094955444, + "learning_rate": 5e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7491361498832703, + "num_tokens": 455535084.0, + "step": 17609 + }, + { + "epoch": 1.933889743026576, + "grad_norm": 1.8320245742797852, + "learning_rate": 5e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7217687368392944, + "num_tokens": 455561897.0, + "step": 17610 + }, + { + "epoch": 1.9339995607291895, + "grad_norm": 1.7897990942001343, + "learning_rate": 5e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7234361171722412, + "num_tokens": 455587266.0, + "step": 17611 + }, + { + "epoch": 1.934109378431803, + "grad_norm": 1.7976027727127075, + "learning_rate": 5e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7070913910865784, + "num_tokens": 455613993.0, + "step": 17612 + }, + { + "epoch": 1.9342191961344168, + "grad_norm": 2.1657309532165527, + "learning_rate": 5e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.742778480052948, + "num_tokens": 455633581.0, + "step": 17613 + }, + { + "epoch": 1.9343290138370306, + "grad_norm": 1.770070195198059, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7398212552070618, + "num_tokens": 455657576.0, + "step": 17614 + }, + { + "epoch": 1.9344388315396444, + "grad_norm": 1.681262731552124, + "learning_rate": 5e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7508513927459717, + "num_tokens": 455682690.0, + "step": 17615 + }, + { + "epoch": 1.934548649242258, + "grad_norm": 1.8905012607574463, + "learning_rate": 5e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7451299428939819, + "num_tokens": 455704947.0, + "step": 17616 + }, + { + "epoch": 1.9346584669448714, + "grad_norm": 1.8158403635025024, + "learning_rate": 5e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7584865689277649, + "num_tokens": 455728485.0, + "step": 17617 + }, + { + "epoch": 1.9347682846474852, + "grad_norm": 1.7018629312515259, + "learning_rate": 5e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7256877422332764, + "num_tokens": 455755815.0, + "step": 17618 + }, + { + "epoch": 1.934878102350099, + "grad_norm": 1.8415395021438599, + "learning_rate": 5e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6954675912857056, + "num_tokens": 455781742.0, + "step": 17619 + }, + { + "epoch": 1.9349879200527125, + "grad_norm": 1.8155242204666138, + "learning_rate": 5e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7427217960357666, + "num_tokens": 455807841.0, + "step": 17620 + }, + { + "epoch": 1.935097737755326, + "grad_norm": 1.8252105712890625, + "learning_rate": 5e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7339930534362793, + "num_tokens": 455833844.0, + "step": 17621 + }, + { + "epoch": 1.9352075554579398, + "grad_norm": 1.7736228704452515, + "learning_rate": 5e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7524048089981079, + "num_tokens": 455857256.0, + "step": 17622 + }, + { + "epoch": 1.9353173731605535, + "grad_norm": 1.8872640132904053, + "learning_rate": 5e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7300539612770081, + "num_tokens": 455880693.0, + "step": 17623 + }, + { + "epoch": 1.9354271908631673, + "grad_norm": 1.7021194696426392, + "learning_rate": 5e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7260547876358032, + "num_tokens": 455909631.0, + "step": 17624 + }, + { + "epoch": 1.9355370085657808, + "grad_norm": 1.6612282991409302, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7385745048522949, + "num_tokens": 455937979.0, + "step": 17625 + }, + { + "epoch": 1.9356468262683943, + "grad_norm": 2.0131802558898926, + "learning_rate": 5e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7546336650848389, + "num_tokens": 455957772.0, + "step": 17626 + }, + { + "epoch": 1.935756643971008, + "grad_norm": 1.6477570533752441, + "learning_rate": 5e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7148221731185913, + "num_tokens": 455990997.0, + "step": 17627 + }, + { + "epoch": 1.9358664616736219, + "grad_norm": 1.8407299518585205, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.713054358959198, + "num_tokens": 456016966.0, + "step": 17628 + }, + { + "epoch": 1.9359762793762354, + "grad_norm": 1.729464054107666, + "learning_rate": 5e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.731862485408783, + "num_tokens": 456045308.0, + "step": 17629 + }, + { + "epoch": 1.9360860970788492, + "grad_norm": 1.6310805082321167, + "learning_rate": 5e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7153768539428711, + "num_tokens": 456075047.0, + "step": 17630 + }, + { + "epoch": 1.9361959147814627, + "grad_norm": 1.9318339824676514, + "learning_rate": 5e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7316120862960815, + "num_tokens": 456099792.0, + "step": 17631 + }, + { + "epoch": 1.9363057324840764, + "grad_norm": 1.8533320426940918, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7521042823791504, + "num_tokens": 456124882.0, + "step": 17632 + }, + { + "epoch": 1.9364155501866902, + "grad_norm": 1.5756611824035645, + "learning_rate": 5e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7367956638336182, + "num_tokens": 456153705.0, + "step": 17633 + }, + { + "epoch": 1.9365253678893037, + "grad_norm": 1.7004226446151733, + "learning_rate": 5e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7205793261528015, + "num_tokens": 456181194.0, + "step": 17634 + }, + { + "epoch": 1.9366351855919173, + "grad_norm": 1.8681998252868652, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7378481030464172, + "num_tokens": 456204634.0, + "step": 17635 + }, + { + "epoch": 1.936745003294531, + "grad_norm": 1.6598711013793945, + "learning_rate": 5e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7351250648498535, + "num_tokens": 456233510.0, + "step": 17636 + }, + { + "epoch": 1.9368548209971448, + "grad_norm": 1.7649853229522705, + "learning_rate": 5e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.717795729637146, + "num_tokens": 456260622.0, + "step": 17637 + }, + { + "epoch": 1.9369646386997585, + "grad_norm": 1.7870100736618042, + "learning_rate": 5e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7149529457092285, + "num_tokens": 456287654.0, + "step": 17638 + }, + { + "epoch": 1.937074456402372, + "grad_norm": 1.8158191442489624, + "learning_rate": 5e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7465523481369019, + "num_tokens": 456313941.0, + "step": 17639 + }, + { + "epoch": 1.9371842741049856, + "grad_norm": 1.962577223777771, + "learning_rate": 5e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7446733713150024, + "num_tokens": 456334326.0, + "step": 17640 + }, + { + "epoch": 1.9372940918075994, + "grad_norm": 1.5908989906311035, + "learning_rate": 5e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7281606197357178, + "num_tokens": 456364533.0, + "step": 17641 + }, + { + "epoch": 1.9374039095102131, + "grad_norm": 1.7520993947982788, + "learning_rate": 5e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7238249182701111, + "num_tokens": 456390860.0, + "step": 17642 + }, + { + "epoch": 1.9375137272128267, + "grad_norm": 1.704050898551941, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7204960584640503, + "num_tokens": 456418871.0, + "step": 17643 + }, + { + "epoch": 1.9376235449154404, + "grad_norm": 1.5355993509292603, + "learning_rate": 5e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.726387083530426, + "num_tokens": 456451922.0, + "step": 17644 + }, + { + "epoch": 1.937733362618054, + "grad_norm": 1.9137787818908691, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7203866243362427, + "num_tokens": 456475116.0, + "step": 17645 + }, + { + "epoch": 1.9378431803206677, + "grad_norm": 1.659997820854187, + "learning_rate": 5e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7272359132766724, + "num_tokens": 456504078.0, + "step": 17646 + }, + { + "epoch": 1.9379529980232815, + "grad_norm": 1.8681774139404297, + "learning_rate": 5e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.713787317276001, + "num_tokens": 456529793.0, + "step": 17647 + }, + { + "epoch": 1.938062815725895, + "grad_norm": 2.0444509983062744, + "learning_rate": 5e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7243300080299377, + "num_tokens": 456551607.0, + "step": 17648 + }, + { + "epoch": 1.9381726334285085, + "grad_norm": 1.8109655380249023, + "learning_rate": 5e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7345674633979797, + "num_tokens": 456577822.0, + "step": 17649 + }, + { + "epoch": 1.9382824511311223, + "grad_norm": 1.8094980716705322, + "learning_rate": 5e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7075663805007935, + "num_tokens": 456602286.0, + "step": 17650 + }, + { + "epoch": 1.938392268833736, + "grad_norm": 1.6649606227874756, + "learning_rate": 5e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7307862043380737, + "num_tokens": 456630838.0, + "step": 17651 + }, + { + "epoch": 1.9385020865363498, + "grad_norm": 1.793846607208252, + "learning_rate": 5e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.734573483467102, + "num_tokens": 456657822.0, + "step": 17652 + }, + { + "epoch": 1.9386119042389633, + "grad_norm": 1.7458213567733765, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7355514764785767, + "num_tokens": 456684112.0, + "step": 17653 + }, + { + "epoch": 1.9387217219415769, + "grad_norm": 1.7013086080551147, + "learning_rate": 5e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.727580189704895, + "num_tokens": 456711911.0, + "step": 17654 + }, + { + "epoch": 1.9388315396441906, + "grad_norm": 1.8998924493789673, + "learning_rate": 5e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7159035205841064, + "num_tokens": 456738282.0, + "step": 17655 + }, + { + "epoch": 1.9389413573468044, + "grad_norm": 1.9058783054351807, + "learning_rate": 5e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7286797761917114, + "num_tokens": 456762349.0, + "step": 17656 + }, + { + "epoch": 1.939051175049418, + "grad_norm": 1.7120075225830078, + "learning_rate": 5e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7435369491577148, + "num_tokens": 456792009.0, + "step": 17657 + }, + { + "epoch": 1.9391609927520315, + "grad_norm": 1.9432909488677979, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7376731038093567, + "num_tokens": 456812483.0, + "step": 17658 + }, + { + "epoch": 1.9392708104546452, + "grad_norm": 1.904591679573059, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7383531332015991, + "num_tokens": 456836193.0, + "step": 17659 + }, + { + "epoch": 1.939380628157259, + "grad_norm": 1.9603450298309326, + "learning_rate": 5e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7511563301086426, + "num_tokens": 456856010.0, + "step": 17660 + }, + { + "epoch": 1.9394904458598727, + "grad_norm": 1.764951229095459, + "learning_rate": 5e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7259572744369507, + "num_tokens": 456882845.0, + "step": 17661 + }, + { + "epoch": 1.9396002635624863, + "grad_norm": 1.821359395980835, + "learning_rate": 5e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7582185864448547, + "num_tokens": 456906438.0, + "step": 17662 + }, + { + "epoch": 1.9397100812650998, + "grad_norm": 1.6835308074951172, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7328280210494995, + "num_tokens": 456934945.0, + "step": 17663 + }, + { + "epoch": 1.9398198989677136, + "grad_norm": 1.7383102178573608, + "learning_rate": 5e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7284490466117859, + "num_tokens": 456964923.0, + "step": 17664 + }, + { + "epoch": 1.9399297166703273, + "grad_norm": 1.9204882383346558, + "learning_rate": 5e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7342109680175781, + "num_tokens": 456986931.0, + "step": 17665 + }, + { + "epoch": 1.940039534372941, + "grad_norm": 1.8754143714904785, + "learning_rate": 5e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.727706789970398, + "num_tokens": 457012625.0, + "step": 17666 + }, + { + "epoch": 1.9401493520755546, + "grad_norm": 1.8078219890594482, + "learning_rate": 5e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7622120380401611, + "num_tokens": 457038911.0, + "step": 17667 + }, + { + "epoch": 1.9402591697781681, + "grad_norm": 1.8323758840560913, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7363010048866272, + "num_tokens": 457065757.0, + "step": 17668 + }, + { + "epoch": 1.940368987480782, + "grad_norm": 1.9695323705673218, + "learning_rate": 5e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7180413007736206, + "num_tokens": 457089538.0, + "step": 17669 + }, + { + "epoch": 1.9404788051833957, + "grad_norm": 1.7671873569488525, + "learning_rate": 5e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7223927974700928, + "num_tokens": 457118685.0, + "step": 17670 + }, + { + "epoch": 1.9405886228860092, + "grad_norm": 1.9086008071899414, + "learning_rate": 5e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7084898948669434, + "num_tokens": 457143928.0, + "step": 17671 + }, + { + "epoch": 1.9406984405886227, + "grad_norm": 1.5168477296829224, + "learning_rate": 5e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7186880707740784, + "num_tokens": 457177667.0, + "step": 17672 + }, + { + "epoch": 1.9408082582912365, + "grad_norm": 1.7433589696884155, + "learning_rate": 5e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.721538782119751, + "num_tokens": 457205335.0, + "step": 17673 + }, + { + "epoch": 1.9409180759938502, + "grad_norm": 1.7778499126434326, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7511914968490601, + "num_tokens": 457230598.0, + "step": 17674 + }, + { + "epoch": 1.941027893696464, + "grad_norm": 2.0972959995269775, + "learning_rate": 5e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7423496246337891, + "num_tokens": 457250412.0, + "step": 17675 + }, + { + "epoch": 1.9411377113990775, + "grad_norm": 1.779673457145691, + "learning_rate": 5e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.732313334941864, + "num_tokens": 457276831.0, + "step": 17676 + }, + { + "epoch": 1.941247529101691, + "grad_norm": 1.802371859550476, + "learning_rate": 5e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.729210615158081, + "num_tokens": 457302293.0, + "step": 17677 + }, + { + "epoch": 1.9413573468043048, + "grad_norm": 1.8505069017410278, + "learning_rate": 5e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7227510213851929, + "num_tokens": 457329361.0, + "step": 17678 + }, + { + "epoch": 1.9414671645069186, + "grad_norm": 1.910211205482483, + "learning_rate": 5e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7195676565170288, + "num_tokens": 457352835.0, + "step": 17679 + }, + { + "epoch": 1.9415769822095323, + "grad_norm": 1.7226567268371582, + "learning_rate": 5e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.7428326606750488, + "num_tokens": 457382898.0, + "step": 17680 + }, + { + "epoch": 1.9416867999121459, + "grad_norm": 1.7795840501785278, + "learning_rate": 5e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7337318658828735, + "num_tokens": 457407462.0, + "step": 17681 + }, + { + "epoch": 1.9417966176147594, + "grad_norm": 1.7157857418060303, + "learning_rate": 5e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7289992570877075, + "num_tokens": 457434170.0, + "step": 17682 + }, + { + "epoch": 1.9419064353173732, + "grad_norm": 1.549450159072876, + "learning_rate": 5e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7205915451049805, + "num_tokens": 457464915.0, + "step": 17683 + }, + { + "epoch": 1.942016253019987, + "grad_norm": 1.8533258438110352, + "learning_rate": 5e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7402113676071167, + "num_tokens": 457489565.0, + "step": 17684 + }, + { + "epoch": 1.9421260707226005, + "grad_norm": 1.9622437953948975, + "learning_rate": 5e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7230556607246399, + "num_tokens": 457510658.0, + "step": 17685 + }, + { + "epoch": 1.942235888425214, + "grad_norm": 1.736830711364746, + "learning_rate": 5e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.717897891998291, + "num_tokens": 457539500.0, + "step": 17686 + }, + { + "epoch": 1.9423457061278278, + "grad_norm": 1.8008742332458496, + "learning_rate": 5e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7182990312576294, + "num_tokens": 457565807.0, + "step": 17687 + }, + { + "epoch": 1.9424555238304415, + "grad_norm": 1.639905333518982, + "learning_rate": 5e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7278246879577637, + "num_tokens": 457593689.0, + "step": 17688 + }, + { + "epoch": 1.9425653415330553, + "grad_norm": 1.579420804977417, + "learning_rate": 5e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7200571894645691, + "num_tokens": 457626830.0, + "step": 17689 + }, + { + "epoch": 1.9426751592356688, + "grad_norm": 1.6578768491744995, + "learning_rate": 5e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7108516693115234, + "num_tokens": 457658222.0, + "step": 17690 + }, + { + "epoch": 1.9427849769382823, + "grad_norm": 1.9992612600326538, + "learning_rate": 5e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7446131706237793, + "num_tokens": 457678581.0, + "step": 17691 + }, + { + "epoch": 1.942894794640896, + "grad_norm": 1.6188583374023438, + "learning_rate": 5e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7282688617706299, + "num_tokens": 457708174.0, + "step": 17692 + }, + { + "epoch": 1.9430046123435099, + "grad_norm": 1.6840860843658447, + "learning_rate": 5e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7127986550331116, + "num_tokens": 457736317.0, + "step": 17693 + }, + { + "epoch": 1.9431144300461234, + "grad_norm": 1.5702444314956665, + "learning_rate": 5e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7442253828048706, + "num_tokens": 457766151.0, + "step": 17694 + }, + { + "epoch": 1.9432242477487371, + "grad_norm": 1.5957248210906982, + "learning_rate": 5e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.721346378326416, + "num_tokens": 457795126.0, + "step": 17695 + }, + { + "epoch": 1.9433340654513507, + "grad_norm": 1.5986049175262451, + "learning_rate": 5e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7339333891868591, + "num_tokens": 457825953.0, + "step": 17696 + }, + { + "epoch": 1.9434438831539644, + "grad_norm": 1.9140102863311768, + "learning_rate": 5e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7427140474319458, + "num_tokens": 457850043.0, + "step": 17697 + }, + { + "epoch": 1.9435537008565782, + "grad_norm": 1.9401158094406128, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7515672445297241, + "num_tokens": 457871963.0, + "step": 17698 + }, + { + "epoch": 1.9436635185591917, + "grad_norm": 1.8764389753341675, + "learning_rate": 5e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.741935670375824, + "num_tokens": 457894389.0, + "step": 17699 + }, + { + "epoch": 1.9437733362618053, + "grad_norm": 1.8437968492507935, + "learning_rate": 5e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7242312431335449, + "num_tokens": 457919490.0, + "step": 17700 + }, + { + "epoch": 1.943883153964419, + "grad_norm": 1.7193509340286255, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7508896589279175, + "num_tokens": 457946800.0, + "step": 17701 + }, + { + "epoch": 1.9439929716670328, + "grad_norm": 1.678694486618042, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7475182414054871, + "num_tokens": 457975193.0, + "step": 17702 + }, + { + "epoch": 1.9441027893696465, + "grad_norm": 1.859932541847229, + "learning_rate": 5e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7244527339935303, + "num_tokens": 457999285.0, + "step": 17703 + }, + { + "epoch": 1.94421260707226, + "grad_norm": 1.6661064624786377, + "learning_rate": 5e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7319342494010925, + "num_tokens": 458030535.0, + "step": 17704 + }, + { + "epoch": 1.9443224247748736, + "grad_norm": 1.900983452796936, + "learning_rate": 5e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7392014265060425, + "num_tokens": 458052684.0, + "step": 17705 + }, + { + "epoch": 1.9444322424774874, + "grad_norm": 1.6580480337142944, + "learning_rate": 5e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7036352157592773, + "num_tokens": 458083026.0, + "step": 17706 + }, + { + "epoch": 1.9445420601801011, + "grad_norm": 1.7518099546432495, + "learning_rate": 5e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7400047779083252, + "num_tokens": 458109386.0, + "step": 17707 + }, + { + "epoch": 1.9446518778827147, + "grad_norm": 1.8043254613876343, + "learning_rate": 5e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7111799716949463, + "num_tokens": 458137984.0, + "step": 17708 + }, + { + "epoch": 1.9447616955853284, + "grad_norm": 1.7519234418869019, + "learning_rate": 5e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7212854623794556, + "num_tokens": 458168510.0, + "step": 17709 + }, + { + "epoch": 1.944871513287942, + "grad_norm": 1.6999815702438354, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7171169519424438, + "num_tokens": 458197420.0, + "step": 17710 + }, + { + "epoch": 1.9449813309905557, + "grad_norm": 1.941283941268921, + "learning_rate": 5e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7632390260696411, + "num_tokens": 458216604.0, + "step": 17711 + }, + { + "epoch": 1.9450911486931695, + "grad_norm": 1.6225641965866089, + "learning_rate": 5e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.72867351770401, + "num_tokens": 458246690.0, + "step": 17712 + }, + { + "epoch": 1.945200966395783, + "grad_norm": 1.708722710609436, + "learning_rate": 5e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7218787670135498, + "num_tokens": 458274907.0, + "step": 17713 + }, + { + "epoch": 1.9453107840983965, + "grad_norm": 1.766063928604126, + "learning_rate": 5e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7045167684555054, + "num_tokens": 458304831.0, + "step": 17714 + }, + { + "epoch": 1.9454206018010103, + "grad_norm": 1.6799321174621582, + "learning_rate": 5e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7189679145812988, + "num_tokens": 458335773.0, + "step": 17715 + }, + { + "epoch": 1.945530419503624, + "grad_norm": 1.920580267906189, + "learning_rate": 5e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7328243255615234, + "num_tokens": 458357186.0, + "step": 17716 + }, + { + "epoch": 1.9456402372062378, + "grad_norm": 1.5858665704727173, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7233967781066895, + "num_tokens": 458388425.0, + "step": 17717 + }, + { + "epoch": 1.9457500549088513, + "grad_norm": 1.8141101598739624, + "learning_rate": 5e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7570847272872925, + "num_tokens": 458413201.0, + "step": 17718 + }, + { + "epoch": 1.9458598726114649, + "grad_norm": 1.665570616722107, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7526092529296875, + "num_tokens": 458440009.0, + "step": 17719 + }, + { + "epoch": 1.9459696903140786, + "grad_norm": 1.7928022146224976, + "learning_rate": 5e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7452719807624817, + "num_tokens": 458465685.0, + "step": 17720 + }, + { + "epoch": 1.9460795080166924, + "grad_norm": 2.2848968505859375, + "learning_rate": 5e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7359617948532104, + "num_tokens": 458483204.0, + "step": 17721 + }, + { + "epoch": 1.946189325719306, + "grad_norm": 2.0591208934783936, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7344675064086914, + "num_tokens": 458503875.0, + "step": 17722 + }, + { + "epoch": 1.9462991434219195, + "grad_norm": 1.742123007774353, + "learning_rate": 5e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7192267775535583, + "num_tokens": 458530308.0, + "step": 17723 + }, + { + "epoch": 1.9464089611245332, + "grad_norm": 1.882705807685852, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.746893584728241, + "num_tokens": 458550357.0, + "step": 17724 + }, + { + "epoch": 1.946518778827147, + "grad_norm": 1.9260653257369995, + "learning_rate": 5e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7205171585083008, + "num_tokens": 458575548.0, + "step": 17725 + }, + { + "epoch": 1.9466285965297607, + "grad_norm": 1.6800369024276733, + "learning_rate": 5e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7383955717086792, + "num_tokens": 458606326.0, + "step": 17726 + }, + { + "epoch": 1.9467384142323743, + "grad_norm": 1.7979084253311157, + "learning_rate": 5e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7587885856628418, + "num_tokens": 458630483.0, + "step": 17727 + }, + { + "epoch": 1.9468482319349878, + "grad_norm": 1.6727566719055176, + "learning_rate": 5e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7470027208328247, + "num_tokens": 458657995.0, + "step": 17728 + }, + { + "epoch": 1.9469580496376016, + "grad_norm": 1.7915512323379517, + "learning_rate": 5e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7569554448127747, + "num_tokens": 458682558.0, + "step": 17729 + }, + { + "epoch": 1.9470678673402153, + "grad_norm": 1.8808205127716064, + "learning_rate": 5e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7485183477401733, + "num_tokens": 458707316.0, + "step": 17730 + }, + { + "epoch": 1.947177685042829, + "grad_norm": 1.9923646450042725, + "learning_rate": 5e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.740788459777832, + "num_tokens": 458729102.0, + "step": 17731 + }, + { + "epoch": 1.9472875027454426, + "grad_norm": 1.901458978652954, + "learning_rate": 5e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7538560628890991, + "num_tokens": 458750958.0, + "step": 17732 + }, + { + "epoch": 1.9473973204480561, + "grad_norm": 2.0495212078094482, + "learning_rate": 5e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.76552414894104, + "num_tokens": 458770823.0, + "step": 17733 + }, + { + "epoch": 1.94750713815067, + "grad_norm": 2.094618558883667, + "learning_rate": 5e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7271684408187866, + "num_tokens": 458794612.0, + "step": 17734 + }, + { + "epoch": 1.9476169558532836, + "grad_norm": 1.7622966766357422, + "learning_rate": 5e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.747812032699585, + "num_tokens": 458820654.0, + "step": 17735 + }, + { + "epoch": 1.9477267735558972, + "grad_norm": 1.670383095741272, + "learning_rate": 5e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.7435365319252014, + "num_tokens": 458848875.0, + "step": 17736 + }, + { + "epoch": 1.9478365912585107, + "grad_norm": 1.6769177913665771, + "learning_rate": 5e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7253851890563965, + "num_tokens": 458876696.0, + "step": 17737 + }, + { + "epoch": 1.9479464089611245, + "grad_norm": 1.8093527555465698, + "learning_rate": 5e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7475296258926392, + "num_tokens": 458901111.0, + "step": 17738 + }, + { + "epoch": 1.9480562266637382, + "grad_norm": 1.7076468467712402, + "learning_rate": 5e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.716839075088501, + "num_tokens": 458929092.0, + "step": 17739 + }, + { + "epoch": 1.948166044366352, + "grad_norm": 1.7129201889038086, + "learning_rate": 5e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7273634672164917, + "num_tokens": 458957872.0, + "step": 17740 + }, + { + "epoch": 1.9482758620689655, + "grad_norm": 2.0280516147613525, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7414015531539917, + "num_tokens": 458978594.0, + "step": 17741 + }, + { + "epoch": 1.948385679771579, + "grad_norm": 1.9007333517074585, + "learning_rate": 5e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7457521557807922, + "num_tokens": 459000524.0, + "step": 17742 + }, + { + "epoch": 1.9484954974741928, + "grad_norm": 1.7709821462631226, + "learning_rate": 5e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7285935282707214, + "num_tokens": 459027083.0, + "step": 17743 + }, + { + "epoch": 1.9486053151768066, + "grad_norm": 1.7267425060272217, + "learning_rate": 5e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7357802391052246, + "num_tokens": 459055694.0, + "step": 17744 + }, + { + "epoch": 1.94871513287942, + "grad_norm": 1.9298686981201172, + "learning_rate": 5e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7383775115013123, + "num_tokens": 459079635.0, + "step": 17745 + }, + { + "epoch": 1.9488249505820339, + "grad_norm": 2.0010826587677, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7457329034805298, + "num_tokens": 459100671.0, + "step": 17746 + }, + { + "epoch": 1.9489347682846474, + "grad_norm": 1.7221580743789673, + "learning_rate": 5e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7422874569892883, + "num_tokens": 459128019.0, + "step": 17747 + }, + { + "epoch": 1.9490445859872612, + "grad_norm": 1.8593732118606567, + "learning_rate": 5e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.709210991859436, + "num_tokens": 459155755.0, + "step": 17748 + }, + { + "epoch": 1.949154403689875, + "grad_norm": 1.9441173076629639, + "learning_rate": 5e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7197742462158203, + "num_tokens": 459178804.0, + "step": 17749 + }, + { + "epoch": 1.9492642213924884, + "grad_norm": 1.6160392761230469, + "learning_rate": 5e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7275460958480835, + "num_tokens": 459209270.0, + "step": 17750 + }, + { + "epoch": 1.949374039095102, + "grad_norm": 1.7631993293762207, + "learning_rate": 5e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.729685366153717, + "num_tokens": 459235733.0, + "step": 17751 + }, + { + "epoch": 1.9494838567977157, + "grad_norm": 1.8636648654937744, + "learning_rate": 5e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7200049757957458, + "num_tokens": 459261205.0, + "step": 17752 + }, + { + "epoch": 1.9495936745003295, + "grad_norm": 1.8790744543075562, + "learning_rate": 5e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7196587324142456, + "num_tokens": 459286668.0, + "step": 17753 + }, + { + "epoch": 1.9497034922029433, + "grad_norm": 1.6930670738220215, + "learning_rate": 5e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7172070741653442, + "num_tokens": 459316644.0, + "step": 17754 + }, + { + "epoch": 1.9498133099055568, + "grad_norm": 1.8962137699127197, + "learning_rate": 5e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7186383008956909, + "num_tokens": 459341778.0, + "step": 17755 + }, + { + "epoch": 1.9499231276081703, + "grad_norm": 1.7231621742248535, + "learning_rate": 5e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7361205816268921, + "num_tokens": 459366375.0, + "step": 17756 + }, + { + "epoch": 1.950032945310784, + "grad_norm": 1.960585355758667, + "learning_rate": 5e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7251084446907043, + "num_tokens": 459388374.0, + "step": 17757 + }, + { + "epoch": 1.9501427630133978, + "grad_norm": 1.7113577127456665, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7292918562889099, + "num_tokens": 459415564.0, + "step": 17758 + }, + { + "epoch": 1.9502525807160114, + "grad_norm": 1.752156376838684, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7362749576568604, + "num_tokens": 459442002.0, + "step": 17759 + }, + { + "epoch": 1.9503623984186251, + "grad_norm": 1.5736825466156006, + "learning_rate": 5e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7259107828140259, + "num_tokens": 459472300.0, + "step": 17760 + }, + { + "epoch": 1.9504722161212387, + "grad_norm": 1.8992611169815063, + "learning_rate": 5e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7409638166427612, + "num_tokens": 459496929.0, + "step": 17761 + }, + { + "epoch": 1.9505820338238524, + "grad_norm": 1.7954155206680298, + "learning_rate": 5e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7417089939117432, + "num_tokens": 459522119.0, + "step": 17762 + }, + { + "epoch": 1.9506918515264662, + "grad_norm": 1.6748948097229004, + "learning_rate": 5e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7103934288024902, + "num_tokens": 459552207.0, + "step": 17763 + }, + { + "epoch": 1.9508016692290797, + "grad_norm": 1.660521388053894, + "learning_rate": 5e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7417136430740356, + "num_tokens": 459579353.0, + "step": 17764 + }, + { + "epoch": 1.9509114869316933, + "grad_norm": 1.8714816570281982, + "learning_rate": 5e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7293790578842163, + "num_tokens": 459603385.0, + "step": 17765 + }, + { + "epoch": 1.951021304634307, + "grad_norm": 1.8512061834335327, + "learning_rate": 5e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7447165250778198, + "num_tokens": 459627687.0, + "step": 17766 + }, + { + "epoch": 1.9511311223369208, + "grad_norm": 1.7510496377944946, + "learning_rate": 5e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7154905200004578, + "num_tokens": 459656899.0, + "step": 17767 + }, + { + "epoch": 1.9512409400395345, + "grad_norm": 1.690168023109436, + "learning_rate": 5e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7453400492668152, + "num_tokens": 459686265.0, + "step": 17768 + }, + { + "epoch": 1.951350757742148, + "grad_norm": 1.6695702075958252, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7562010288238525, + "num_tokens": 459713833.0, + "step": 17769 + }, + { + "epoch": 1.9514605754447616, + "grad_norm": 1.5615155696868896, + "learning_rate": 5e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7329739332199097, + "num_tokens": 459747438.0, + "step": 17770 + }, + { + "epoch": 1.9515703931473753, + "grad_norm": 1.7154793739318848, + "learning_rate": 5e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7344229221343994, + "num_tokens": 459774600.0, + "step": 17771 + }, + { + "epoch": 1.951680210849989, + "grad_norm": 1.9213141202926636, + "learning_rate": 5e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7331385612487793, + "num_tokens": 459798343.0, + "step": 17772 + }, + { + "epoch": 1.9517900285526026, + "grad_norm": 1.6127350330352783, + "learning_rate": 5e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7219713926315308, + "num_tokens": 459831488.0, + "step": 17773 + }, + { + "epoch": 1.9518998462552162, + "grad_norm": 2.0575103759765625, + "learning_rate": 5e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7424086332321167, + "num_tokens": 459851364.0, + "step": 17774 + }, + { + "epoch": 1.95200966395783, + "grad_norm": 1.720365047454834, + "learning_rate": 5e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7318520545959473, + "num_tokens": 459877752.0, + "step": 17775 + }, + { + "epoch": 1.9521194816604437, + "grad_norm": 1.8158048391342163, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7234702110290527, + "num_tokens": 459902051.0, + "step": 17776 + }, + { + "epoch": 1.9522292993630574, + "grad_norm": 2.1006951332092285, + "learning_rate": 5e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7457962036132812, + "num_tokens": 459921272.0, + "step": 17777 + }, + { + "epoch": 1.952339117065671, + "grad_norm": 1.5229103565216064, + "learning_rate": 5e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7190713882446289, + "num_tokens": 459954441.0, + "step": 17778 + }, + { + "epoch": 1.9524489347682845, + "grad_norm": 1.932137370109558, + "learning_rate": 5e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7269512414932251, + "num_tokens": 459977682.0, + "step": 17779 + }, + { + "epoch": 1.9525587524708983, + "grad_norm": 1.839302897453308, + "learning_rate": 5e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7262124419212341, + "num_tokens": 460003758.0, + "step": 17780 + }, + { + "epoch": 1.952668570173512, + "grad_norm": 1.699752688407898, + "learning_rate": 5e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7288110256195068, + "num_tokens": 460031028.0, + "step": 17781 + }, + { + "epoch": 1.9527783878761258, + "grad_norm": 1.8408739566802979, + "learning_rate": 5e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7329285144805908, + "num_tokens": 460054657.0, + "step": 17782 + }, + { + "epoch": 1.9528882055787393, + "grad_norm": 1.6761999130249023, + "learning_rate": 5e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7457205057144165, + "num_tokens": 460080117.0, + "step": 17783 + }, + { + "epoch": 1.9529980232813529, + "grad_norm": 1.7802715301513672, + "learning_rate": 5e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7137173414230347, + "num_tokens": 460108829.0, + "step": 17784 + }, + { + "epoch": 1.9531078409839666, + "grad_norm": 1.7500516176223755, + "learning_rate": 5e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7416595220565796, + "num_tokens": 460133536.0, + "step": 17785 + }, + { + "epoch": 1.9532176586865804, + "grad_norm": 1.7577358484268188, + "learning_rate": 5e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.732860267162323, + "num_tokens": 460161505.0, + "step": 17786 + }, + { + "epoch": 1.953327476389194, + "grad_norm": 1.8686983585357666, + "learning_rate": 5e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.761665940284729, + "num_tokens": 460183629.0, + "step": 17787 + }, + { + "epoch": 1.9534372940918074, + "grad_norm": 1.8014440536499023, + "learning_rate": 5e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.723645806312561, + "num_tokens": 460209060.0, + "step": 17788 + }, + { + "epoch": 1.9535471117944212, + "grad_norm": 1.637545108795166, + "learning_rate": 5e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7176936864852905, + "num_tokens": 460239846.0, + "step": 17789 + }, + { + "epoch": 1.953656929497035, + "grad_norm": 1.9416913986206055, + "learning_rate": 5e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7464845180511475, + "num_tokens": 460262282.0, + "step": 17790 + }, + { + "epoch": 1.9537667471996487, + "grad_norm": 1.7313424348831177, + "learning_rate": 5e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7508127689361572, + "num_tokens": 460288530.0, + "step": 17791 + }, + { + "epoch": 1.9538765649022622, + "grad_norm": 1.7145849466323853, + "learning_rate": 5e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7325525283813477, + "num_tokens": 460317319.0, + "step": 17792 + }, + { + "epoch": 1.9539863826048758, + "grad_norm": 1.8339720964431763, + "learning_rate": 5e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7361338138580322, + "num_tokens": 460341584.0, + "step": 17793 + }, + { + "epoch": 1.9540962003074895, + "grad_norm": 1.5924949645996094, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7212404012680054, + "num_tokens": 460375235.0, + "step": 17794 + }, + { + "epoch": 1.9542060180101033, + "grad_norm": 1.9162312746047974, + "learning_rate": 5e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7231009602546692, + "num_tokens": 460402039.0, + "step": 17795 + }, + { + "epoch": 1.954315835712717, + "grad_norm": 1.6873234510421753, + "learning_rate": 5e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7233167290687561, + "num_tokens": 460431290.0, + "step": 17796 + }, + { + "epoch": 1.9544256534153306, + "grad_norm": 1.7507882118225098, + "learning_rate": 5e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7471069693565369, + "num_tokens": 460456531.0, + "step": 17797 + }, + { + "epoch": 1.9545354711179441, + "grad_norm": 1.9336209297180176, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7612415552139282, + "num_tokens": 460478616.0, + "step": 17798 + }, + { + "epoch": 1.9546452888205579, + "grad_norm": 2.1223654747009277, + "learning_rate": 5e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7176386713981628, + "num_tokens": 460499065.0, + "step": 17799 + }, + { + "epoch": 1.9547551065231716, + "grad_norm": 1.6299866437911987, + "learning_rate": 5e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7485334873199463, + "num_tokens": 460528675.0, + "step": 17800 + }, + { + "epoch": 1.9548649242257852, + "grad_norm": 1.7969470024108887, + "learning_rate": 5e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.745610237121582, + "num_tokens": 460553535.0, + "step": 17801 + }, + { + "epoch": 1.9549747419283987, + "grad_norm": 1.50965416431427, + "learning_rate": 5e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7437303066253662, + "num_tokens": 460586176.0, + "step": 17802 + }, + { + "epoch": 1.9550845596310125, + "grad_norm": 1.6415752172470093, + "learning_rate": 5e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7310410141944885, + "num_tokens": 460612945.0, + "step": 17803 + }, + { + "epoch": 1.9551943773336262, + "grad_norm": 1.9802260398864746, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7309942245483398, + "num_tokens": 460634043.0, + "step": 17804 + }, + { + "epoch": 1.95530419503624, + "grad_norm": 1.5859146118164062, + "learning_rate": 5e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7285373210906982, + "num_tokens": 460663160.0, + "step": 17805 + }, + { + "epoch": 1.9554140127388535, + "grad_norm": 1.7595548629760742, + "learning_rate": 5e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.734213650226593, + "num_tokens": 460687152.0, + "step": 17806 + }, + { + "epoch": 1.955523830441467, + "grad_norm": 1.9073013067245483, + "learning_rate": 5e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7282769083976746, + "num_tokens": 460711128.0, + "step": 17807 + }, + { + "epoch": 1.9556336481440808, + "grad_norm": 1.9115757942199707, + "learning_rate": 5e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7388268709182739, + "num_tokens": 460732712.0, + "step": 17808 + }, + { + "epoch": 1.9557434658466946, + "grad_norm": 1.6796196699142456, + "learning_rate": 5e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7341258525848389, + "num_tokens": 460759389.0, + "step": 17809 + }, + { + "epoch": 1.955853283549308, + "grad_norm": 1.5801297426223755, + "learning_rate": 5e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7494431734085083, + "num_tokens": 460788404.0, + "step": 17810 + }, + { + "epoch": 1.9559631012519219, + "grad_norm": 1.7281700372695923, + "learning_rate": 5e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7244497537612915, + "num_tokens": 460817100.0, + "step": 17811 + }, + { + "epoch": 1.9560729189545354, + "grad_norm": 1.8117719888687134, + "learning_rate": 5e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.71641606092453, + "num_tokens": 460844687.0, + "step": 17812 + }, + { + "epoch": 1.9561827366571491, + "grad_norm": 1.7598917484283447, + "learning_rate": 5e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7203541994094849, + "num_tokens": 460872358.0, + "step": 17813 + }, + { + "epoch": 1.956292554359763, + "grad_norm": 1.8833177089691162, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7286567687988281, + "num_tokens": 460895634.0, + "step": 17814 + }, + { + "epoch": 1.9564023720623764, + "grad_norm": 1.8188135623931885, + "learning_rate": 5e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7269946336746216, + "num_tokens": 460921268.0, + "step": 17815 + }, + { + "epoch": 1.95651218976499, + "grad_norm": 1.778507113456726, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7178677916526794, + "num_tokens": 460950280.0, + "step": 17816 + }, + { + "epoch": 1.9566220074676037, + "grad_norm": 1.928638219833374, + "learning_rate": 5e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7321949005126953, + "num_tokens": 460973472.0, + "step": 17817 + }, + { + "epoch": 1.9567318251702175, + "grad_norm": 1.8544502258300781, + "learning_rate": 5e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7098517417907715, + "num_tokens": 460998013.0, + "step": 17818 + }, + { + "epoch": 1.9568416428728312, + "grad_norm": 1.6619906425476074, + "learning_rate": 5e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7143752574920654, + "num_tokens": 461027250.0, + "step": 17819 + }, + { + "epoch": 1.9569514605754448, + "grad_norm": 2.037222385406494, + "learning_rate": 5e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7264417409896851, + "num_tokens": 461048645.0, + "step": 17820 + }, + { + "epoch": 1.9570612782780583, + "grad_norm": 1.7932215929031372, + "learning_rate": 5e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7023518085479736, + "num_tokens": 461075934.0, + "step": 17821 + }, + { + "epoch": 1.957171095980672, + "grad_norm": 1.7661789655685425, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7395316958427429, + "num_tokens": 461102027.0, + "step": 17822 + }, + { + "epoch": 1.9572809136832858, + "grad_norm": 1.765676498413086, + "learning_rate": 5e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7491596341133118, + "num_tokens": 461127907.0, + "step": 17823 + }, + { + "epoch": 1.9573907313858994, + "grad_norm": 1.797275424003601, + "learning_rate": 5e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7045412063598633, + "num_tokens": 461155170.0, + "step": 17824 + }, + { + "epoch": 1.9575005490885131, + "grad_norm": 1.7764780521392822, + "learning_rate": 5e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7079277038574219, + "num_tokens": 461183289.0, + "step": 17825 + }, + { + "epoch": 1.9576103667911267, + "grad_norm": 1.5147067308425903, + "learning_rate": 5e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7402796149253845, + "num_tokens": 461216629.0, + "step": 17826 + }, + { + "epoch": 1.9577201844937404, + "grad_norm": 1.8436977863311768, + "learning_rate": 5e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7391838431358337, + "num_tokens": 461241211.0, + "step": 17827 + }, + { + "epoch": 1.9578300021963542, + "grad_norm": 1.498555302619934, + "learning_rate": 5e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7138168215751648, + "num_tokens": 461274588.0, + "step": 17828 + }, + { + "epoch": 1.9579398198989677, + "grad_norm": 1.7367103099822998, + "learning_rate": 5e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7291320562362671, + "num_tokens": 461300403.0, + "step": 17829 + }, + { + "epoch": 1.9580496376015812, + "grad_norm": 1.7520047426223755, + "learning_rate": 5e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7223440408706665, + "num_tokens": 461327429.0, + "step": 17830 + }, + { + "epoch": 1.958159455304195, + "grad_norm": 1.6391894817352295, + "learning_rate": 5e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7137366533279419, + "num_tokens": 461358397.0, + "step": 17831 + }, + { + "epoch": 1.9582692730068088, + "grad_norm": 1.6106644868850708, + "learning_rate": 5e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7299391031265259, + "num_tokens": 461386931.0, + "step": 17832 + }, + { + "epoch": 1.9583790907094225, + "grad_norm": 2.021778106689453, + "learning_rate": 5e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7473322153091431, + "num_tokens": 461407517.0, + "step": 17833 + }, + { + "epoch": 1.958488908412036, + "grad_norm": 1.973831295967102, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7645436525344849, + "num_tokens": 461426953.0, + "step": 17834 + }, + { + "epoch": 1.9585987261146496, + "grad_norm": 1.7126716375350952, + "learning_rate": 5e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7178446054458618, + "num_tokens": 461455620.0, + "step": 17835 + }, + { + "epoch": 1.9587085438172633, + "grad_norm": 1.7871170043945312, + "learning_rate": 5e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7325317859649658, + "num_tokens": 461478591.0, + "step": 17836 + }, + { + "epoch": 1.958818361519877, + "grad_norm": 1.6906872987747192, + "learning_rate": 5e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7322167158126831, + "num_tokens": 461507638.0, + "step": 17837 + }, + { + "epoch": 1.9589281792224906, + "grad_norm": 1.879494547843933, + "learning_rate": 5e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7537623643875122, + "num_tokens": 461529845.0, + "step": 17838 + }, + { + "epoch": 1.9590379969251042, + "grad_norm": 1.9105713367462158, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7356327772140503, + "num_tokens": 461552368.0, + "step": 17839 + }, + { + "epoch": 1.959147814627718, + "grad_norm": 1.8347643613815308, + "learning_rate": 5e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7398555874824524, + "num_tokens": 461575315.0, + "step": 17840 + }, + { + "epoch": 1.9592576323303317, + "grad_norm": 1.6565985679626465, + "learning_rate": 5e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7196714878082275, + "num_tokens": 461606974.0, + "step": 17841 + }, + { + "epoch": 1.9593674500329454, + "grad_norm": 1.941915512084961, + "learning_rate": 5e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7479919791221619, + "num_tokens": 461629095.0, + "step": 17842 + }, + { + "epoch": 1.959477267735559, + "grad_norm": 1.785884976387024, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7291539311408997, + "num_tokens": 461656455.0, + "step": 17843 + }, + { + "epoch": 1.9595870854381725, + "grad_norm": 1.744615077972412, + "learning_rate": 5e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.6985877156257629, + "num_tokens": 461684149.0, + "step": 17844 + }, + { + "epoch": 1.9596969031407863, + "grad_norm": 1.8319514989852905, + "learning_rate": 5e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7510210275650024, + "num_tokens": 461706687.0, + "step": 17845 + }, + { + "epoch": 1.9598067208434, + "grad_norm": 1.7471222877502441, + "learning_rate": 5e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7404279708862305, + "num_tokens": 461732861.0, + "step": 17846 + }, + { + "epoch": 1.9599165385460138, + "grad_norm": 1.8256386518478394, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7388172745704651, + "num_tokens": 461757921.0, + "step": 17847 + }, + { + "epoch": 1.9600263562486273, + "grad_norm": 1.7928189039230347, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7316832542419434, + "num_tokens": 461782716.0, + "step": 17848 + }, + { + "epoch": 1.9601361739512408, + "grad_norm": 1.7301826477050781, + "learning_rate": 5e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7327465415000916, + "num_tokens": 461809612.0, + "step": 17849 + }, + { + "epoch": 1.9602459916538546, + "grad_norm": 1.9466068744659424, + "learning_rate": 5e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7450372576713562, + "num_tokens": 461830476.0, + "step": 17850 + }, + { + "epoch": 1.9603558093564684, + "grad_norm": 1.9070831537246704, + "learning_rate": 5e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7216686606407166, + "num_tokens": 461854940.0, + "step": 17851 + }, + { + "epoch": 1.960465627059082, + "grad_norm": 1.8447123765945435, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7320663928985596, + "num_tokens": 461877319.0, + "step": 17852 + }, + { + "epoch": 1.9605754447616954, + "grad_norm": 1.714259147644043, + "learning_rate": 5e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7334769368171692, + "num_tokens": 461905276.0, + "step": 17853 + }, + { + "epoch": 1.9606852624643092, + "grad_norm": 1.603852391242981, + "learning_rate": 5e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7335329651832581, + "num_tokens": 461935618.0, + "step": 17854 + }, + { + "epoch": 1.960795080166923, + "grad_norm": 1.6338520050048828, + "learning_rate": 5e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7263765335083008, + "num_tokens": 461966293.0, + "step": 17855 + }, + { + "epoch": 1.9609048978695367, + "grad_norm": 1.7390902042388916, + "learning_rate": 5e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7207534313201904, + "num_tokens": 461995021.0, + "step": 17856 + }, + { + "epoch": 1.9610147155721502, + "grad_norm": 1.8876621723175049, + "learning_rate": 5e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7545607089996338, + "num_tokens": 462016510.0, + "step": 17857 + }, + { + "epoch": 1.9611245332747638, + "grad_norm": 1.9894438982009888, + "learning_rate": 5e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7305312156677246, + "num_tokens": 462039692.0, + "step": 17858 + }, + { + "epoch": 1.9612343509773775, + "grad_norm": 1.7103636264801025, + "learning_rate": 5e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.734593391418457, + "num_tokens": 462065648.0, + "step": 17859 + }, + { + "epoch": 1.9613441686799913, + "grad_norm": 1.767217993736267, + "learning_rate": 5e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7391384840011597, + "num_tokens": 462090503.0, + "step": 17860 + }, + { + "epoch": 1.961453986382605, + "grad_norm": 1.829524278640747, + "learning_rate": 5e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7259811758995056, + "num_tokens": 462113932.0, + "step": 17861 + }, + { + "epoch": 1.9615638040852186, + "grad_norm": 1.920941710472107, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7535296082496643, + "num_tokens": 462136498.0, + "step": 17862 + }, + { + "epoch": 1.9616736217878321, + "grad_norm": 1.7513961791992188, + "learning_rate": 5e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7249319553375244, + "num_tokens": 462165163.0, + "step": 17863 + }, + { + "epoch": 1.9617834394904459, + "grad_norm": 1.8673434257507324, + "learning_rate": 5e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7636088132858276, + "num_tokens": 462187567.0, + "step": 17864 + }, + { + "epoch": 1.9618932571930596, + "grad_norm": 1.929266095161438, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7558441162109375, + "num_tokens": 462210855.0, + "step": 17865 + }, + { + "epoch": 1.9620030748956732, + "grad_norm": 1.5490375757217407, + "learning_rate": 5e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7072052955627441, + "num_tokens": 462248346.0, + "step": 17866 + }, + { + "epoch": 1.9621128925982867, + "grad_norm": 1.8321630954742432, + "learning_rate": 5e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7266483306884766, + "num_tokens": 462274972.0, + "step": 17867 + }, + { + "epoch": 1.9622227103009005, + "grad_norm": 1.734989047050476, + "learning_rate": 5e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7527920007705688, + "num_tokens": 462299827.0, + "step": 17868 + }, + { + "epoch": 1.9623325280035142, + "grad_norm": 1.6846965551376343, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7462658882141113, + "num_tokens": 462327314.0, + "step": 17869 + }, + { + "epoch": 1.962442345706128, + "grad_norm": 1.84532630443573, + "learning_rate": 5e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.737558126449585, + "num_tokens": 462350754.0, + "step": 17870 + }, + { + "epoch": 1.9625521634087415, + "grad_norm": 1.8067835569381714, + "learning_rate": 5e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7542070150375366, + "num_tokens": 462377295.0, + "step": 17871 + }, + { + "epoch": 1.962661981111355, + "grad_norm": 1.6955318450927734, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7561413049697876, + "num_tokens": 462401594.0, + "step": 17872 + }, + { + "epoch": 1.9627717988139688, + "grad_norm": 1.8095914125442505, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7311751842498779, + "num_tokens": 462426806.0, + "step": 17873 + }, + { + "epoch": 1.9628816165165826, + "grad_norm": 1.9448922872543335, + "learning_rate": 5e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7195698022842407, + "num_tokens": 462450116.0, + "step": 17874 + }, + { + "epoch": 1.962991434219196, + "grad_norm": 1.6985628604888916, + "learning_rate": 5e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7278517484664917, + "num_tokens": 462479904.0, + "step": 17875 + }, + { + "epoch": 1.9631012519218098, + "grad_norm": 1.5548386573791504, + "learning_rate": 5e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7204105854034424, + "num_tokens": 462515740.0, + "step": 17876 + }, + { + "epoch": 1.9632110696244234, + "grad_norm": 1.7479701042175293, + "learning_rate": 5e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7289139032363892, + "num_tokens": 462543401.0, + "step": 17877 + }, + { + "epoch": 1.9633208873270371, + "grad_norm": 1.738829493522644, + "learning_rate": 5e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7377363443374634, + "num_tokens": 462569614.0, + "step": 17878 + }, + { + "epoch": 1.963430705029651, + "grad_norm": 1.7310223579406738, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7522493600845337, + "num_tokens": 462597607.0, + "step": 17879 + }, + { + "epoch": 1.9635405227322644, + "grad_norm": 1.7914109230041504, + "learning_rate": 5e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7448199987411499, + "num_tokens": 462621943.0, + "step": 17880 + }, + { + "epoch": 1.963650340434878, + "grad_norm": 1.8491251468658447, + "learning_rate": 5e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7435413002967834, + "num_tokens": 462645178.0, + "step": 17881 + }, + { + "epoch": 1.9637601581374917, + "grad_norm": 1.737386703491211, + "learning_rate": 5e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7350764870643616, + "num_tokens": 462672843.0, + "step": 17882 + }, + { + "epoch": 1.9638699758401055, + "grad_norm": 1.6774744987487793, + "learning_rate": 5e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7296715974807739, + "num_tokens": 462701560.0, + "step": 17883 + }, + { + "epoch": 1.9639797935427192, + "grad_norm": 1.6530606746673584, + "learning_rate": 5e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7314357161521912, + "num_tokens": 462728320.0, + "step": 17884 + }, + { + "epoch": 1.9640896112453328, + "grad_norm": 1.8476972579956055, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7313583493232727, + "num_tokens": 462753101.0, + "step": 17885 + }, + { + "epoch": 1.9641994289479463, + "grad_norm": 1.9124984741210938, + "learning_rate": 5e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7039880752563477, + "num_tokens": 462779408.0, + "step": 17886 + }, + { + "epoch": 1.96430924665056, + "grad_norm": 1.603926658630371, + "learning_rate": 5e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7325060963630676, + "num_tokens": 462807064.0, + "step": 17887 + }, + { + "epoch": 1.9644190643531738, + "grad_norm": 1.825365662574768, + "learning_rate": 5e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7320995330810547, + "num_tokens": 462831867.0, + "step": 17888 + }, + { + "epoch": 1.9645288820557874, + "grad_norm": 1.6030827760696411, + "learning_rate": 5e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7175798416137695, + "num_tokens": 462863051.0, + "step": 17889 + }, + { + "epoch": 1.964638699758401, + "grad_norm": 1.9391074180603027, + "learning_rate": 5e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7649798393249512, + "num_tokens": 462883487.0, + "step": 17890 + }, + { + "epoch": 1.9647485174610146, + "grad_norm": 1.7625914812088013, + "learning_rate": 5e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7101059556007385, + "num_tokens": 462911988.0, + "step": 17891 + }, + { + "epoch": 1.9648583351636284, + "grad_norm": 1.8132692575454712, + "learning_rate": 5e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7304611206054688, + "num_tokens": 462935215.0, + "step": 17892 + }, + { + "epoch": 1.9649681528662422, + "grad_norm": 1.7347756624221802, + "learning_rate": 5e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7354928255081177, + "num_tokens": 462962626.0, + "step": 17893 + }, + { + "epoch": 1.9650779705688557, + "grad_norm": 1.8259191513061523, + "learning_rate": 5e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7481590509414673, + "num_tokens": 462986063.0, + "step": 17894 + }, + { + "epoch": 1.9651877882714692, + "grad_norm": 1.9324570894241333, + "learning_rate": 5e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7391601800918579, + "num_tokens": 463008129.0, + "step": 17895 + }, + { + "epoch": 1.965297605974083, + "grad_norm": 1.886938214302063, + "learning_rate": 5e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.7424326539039612, + "num_tokens": 463030482.0, + "step": 17896 + }, + { + "epoch": 1.9654074236766967, + "grad_norm": 1.7110577821731567, + "learning_rate": 5e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7250765562057495, + "num_tokens": 463057841.0, + "step": 17897 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 2.0147557258605957, + "learning_rate": 5e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7361763715744019, + "num_tokens": 463079681.0, + "step": 17898 + }, + { + "epoch": 1.965627059081924, + "grad_norm": 1.9710462093353271, + "learning_rate": 5e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7537312507629395, + "num_tokens": 463100329.0, + "step": 17899 + }, + { + "epoch": 1.9657368767845376, + "grad_norm": 1.863777756690979, + "learning_rate": 5e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7537412643432617, + "num_tokens": 463125055.0, + "step": 17900 + }, + { + "epoch": 1.9658466944871513, + "grad_norm": 1.6222339868545532, + "learning_rate": 5e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.725537896156311, + "num_tokens": 463157714.0, + "step": 17901 + }, + { + "epoch": 1.965956512189765, + "grad_norm": 1.8816308975219727, + "learning_rate": 5e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.724124550819397, + "num_tokens": 463181664.0, + "step": 17902 + }, + { + "epoch": 1.9660663298923786, + "grad_norm": 1.9057934284210205, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7518229484558105, + "num_tokens": 463206853.0, + "step": 17903 + }, + { + "epoch": 1.9661761475949922, + "grad_norm": 1.6851897239685059, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7408044338226318, + "num_tokens": 463235875.0, + "step": 17904 + }, + { + "epoch": 1.966285965297606, + "grad_norm": 2.048015832901001, + "learning_rate": 5e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7385134696960449, + "num_tokens": 463259317.0, + "step": 17905 + }, + { + "epoch": 1.9663957830002197, + "grad_norm": 1.7694815397262573, + "learning_rate": 5e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7717318534851074, + "num_tokens": 463285024.0, + "step": 17906 + }, + { + "epoch": 1.9665056007028334, + "grad_norm": 1.5092750787734985, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7325558066368103, + "num_tokens": 463317767.0, + "step": 17907 + }, + { + "epoch": 1.966615418405447, + "grad_norm": 1.7608287334442139, + "learning_rate": 5e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7347247004508972, + "num_tokens": 463342225.0, + "step": 17908 + }, + { + "epoch": 1.9667252361080605, + "grad_norm": 1.8412367105484009, + "learning_rate": 5e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7325970530509949, + "num_tokens": 463366976.0, + "step": 17909 + }, + { + "epoch": 1.9668350538106742, + "grad_norm": 1.7383253574371338, + "learning_rate": 5e-06, + "loss": 0.788, + "mean_token_accuracy": 0.75031578540802, + "num_tokens": 463392404.0, + "step": 17910 + }, + { + "epoch": 1.966944871513288, + "grad_norm": 1.749420166015625, + "learning_rate": 5e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7443742752075195, + "num_tokens": 463417991.0, + "step": 17911 + }, + { + "epoch": 1.9670546892159018, + "grad_norm": 1.798777461051941, + "learning_rate": 5e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7273215055465698, + "num_tokens": 463444730.0, + "step": 17912 + }, + { + "epoch": 1.9671645069185153, + "grad_norm": 1.7196545600891113, + "learning_rate": 5e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7191072702407837, + "num_tokens": 463473674.0, + "step": 17913 + }, + { + "epoch": 1.9672743246211288, + "grad_norm": 2.0100150108337402, + "learning_rate": 5e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.724777102470398, + "num_tokens": 463497724.0, + "step": 17914 + }, + { + "epoch": 1.9673841423237426, + "grad_norm": 1.7130497694015503, + "learning_rate": 5e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.7483989000320435, + "num_tokens": 463523187.0, + "step": 17915 + }, + { + "epoch": 1.9674939600263563, + "grad_norm": 1.736028790473938, + "learning_rate": 5e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7173658609390259, + "num_tokens": 463551133.0, + "step": 17916 + }, + { + "epoch": 1.9676037777289699, + "grad_norm": 1.7882386445999146, + "learning_rate": 5e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7369248867034912, + "num_tokens": 463574106.0, + "step": 17917 + }, + { + "epoch": 1.9677135954315834, + "grad_norm": 1.8100306987762451, + "learning_rate": 5e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7225806713104248, + "num_tokens": 463598497.0, + "step": 17918 + }, + { + "epoch": 1.9678234131341972, + "grad_norm": 1.8174909353256226, + "learning_rate": 5e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7384003400802612, + "num_tokens": 463623067.0, + "step": 17919 + }, + { + "epoch": 1.967933230836811, + "grad_norm": 1.6457444429397583, + "learning_rate": 5e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.736424446105957, + "num_tokens": 463652109.0, + "step": 17920 + }, + { + "epoch": 1.9680430485394247, + "grad_norm": 1.7580029964447021, + "learning_rate": 5e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7434496283531189, + "num_tokens": 463679858.0, + "step": 17921 + }, + { + "epoch": 1.9681528662420382, + "grad_norm": 1.8768360614776611, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7522273063659668, + "num_tokens": 463702615.0, + "step": 17922 + }, + { + "epoch": 1.9682626839446518, + "grad_norm": 1.918993592262268, + "learning_rate": 5e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7310610413551331, + "num_tokens": 463725997.0, + "step": 17923 + }, + { + "epoch": 1.9683725016472655, + "grad_norm": 1.7768934965133667, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.737845778465271, + "num_tokens": 463753926.0, + "step": 17924 + }, + { + "epoch": 1.9684823193498793, + "grad_norm": 1.6239635944366455, + "learning_rate": 5e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7064059972763062, + "num_tokens": 463783632.0, + "step": 17925 + }, + { + "epoch": 1.9685921370524928, + "grad_norm": 1.785370945930481, + "learning_rate": 5e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7355341911315918, + "num_tokens": 463807030.0, + "step": 17926 + }, + { + "epoch": 1.9687019547551066, + "grad_norm": 1.688337802886963, + "learning_rate": 5e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7348416447639465, + "num_tokens": 463835733.0, + "step": 17927 + }, + { + "epoch": 1.96881177245772, + "grad_norm": 2.1379408836364746, + "learning_rate": 5e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7316226959228516, + "num_tokens": 463855341.0, + "step": 17928 + }, + { + "epoch": 1.9689215901603339, + "grad_norm": 1.8844397068023682, + "learning_rate": 5e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7466278076171875, + "num_tokens": 463876297.0, + "step": 17929 + }, + { + "epoch": 1.9690314078629476, + "grad_norm": 1.7320703268051147, + "learning_rate": 5e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7472184896469116, + "num_tokens": 463901910.0, + "step": 17930 + }, + { + "epoch": 1.9691412255655611, + "grad_norm": 1.9093832969665527, + "learning_rate": 5e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7285651564598083, + "num_tokens": 463928558.0, + "step": 17931 + }, + { + "epoch": 1.9692510432681747, + "grad_norm": 1.840781331062317, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7486677169799805, + "num_tokens": 463952559.0, + "step": 17932 + }, + { + "epoch": 1.9693608609707884, + "grad_norm": 1.829139232635498, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7381542325019836, + "num_tokens": 463976971.0, + "step": 17933 + }, + { + "epoch": 1.9694706786734022, + "grad_norm": 1.8366827964782715, + "learning_rate": 5e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7288132905960083, + "num_tokens": 464000834.0, + "step": 17934 + }, + { + "epoch": 1.969580496376016, + "grad_norm": 1.648757815361023, + "learning_rate": 5e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7232214212417603, + "num_tokens": 464029929.0, + "step": 17935 + }, + { + "epoch": 1.9696903140786295, + "grad_norm": 1.6273632049560547, + "learning_rate": 5e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7310130000114441, + "num_tokens": 464058429.0, + "step": 17936 + }, + { + "epoch": 1.969800131781243, + "grad_norm": 1.6470961570739746, + "learning_rate": 5e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7098044157028198, + "num_tokens": 464085853.0, + "step": 17937 + }, + { + "epoch": 1.9699099494838568, + "grad_norm": 1.7294808626174927, + "learning_rate": 5e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7307916879653931, + "num_tokens": 464114179.0, + "step": 17938 + }, + { + "epoch": 1.9700197671864705, + "grad_norm": 1.5947010517120361, + "learning_rate": 5e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.6976314783096313, + "num_tokens": 464149480.0, + "step": 17939 + }, + { + "epoch": 1.970129584889084, + "grad_norm": 1.7686705589294434, + "learning_rate": 5e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7114133238792419, + "num_tokens": 464177785.0, + "step": 17940 + }, + { + "epoch": 1.9702394025916978, + "grad_norm": 1.8949248790740967, + "learning_rate": 5e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7128744721412659, + "num_tokens": 464201341.0, + "step": 17941 + }, + { + "epoch": 1.9703492202943114, + "grad_norm": 1.6018718481063843, + "learning_rate": 5e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7075070738792419, + "num_tokens": 464232549.0, + "step": 17942 + }, + { + "epoch": 1.9704590379969251, + "grad_norm": 2.0672755241394043, + "learning_rate": 5e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7385932207107544, + "num_tokens": 464252202.0, + "step": 17943 + }, + { + "epoch": 1.9705688556995389, + "grad_norm": 1.7867885828018188, + "learning_rate": 5e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7452638149261475, + "num_tokens": 464277779.0, + "step": 17944 + }, + { + "epoch": 1.9706786734021524, + "grad_norm": 1.6230669021606445, + "learning_rate": 5e-06, + "loss": 0.845, + "mean_token_accuracy": 0.731990396976471, + "num_tokens": 464306918.0, + "step": 17945 + }, + { + "epoch": 1.970788491104766, + "grad_norm": 1.6172798871994019, + "learning_rate": 5e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7291996479034424, + "num_tokens": 464338437.0, + "step": 17946 + }, + { + "epoch": 1.9708983088073797, + "grad_norm": 1.5917916297912598, + "learning_rate": 5e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7376798987388611, + "num_tokens": 464371138.0, + "step": 17947 + }, + { + "epoch": 1.9710081265099935, + "grad_norm": 1.7805171012878418, + "learning_rate": 5e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7114105224609375, + "num_tokens": 464397865.0, + "step": 17948 + }, + { + "epoch": 1.9711179442126072, + "grad_norm": 1.6766860485076904, + "learning_rate": 5e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7137948274612427, + "num_tokens": 464428682.0, + "step": 17949 + }, + { + "epoch": 1.9712277619152208, + "grad_norm": 1.686690092086792, + "learning_rate": 5e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7290138602256775, + "num_tokens": 464457449.0, + "step": 17950 + }, + { + "epoch": 1.9713375796178343, + "grad_norm": 2.110774040222168, + "learning_rate": 5e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.742080569267273, + "num_tokens": 464476434.0, + "step": 17951 + }, + { + "epoch": 1.971447397320448, + "grad_norm": 1.7275620698928833, + "learning_rate": 5e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7232175469398499, + "num_tokens": 464504108.0, + "step": 17952 + }, + { + "epoch": 1.9715572150230618, + "grad_norm": 1.8797893524169922, + "learning_rate": 5e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7159094214439392, + "num_tokens": 464528263.0, + "step": 17953 + }, + { + "epoch": 1.9716670327256753, + "grad_norm": 1.7829735279083252, + "learning_rate": 5e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7493914365768433, + "num_tokens": 464551642.0, + "step": 17954 + }, + { + "epoch": 1.9717768504282889, + "grad_norm": 1.9216725826263428, + "learning_rate": 5e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.729724109172821, + "num_tokens": 464574320.0, + "step": 17955 + }, + { + "epoch": 1.9718866681309026, + "grad_norm": 1.8511099815368652, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7340552806854248, + "num_tokens": 464597198.0, + "step": 17956 + }, + { + "epoch": 1.9719964858335164, + "grad_norm": 1.7420570850372314, + "learning_rate": 5e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7030283808708191, + "num_tokens": 464626294.0, + "step": 17957 + }, + { + "epoch": 1.9721063035361301, + "grad_norm": 1.7982842922210693, + "learning_rate": 5e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7263482809066772, + "num_tokens": 464652265.0, + "step": 17958 + }, + { + "epoch": 1.9722161212387437, + "grad_norm": 1.982338547706604, + "learning_rate": 5e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7209134101867676, + "num_tokens": 464673494.0, + "step": 17959 + }, + { + "epoch": 1.9723259389413572, + "grad_norm": 1.7497156858444214, + "learning_rate": 5e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7347822785377502, + "num_tokens": 464700238.0, + "step": 17960 + }, + { + "epoch": 1.972435756643971, + "grad_norm": 1.8353763818740845, + "learning_rate": 5e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7454342246055603, + "num_tokens": 464724472.0, + "step": 17961 + }, + { + "epoch": 1.9725455743465847, + "grad_norm": 1.8121581077575684, + "learning_rate": 5e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7137789726257324, + "num_tokens": 464750950.0, + "step": 17962 + }, + { + "epoch": 1.9726553920491985, + "grad_norm": 1.7173445224761963, + "learning_rate": 5e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7322049140930176, + "num_tokens": 464778667.0, + "step": 17963 + }, + { + "epoch": 1.972765209751812, + "grad_norm": 1.8118287324905396, + "learning_rate": 5e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.7494828701019287, + "num_tokens": 464802469.0, + "step": 17964 + }, + { + "epoch": 1.9728750274544256, + "grad_norm": 1.8158199787139893, + "learning_rate": 5e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7460451126098633, + "num_tokens": 464827064.0, + "step": 17965 + }, + { + "epoch": 1.9729848451570393, + "grad_norm": 1.7897560596466064, + "learning_rate": 5e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7366325855255127, + "num_tokens": 464853603.0, + "step": 17966 + }, + { + "epoch": 1.973094662859653, + "grad_norm": 1.9119484424591064, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7547131776809692, + "num_tokens": 464875352.0, + "step": 17967 + }, + { + "epoch": 1.9732044805622666, + "grad_norm": 2.078237771987915, + "learning_rate": 5e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.749058723449707, + "num_tokens": 464895272.0, + "step": 17968 + }, + { + "epoch": 1.9733142982648801, + "grad_norm": 1.6187247037887573, + "learning_rate": 5e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7398449778556824, + "num_tokens": 464924794.0, + "step": 17969 + }, + { + "epoch": 1.973424115967494, + "grad_norm": 1.7757964134216309, + "learning_rate": 5e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.741343080997467, + "num_tokens": 464950667.0, + "step": 17970 + }, + { + "epoch": 1.9735339336701077, + "grad_norm": 1.904464602470398, + "learning_rate": 5e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7231802344322205, + "num_tokens": 464974570.0, + "step": 17971 + }, + { + "epoch": 1.9736437513727214, + "grad_norm": 1.8989347219467163, + "learning_rate": 5e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7145393490791321, + "num_tokens": 465002481.0, + "step": 17972 + }, + { + "epoch": 1.973753569075335, + "grad_norm": 1.951574444770813, + "learning_rate": 5e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7493963241577148, + "num_tokens": 465024834.0, + "step": 17973 + }, + { + "epoch": 1.9738633867779485, + "grad_norm": 1.7919361591339111, + "learning_rate": 5e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7227821946144104, + "num_tokens": 465050073.0, + "step": 17974 + }, + { + "epoch": 1.9739732044805622, + "grad_norm": 2.223116159439087, + "learning_rate": 5e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7291179299354553, + "num_tokens": 465069364.0, + "step": 17975 + }, + { + "epoch": 1.974083022183176, + "grad_norm": 1.963468074798584, + "learning_rate": 5e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7412683367729187, + "num_tokens": 465091080.0, + "step": 17976 + }, + { + "epoch": 1.9741928398857898, + "grad_norm": 1.6494901180267334, + "learning_rate": 5e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7162227630615234, + "num_tokens": 465120436.0, + "step": 17977 + }, + { + "epoch": 1.9743026575884033, + "grad_norm": 1.6320960521697998, + "learning_rate": 5e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7424758672714233, + "num_tokens": 465148573.0, + "step": 17978 + }, + { + "epoch": 1.9744124752910168, + "grad_norm": 1.8400707244873047, + "learning_rate": 5e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.7368863821029663, + "num_tokens": 465171960.0, + "step": 17979 + }, + { + "epoch": 1.9745222929936306, + "grad_norm": 1.7246614694595337, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7474192380905151, + "num_tokens": 465196842.0, + "step": 17980 + }, + { + "epoch": 1.9746321106962443, + "grad_norm": 1.6189892292022705, + "learning_rate": 5e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7175116539001465, + "num_tokens": 465229910.0, + "step": 17981 + }, + { + "epoch": 1.9747419283988579, + "grad_norm": 1.9402979612350464, + "learning_rate": 5e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7210794687271118, + "num_tokens": 465253669.0, + "step": 17982 + }, + { + "epoch": 1.9748517461014714, + "grad_norm": 1.7176319360733032, + "learning_rate": 5e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7344839572906494, + "num_tokens": 465282668.0, + "step": 17983 + }, + { + "epoch": 1.9749615638040852, + "grad_norm": 1.8821183443069458, + "learning_rate": 5e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7410570383071899, + "num_tokens": 465306503.0, + "step": 17984 + }, + { + "epoch": 1.975071381506699, + "grad_norm": 1.8761749267578125, + "learning_rate": 5e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7476288080215454, + "num_tokens": 465328366.0, + "step": 17985 + }, + { + "epoch": 1.9751811992093127, + "grad_norm": 1.8216214179992676, + "learning_rate": 5e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7212648391723633, + "num_tokens": 465353897.0, + "step": 17986 + }, + { + "epoch": 1.9752910169119262, + "grad_norm": 1.7641665935516357, + "learning_rate": 5e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7458447217941284, + "num_tokens": 465378560.0, + "step": 17987 + }, + { + "epoch": 1.9754008346145397, + "grad_norm": 1.8713213205337524, + "learning_rate": 5e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7232652902603149, + "num_tokens": 465402292.0, + "step": 17988 + }, + { + "epoch": 1.9755106523171535, + "grad_norm": 1.8881876468658447, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7446556091308594, + "num_tokens": 465423623.0, + "step": 17989 + }, + { + "epoch": 1.9756204700197673, + "grad_norm": 1.7897493839263916, + "learning_rate": 5e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.7372680902481079, + "num_tokens": 465448862.0, + "step": 17990 + }, + { + "epoch": 1.9757302877223808, + "grad_norm": 1.8025431632995605, + "learning_rate": 5e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7387732267379761, + "num_tokens": 465474575.0, + "step": 17991 + }, + { + "epoch": 1.9758401054249946, + "grad_norm": 1.6022529602050781, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7635620832443237, + "num_tokens": 465503899.0, + "step": 17992 + }, + { + "epoch": 1.975949923127608, + "grad_norm": 1.780258059501648, + "learning_rate": 5e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.726045548915863, + "num_tokens": 465528747.0, + "step": 17993 + }, + { + "epoch": 1.9760597408302218, + "grad_norm": 1.5902780294418335, + "learning_rate": 5e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7235893607139587, + "num_tokens": 465564174.0, + "step": 17994 + }, + { + "epoch": 1.9761695585328356, + "grad_norm": 1.91342031955719, + "learning_rate": 5e-06, + "loss": 0.762, + "mean_token_accuracy": 0.7541713714599609, + "num_tokens": 465585415.0, + "step": 17995 + }, + { + "epoch": 1.9762793762354491, + "grad_norm": 1.9548237323760986, + "learning_rate": 5e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7368776798248291, + "num_tokens": 465608081.0, + "step": 17996 + }, + { + "epoch": 1.9763891939380627, + "grad_norm": 1.767051339149475, + "learning_rate": 5e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7353806495666504, + "num_tokens": 465631780.0, + "step": 17997 + }, + { + "epoch": 1.9764990116406764, + "grad_norm": 1.8006905317306519, + "learning_rate": 5e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7327153086662292, + "num_tokens": 465655722.0, + "step": 17998 + }, + { + "epoch": 1.9766088293432902, + "grad_norm": 1.7745814323425293, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7282005548477173, + "num_tokens": 465683844.0, + "step": 17999 + }, + { + "epoch": 1.976718647045904, + "grad_norm": 1.7276873588562012, + "learning_rate": 5e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7346913814544678, + "num_tokens": 465711397.0, + "step": 18000 + }, + { + "epoch": 1.9768284647485175, + "grad_norm": 1.6403772830963135, + "learning_rate": 5e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7519904971122742, + "num_tokens": 465740164.0, + "step": 18001 + }, + { + "epoch": 1.976938282451131, + "grad_norm": 1.9723424911499023, + "learning_rate": 5e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.7449188232421875, + "num_tokens": 465761288.0, + "step": 18002 + }, + { + "epoch": 1.9770481001537448, + "grad_norm": 1.9301083087921143, + "learning_rate": 5e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7389240860939026, + "num_tokens": 465786065.0, + "step": 18003 + }, + { + "epoch": 1.9771579178563585, + "grad_norm": 1.7279242277145386, + "learning_rate": 5e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7273892760276794, + "num_tokens": 465815422.0, + "step": 18004 + }, + { + "epoch": 1.977267735558972, + "grad_norm": 1.8391892910003662, + "learning_rate": 5e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7369755506515503, + "num_tokens": 465839695.0, + "step": 18005 + }, + { + "epoch": 1.9773775532615858, + "grad_norm": 1.7497594356536865, + "learning_rate": 5e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7400826811790466, + "num_tokens": 465864302.0, + "step": 18006 + }, + { + "epoch": 1.9774873709641994, + "grad_norm": 1.8821995258331299, + "learning_rate": 5e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7321691513061523, + "num_tokens": 465886489.0, + "step": 18007 + }, + { + "epoch": 1.977597188666813, + "grad_norm": 1.754391074180603, + "learning_rate": 5e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.716781735420227, + "num_tokens": 465911861.0, + "step": 18008 + }, + { + "epoch": 1.9777070063694269, + "grad_norm": 1.7428300380706787, + "learning_rate": 5e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7519400119781494, + "num_tokens": 465937794.0, + "step": 18009 + }, + { + "epoch": 1.9778168240720404, + "grad_norm": 1.8622748851776123, + "learning_rate": 5e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7232638001441956, + "num_tokens": 465963526.0, + "step": 18010 + }, + { + "epoch": 1.977926641774654, + "grad_norm": 1.950326919555664, + "learning_rate": 5e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7234823703765869, + "num_tokens": 465987544.0, + "step": 18011 + }, + { + "epoch": 1.9780364594772677, + "grad_norm": 1.6473976373672485, + "learning_rate": 5e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7200732231140137, + "num_tokens": 466018041.0, + "step": 18012 + }, + { + "epoch": 1.9781462771798815, + "grad_norm": 1.8539849519729614, + "learning_rate": 5e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7394740581512451, + "num_tokens": 466040399.0, + "step": 18013 + }, + { + "epoch": 1.9782560948824952, + "grad_norm": 1.6186550855636597, + "learning_rate": 5e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7427954077720642, + "num_tokens": 466071886.0, + "step": 18014 + }, + { + "epoch": 1.9783659125851087, + "grad_norm": 2.20540189743042, + "learning_rate": 5e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7562011480331421, + "num_tokens": 466090305.0, + "step": 18015 + }, + { + "epoch": 1.9784757302877223, + "grad_norm": 1.908455729484558, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7737476825714111, + "num_tokens": 466110532.0, + "step": 18016 + }, + { + "epoch": 1.978585547990336, + "grad_norm": 1.9032809734344482, + "learning_rate": 5e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.733578085899353, + "num_tokens": 466135874.0, + "step": 18017 + }, + { + "epoch": 1.9786953656929498, + "grad_norm": 1.9489457607269287, + "learning_rate": 5e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.7537487149238586, + "num_tokens": 466157991.0, + "step": 18018 + }, + { + "epoch": 1.9788051833955633, + "grad_norm": 1.8564430475234985, + "learning_rate": 5e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7542015910148621, + "num_tokens": 466180546.0, + "step": 18019 + }, + { + "epoch": 1.9789150010981769, + "grad_norm": 1.709352731704712, + "learning_rate": 5e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7346444725990295, + "num_tokens": 466206897.0, + "step": 18020 + }, + { + "epoch": 1.9790248188007906, + "grad_norm": 1.9598007202148438, + "learning_rate": 5e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7229733467102051, + "num_tokens": 466228801.0, + "step": 18021 + }, + { + "epoch": 1.9791346365034044, + "grad_norm": 1.5832246541976929, + "learning_rate": 5e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7339609861373901, + "num_tokens": 466259483.0, + "step": 18022 + }, + { + "epoch": 1.9792444542060181, + "grad_norm": 1.6813021898269653, + "learning_rate": 5e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7210763692855835, + "num_tokens": 466288223.0, + "step": 18023 + }, + { + "epoch": 1.9793542719086317, + "grad_norm": 1.7783628702163696, + "learning_rate": 5e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7316965460777283, + "num_tokens": 466312730.0, + "step": 18024 + }, + { + "epoch": 1.9794640896112452, + "grad_norm": 1.679127812385559, + "learning_rate": 5e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7309125065803528, + "num_tokens": 466339823.0, + "step": 18025 + }, + { + "epoch": 1.979573907313859, + "grad_norm": 1.8670028448104858, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7492399215698242, + "num_tokens": 466362331.0, + "step": 18026 + }, + { + "epoch": 1.9796837250164727, + "grad_norm": 1.7508890628814697, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7365837693214417, + "num_tokens": 466388371.0, + "step": 18027 + }, + { + "epoch": 1.9797935427190865, + "grad_norm": 1.9965640306472778, + "learning_rate": 5e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7120374441146851, + "num_tokens": 466413058.0, + "step": 18028 + }, + { + "epoch": 1.9799033604217, + "grad_norm": 1.8386540412902832, + "learning_rate": 5e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7294567823410034, + "num_tokens": 466438047.0, + "step": 18029 + }, + { + "epoch": 1.9800131781243135, + "grad_norm": 1.7862372398376465, + "learning_rate": 5e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7398601770401001, + "num_tokens": 466465073.0, + "step": 18030 + }, + { + "epoch": 1.9801229958269273, + "grad_norm": 1.8760223388671875, + "learning_rate": 5e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7411463260650635, + "num_tokens": 466488860.0, + "step": 18031 + }, + { + "epoch": 1.980232813529541, + "grad_norm": 1.8231877088546753, + "learning_rate": 5e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7399417161941528, + "num_tokens": 466512018.0, + "step": 18032 + }, + { + "epoch": 1.9803426312321546, + "grad_norm": 1.593469500541687, + "learning_rate": 5e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7254690527915955, + "num_tokens": 466543849.0, + "step": 18033 + }, + { + "epoch": 1.9804524489347681, + "grad_norm": 1.9494527578353882, + "learning_rate": 5e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7339475154876709, + "num_tokens": 466566285.0, + "step": 18034 + }, + { + "epoch": 1.9805622666373819, + "grad_norm": 1.8585017919540405, + "learning_rate": 5e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.707351803779602, + "num_tokens": 466594702.0, + "step": 18035 + }, + { + "epoch": 1.9806720843399956, + "grad_norm": 1.8187263011932373, + "learning_rate": 5e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7518799304962158, + "num_tokens": 466616901.0, + "step": 18036 + }, + { + "epoch": 1.9807819020426094, + "grad_norm": 1.7005795240402222, + "learning_rate": 5e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7528551816940308, + "num_tokens": 466644441.0, + "step": 18037 + }, + { + "epoch": 1.980891719745223, + "grad_norm": 1.75569748878479, + "learning_rate": 5e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.730156421661377, + "num_tokens": 466671740.0, + "step": 18038 + }, + { + "epoch": 1.9810015374478365, + "grad_norm": 1.814038872718811, + "learning_rate": 5e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7522367238998413, + "num_tokens": 466696399.0, + "step": 18039 + }, + { + "epoch": 1.9811113551504502, + "grad_norm": 1.6925477981567383, + "learning_rate": 5e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.732326328754425, + "num_tokens": 466726753.0, + "step": 18040 + }, + { + "epoch": 1.981221172853064, + "grad_norm": 2.0000643730163574, + "learning_rate": 5e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7321720719337463, + "num_tokens": 466750646.0, + "step": 18041 + }, + { + "epoch": 1.9813309905556777, + "grad_norm": 1.4028379917144775, + "learning_rate": 5e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7168235778808594, + "num_tokens": 466786433.0, + "step": 18042 + }, + { + "epoch": 1.9814408082582913, + "grad_norm": 1.8943207263946533, + "learning_rate": 5e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7231911420822144, + "num_tokens": 466808963.0, + "step": 18043 + }, + { + "epoch": 1.9815506259609048, + "grad_norm": 1.8913055658340454, + "learning_rate": 5e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7345530390739441, + "num_tokens": 466831119.0, + "step": 18044 + }, + { + "epoch": 1.9816604436635186, + "grad_norm": 1.637021780014038, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7533524036407471, + "num_tokens": 466859259.0, + "step": 18045 + }, + { + "epoch": 1.9817702613661323, + "grad_norm": 1.7022796869277954, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7451679706573486, + "num_tokens": 466886502.0, + "step": 18046 + }, + { + "epoch": 1.9818800790687459, + "grad_norm": 1.9167345762252808, + "learning_rate": 5e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7345750331878662, + "num_tokens": 466910012.0, + "step": 18047 + }, + { + "epoch": 1.9819898967713594, + "grad_norm": 1.844614028930664, + "learning_rate": 5e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7216718196868896, + "num_tokens": 466935430.0, + "step": 18048 + }, + { + "epoch": 1.9820997144739732, + "grad_norm": 1.8109383583068848, + "learning_rate": 5e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7322134971618652, + "num_tokens": 466961056.0, + "step": 18049 + }, + { + "epoch": 1.982209532176587, + "grad_norm": 1.7761738300323486, + "learning_rate": 5e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7351800203323364, + "num_tokens": 466986019.0, + "step": 18050 + }, + { + "epoch": 1.9823193498792007, + "grad_norm": 1.8136272430419922, + "learning_rate": 5e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.709406852722168, + "num_tokens": 467011876.0, + "step": 18051 + }, + { + "epoch": 1.9824291675818142, + "grad_norm": 1.8120197057724, + "learning_rate": 5e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7145103216171265, + "num_tokens": 467039544.0, + "step": 18052 + }, + { + "epoch": 1.9825389852844277, + "grad_norm": 1.597671389579773, + "learning_rate": 5e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7382171750068665, + "num_tokens": 467069412.0, + "step": 18053 + }, + { + "epoch": 1.9826488029870415, + "grad_norm": 1.9917678833007812, + "learning_rate": 5e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7367358803749084, + "num_tokens": 467090884.0, + "step": 18054 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 1.7798765897750854, + "learning_rate": 5e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7214096784591675, + "num_tokens": 467116325.0, + "step": 18055 + }, + { + "epoch": 1.9828684383922688, + "grad_norm": 1.8155616521835327, + "learning_rate": 5e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7373575568199158, + "num_tokens": 467141767.0, + "step": 18056 + }, + { + "epoch": 1.9829782560948825, + "grad_norm": 1.821066975593567, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7278088331222534, + "num_tokens": 467167219.0, + "step": 18057 + }, + { + "epoch": 1.983088073797496, + "grad_norm": 1.7235918045043945, + "learning_rate": 5e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7142966985702515, + "num_tokens": 467194875.0, + "step": 18058 + }, + { + "epoch": 1.9831978915001098, + "grad_norm": 2.0421533584594727, + "learning_rate": 5e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7318717241287231, + "num_tokens": 467215763.0, + "step": 18059 + }, + { + "epoch": 1.9833077092027236, + "grad_norm": 1.9001028537750244, + "learning_rate": 5e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7268586754798889, + "num_tokens": 467241082.0, + "step": 18060 + }, + { + "epoch": 1.9834175269053371, + "grad_norm": 1.6128369569778442, + "learning_rate": 5e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7146260738372803, + "num_tokens": 467273173.0, + "step": 18061 + }, + { + "epoch": 1.9835273446079507, + "grad_norm": 1.7766640186309814, + "learning_rate": 5e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7281486988067627, + "num_tokens": 467297879.0, + "step": 18062 + }, + { + "epoch": 1.9836371623105644, + "grad_norm": 1.6748027801513672, + "learning_rate": 5e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7488488554954529, + "num_tokens": 467326622.0, + "step": 18063 + }, + { + "epoch": 1.9837469800131782, + "grad_norm": 1.868328332901001, + "learning_rate": 5e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7363418936729431, + "num_tokens": 467351041.0, + "step": 18064 + }, + { + "epoch": 1.983856797715792, + "grad_norm": 1.8156055212020874, + "learning_rate": 5e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7121447920799255, + "num_tokens": 467379525.0, + "step": 18065 + }, + { + "epoch": 1.9839666154184055, + "grad_norm": 1.7409789562225342, + "learning_rate": 5e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7275463342666626, + "num_tokens": 467404921.0, + "step": 18066 + }, + { + "epoch": 1.984076433121019, + "grad_norm": 1.8655927181243896, + "learning_rate": 5e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7367829084396362, + "num_tokens": 467426597.0, + "step": 18067 + }, + { + "epoch": 1.9841862508236328, + "grad_norm": 1.7931907176971436, + "learning_rate": 5e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.729827880859375, + "num_tokens": 467450037.0, + "step": 18068 + }, + { + "epoch": 1.9842960685262465, + "grad_norm": 1.8667759895324707, + "learning_rate": 5e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7267106175422668, + "num_tokens": 467475090.0, + "step": 18069 + }, + { + "epoch": 1.98440588622886, + "grad_norm": 1.6627827882766724, + "learning_rate": 5e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7460949420928955, + "num_tokens": 467503413.0, + "step": 18070 + }, + { + "epoch": 1.9845157039314738, + "grad_norm": 1.818359613418579, + "learning_rate": 5e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7179481983184814, + "num_tokens": 467529973.0, + "step": 18071 + }, + { + "epoch": 1.9846255216340873, + "grad_norm": 1.7088724374771118, + "learning_rate": 5e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7015334963798523, + "num_tokens": 467558497.0, + "step": 18072 + }, + { + "epoch": 1.984735339336701, + "grad_norm": 1.743949294090271, + "learning_rate": 5e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7401009798049927, + "num_tokens": 467583163.0, + "step": 18073 + }, + { + "epoch": 1.9848451570393149, + "grad_norm": 1.8461288213729858, + "learning_rate": 5e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7125828266143799, + "num_tokens": 467607787.0, + "step": 18074 + }, + { + "epoch": 1.9849549747419284, + "grad_norm": 1.6483370065689087, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7396442890167236, + "num_tokens": 467633810.0, + "step": 18075 + }, + { + "epoch": 1.985064792444542, + "grad_norm": 1.6478209495544434, + "learning_rate": 5e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7333157062530518, + "num_tokens": 467661481.0, + "step": 18076 + }, + { + "epoch": 1.9851746101471557, + "grad_norm": 2.1645305156707764, + "learning_rate": 5e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7533427476882935, + "num_tokens": 467679582.0, + "step": 18077 + }, + { + "epoch": 1.9852844278497694, + "grad_norm": 1.558143138885498, + "learning_rate": 5e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.732965350151062, + "num_tokens": 467710384.0, + "step": 18078 + }, + { + "epoch": 1.9853942455523832, + "grad_norm": 1.940119981765747, + "learning_rate": 5e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7461143732070923, + "num_tokens": 467731497.0, + "step": 18079 + }, + { + "epoch": 1.9855040632549967, + "grad_norm": 1.8705486059188843, + "learning_rate": 5e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7283962368965149, + "num_tokens": 467756594.0, + "step": 18080 + }, + { + "epoch": 1.9856138809576103, + "grad_norm": 1.5155848264694214, + "learning_rate": 5e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7380990386009216, + "num_tokens": 467787846.0, + "step": 18081 + }, + { + "epoch": 1.985723698660224, + "grad_norm": 1.6265199184417725, + "learning_rate": 5e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7211995720863342, + "num_tokens": 467817965.0, + "step": 18082 + }, + { + "epoch": 1.9858335163628378, + "grad_norm": 2.004483461380005, + "learning_rate": 5e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7395274639129639, + "num_tokens": 467839926.0, + "step": 18083 + }, + { + "epoch": 1.9859433340654513, + "grad_norm": 1.7958801984786987, + "learning_rate": 5e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7358769774436951, + "num_tokens": 467863175.0, + "step": 18084 + }, + { + "epoch": 1.9860531517680649, + "grad_norm": 1.5712335109710693, + "learning_rate": 5e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7315323352813721, + "num_tokens": 467891820.0, + "step": 18085 + }, + { + "epoch": 1.9861629694706786, + "grad_norm": 1.5782686471939087, + "learning_rate": 5e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7287733554840088, + "num_tokens": 467923683.0, + "step": 18086 + }, + { + "epoch": 1.9862727871732924, + "grad_norm": 1.7527856826782227, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7229715585708618, + "num_tokens": 467952118.0, + "step": 18087 + }, + { + "epoch": 1.9863826048759061, + "grad_norm": 1.5450139045715332, + "learning_rate": 5e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7355296611785889, + "num_tokens": 467982242.0, + "step": 18088 + }, + { + "epoch": 1.9864924225785197, + "grad_norm": 1.655632495880127, + "learning_rate": 5e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7458650469779968, + "num_tokens": 468009688.0, + "step": 18089 + }, + { + "epoch": 1.9866022402811332, + "grad_norm": 1.7084262371063232, + "learning_rate": 5e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.732275664806366, + "num_tokens": 468037334.0, + "step": 18090 + }, + { + "epoch": 1.986712057983747, + "grad_norm": 1.7258248329162598, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7595893144607544, + "num_tokens": 468064530.0, + "step": 18091 + }, + { + "epoch": 1.9868218756863607, + "grad_norm": 1.864790916442871, + "learning_rate": 5e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.720598578453064, + "num_tokens": 468089103.0, + "step": 18092 + }, + { + "epoch": 1.9869316933889745, + "grad_norm": 1.7724214792251587, + "learning_rate": 5e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7346248626708984, + "num_tokens": 468116509.0, + "step": 18093 + }, + { + "epoch": 1.987041511091588, + "grad_norm": 1.587701678276062, + "learning_rate": 5e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.745169460773468, + "num_tokens": 468146907.0, + "step": 18094 + }, + { + "epoch": 1.9871513287942015, + "grad_norm": 1.818946123123169, + "learning_rate": 5e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7187018394470215, + "num_tokens": 468173645.0, + "step": 18095 + }, + { + "epoch": 1.9872611464968153, + "grad_norm": 1.7272250652313232, + "learning_rate": 5e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7375233769416809, + "num_tokens": 468200002.0, + "step": 18096 + }, + { + "epoch": 1.987370964199429, + "grad_norm": 1.6406749486923218, + "learning_rate": 5e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.727759063243866, + "num_tokens": 468230492.0, + "step": 18097 + }, + { + "epoch": 1.9874807819020426, + "grad_norm": 1.738896131515503, + "learning_rate": 5e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7206134796142578, + "num_tokens": 468257615.0, + "step": 18098 + }, + { + "epoch": 1.9875905996046561, + "grad_norm": 1.8220727443695068, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7549938559532166, + "num_tokens": 468282916.0, + "step": 18099 + }, + { + "epoch": 1.9877004173072699, + "grad_norm": 1.7583117485046387, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7337946891784668, + "num_tokens": 468308572.0, + "step": 18100 + }, + { + "epoch": 1.9878102350098836, + "grad_norm": 1.5996683835983276, + "learning_rate": 5e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7306607961654663, + "num_tokens": 468338373.0, + "step": 18101 + }, + { + "epoch": 1.9879200527124974, + "grad_norm": 1.7851475477218628, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7429300546646118, + "num_tokens": 468364205.0, + "step": 18102 + }, + { + "epoch": 1.988029870415111, + "grad_norm": 1.8532768487930298, + "learning_rate": 5e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7297694683074951, + "num_tokens": 468388764.0, + "step": 18103 + }, + { + "epoch": 1.9881396881177245, + "grad_norm": 1.688270092010498, + "learning_rate": 5e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7161343097686768, + "num_tokens": 468416854.0, + "step": 18104 + }, + { + "epoch": 1.9882495058203382, + "grad_norm": 1.7405287027359009, + "learning_rate": 5e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7163622379302979, + "num_tokens": 468446259.0, + "step": 18105 + }, + { + "epoch": 1.988359323522952, + "grad_norm": 1.8282629251480103, + "learning_rate": 5e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.7399452328681946, + "num_tokens": 468469949.0, + "step": 18106 + }, + { + "epoch": 1.9884691412255655, + "grad_norm": 1.8516298532485962, + "learning_rate": 5e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.742522120475769, + "num_tokens": 468492228.0, + "step": 18107 + }, + { + "epoch": 1.9885789589281793, + "grad_norm": 1.6734980344772339, + "learning_rate": 5e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7146487236022949, + "num_tokens": 468521099.0, + "step": 18108 + }, + { + "epoch": 1.9886887766307928, + "grad_norm": 1.9204368591308594, + "learning_rate": 5e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.721555233001709, + "num_tokens": 468544207.0, + "step": 18109 + }, + { + "epoch": 1.9887985943334066, + "grad_norm": 1.616159200668335, + "learning_rate": 5e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7196215391159058, + "num_tokens": 468572495.0, + "step": 18110 + }, + { + "epoch": 1.9889084120360203, + "grad_norm": 1.6932488679885864, + "learning_rate": 5e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7414349913597107, + "num_tokens": 468599446.0, + "step": 18111 + }, + { + "epoch": 1.9890182297386338, + "grad_norm": 1.6472238302230835, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7364274859428406, + "num_tokens": 468630873.0, + "step": 18112 + }, + { + "epoch": 1.9891280474412474, + "grad_norm": 1.6252965927124023, + "learning_rate": 5e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7033013105392456, + "num_tokens": 468663838.0, + "step": 18113 + }, + { + "epoch": 1.9892378651438611, + "grad_norm": 1.806769609451294, + "learning_rate": 5e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7408091425895691, + "num_tokens": 468688955.0, + "step": 18114 + }, + { + "epoch": 1.989347682846475, + "grad_norm": 1.8653467893600464, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7431445121765137, + "num_tokens": 468710369.0, + "step": 18115 + }, + { + "epoch": 1.9894575005490887, + "grad_norm": 1.8197109699249268, + "learning_rate": 5e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7615389823913574, + "num_tokens": 468732983.0, + "step": 18116 + }, + { + "epoch": 1.9895673182517022, + "grad_norm": 1.8839945793151855, + "learning_rate": 5e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7200539112091064, + "num_tokens": 468755830.0, + "step": 18117 + }, + { + "epoch": 1.9896771359543157, + "grad_norm": 1.8489481210708618, + "learning_rate": 5e-06, + "loss": 0.819, + "mean_token_accuracy": 0.735745906829834, + "num_tokens": 468780401.0, + "step": 18118 + }, + { + "epoch": 1.9897869536569295, + "grad_norm": 1.9772578477859497, + "learning_rate": 5e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7429102659225464, + "num_tokens": 468800780.0, + "step": 18119 + }, + { + "epoch": 1.9898967713595432, + "grad_norm": 1.75351881980896, + "learning_rate": 5e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7373605966567993, + "num_tokens": 468827520.0, + "step": 18120 + }, + { + "epoch": 1.9900065890621568, + "grad_norm": 1.7945456504821777, + "learning_rate": 5e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7233302593231201, + "num_tokens": 468852699.0, + "step": 18121 + }, + { + "epoch": 1.9901164067647705, + "grad_norm": 1.5875120162963867, + "learning_rate": 5e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.736945390701294, + "num_tokens": 468882406.0, + "step": 18122 + }, + { + "epoch": 1.990226224467384, + "grad_norm": 1.9723665714263916, + "learning_rate": 5e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7611296772956848, + "num_tokens": 468903160.0, + "step": 18123 + }, + { + "epoch": 1.9903360421699978, + "grad_norm": 1.6092827320098877, + "learning_rate": 5e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7100878953933716, + "num_tokens": 468934012.0, + "step": 18124 + }, + { + "epoch": 1.9904458598726116, + "grad_norm": 1.6043331623077393, + "learning_rate": 5e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.717856764793396, + "num_tokens": 468964574.0, + "step": 18125 + }, + { + "epoch": 1.9905556775752251, + "grad_norm": 2.140002965927124, + "learning_rate": 5e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7361443042755127, + "num_tokens": 468983885.0, + "step": 18126 + }, + { + "epoch": 1.9906654952778386, + "grad_norm": 1.6474220752716064, + "learning_rate": 5e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.752463161945343, + "num_tokens": 469011449.0, + "step": 18127 + }, + { + "epoch": 1.9907753129804524, + "grad_norm": 1.5884379148483276, + "learning_rate": 5e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7270295023918152, + "num_tokens": 469042231.0, + "step": 18128 + }, + { + "epoch": 1.9908851306830662, + "grad_norm": 1.7419285774230957, + "learning_rate": 5e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7280406951904297, + "num_tokens": 469070241.0, + "step": 18129 + }, + { + "epoch": 1.99099494838568, + "grad_norm": 1.8300851583480835, + "learning_rate": 5e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7148792743682861, + "num_tokens": 469096951.0, + "step": 18130 + }, + { + "epoch": 1.9911047660882935, + "grad_norm": 1.9004555940628052, + "learning_rate": 5e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7518566846847534, + "num_tokens": 469119332.0, + "step": 18131 + }, + { + "epoch": 1.991214583790907, + "grad_norm": 1.7700282335281372, + "learning_rate": 5e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7384944558143616, + "num_tokens": 469144104.0, + "step": 18132 + }, + { + "epoch": 1.9913244014935207, + "grad_norm": 1.9446492195129395, + "learning_rate": 5e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7287460565567017, + "num_tokens": 469167428.0, + "step": 18133 + }, + { + "epoch": 1.9914342191961345, + "grad_norm": 1.8102706670761108, + "learning_rate": 5e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7195121049880981, + "num_tokens": 469193979.0, + "step": 18134 + }, + { + "epoch": 1.991544036898748, + "grad_norm": 1.9732106924057007, + "learning_rate": 5e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7450928688049316, + "num_tokens": 469214168.0, + "step": 18135 + }, + { + "epoch": 1.9916538546013618, + "grad_norm": 1.5401833057403564, + "learning_rate": 5e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7430599331855774, + "num_tokens": 469247410.0, + "step": 18136 + }, + { + "epoch": 1.9917636723039753, + "grad_norm": 1.7780576944351196, + "learning_rate": 5e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7127394676208496, + "num_tokens": 469275550.0, + "step": 18137 + }, + { + "epoch": 1.991873490006589, + "grad_norm": 1.9547154903411865, + "learning_rate": 5e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7506678104400635, + "num_tokens": 469297144.0, + "step": 18138 + }, + { + "epoch": 1.9919833077092028, + "grad_norm": 1.7569462060928345, + "learning_rate": 5e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7412762641906738, + "num_tokens": 469323597.0, + "step": 18139 + }, + { + "epoch": 1.9920931254118164, + "grad_norm": 1.657960057258606, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7408251166343689, + "num_tokens": 469352411.0, + "step": 18140 + }, + { + "epoch": 1.99220294311443, + "grad_norm": 1.7908222675323486, + "learning_rate": 5e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7133437395095825, + "num_tokens": 469378656.0, + "step": 18141 + }, + { + "epoch": 1.9923127608170437, + "grad_norm": 1.9102822542190552, + "learning_rate": 5e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7336528897285461, + "num_tokens": 469400813.0, + "step": 18142 + }, + { + "epoch": 1.9924225785196574, + "grad_norm": 1.5939459800720215, + "learning_rate": 5e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7559287548065186, + "num_tokens": 469430623.0, + "step": 18143 + }, + { + "epoch": 1.9925323962222712, + "grad_norm": 1.558977723121643, + "learning_rate": 5e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7330769300460815, + "num_tokens": 469464387.0, + "step": 18144 + }, + { + "epoch": 1.9926422139248847, + "grad_norm": 1.7647120952606201, + "learning_rate": 5e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7365790605545044, + "num_tokens": 469490586.0, + "step": 18145 + }, + { + "epoch": 1.9927520316274983, + "grad_norm": 1.790560007095337, + "learning_rate": 5e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7178804874420166, + "num_tokens": 469516824.0, + "step": 18146 + }, + { + "epoch": 1.992861849330112, + "grad_norm": 1.7799538373947144, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7544360160827637, + "num_tokens": 469541614.0, + "step": 18147 + }, + { + "epoch": 1.9929716670327258, + "grad_norm": 1.904356837272644, + "learning_rate": 5e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.721237301826477, + "num_tokens": 469563863.0, + "step": 18148 + }, + { + "epoch": 1.9930814847353393, + "grad_norm": 2.185204267501831, + "learning_rate": 5e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7513439655303955, + "num_tokens": 469583257.0, + "step": 18149 + }, + { + "epoch": 1.9931913024379528, + "grad_norm": 1.8186862468719482, + "learning_rate": 5e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.726231038570404, + "num_tokens": 469607536.0, + "step": 18150 + }, + { + "epoch": 1.9933011201405666, + "grad_norm": 1.606428861618042, + "learning_rate": 5e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7249138355255127, + "num_tokens": 469637648.0, + "step": 18151 + }, + { + "epoch": 1.9934109378431804, + "grad_norm": 1.7433178424835205, + "learning_rate": 5e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7250651121139526, + "num_tokens": 469663992.0, + "step": 18152 + }, + { + "epoch": 1.993520755545794, + "grad_norm": 1.6360787153244019, + "learning_rate": 5e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7480961084365845, + "num_tokens": 469690838.0, + "step": 18153 + }, + { + "epoch": 1.9936305732484076, + "grad_norm": 1.6743144989013672, + "learning_rate": 5e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.737995982170105, + "num_tokens": 469720908.0, + "step": 18154 + }, + { + "epoch": 1.9937403909510212, + "grad_norm": 1.9907981157302856, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7376198172569275, + "num_tokens": 469743533.0, + "step": 18155 + }, + { + "epoch": 1.993850208653635, + "grad_norm": 1.7501839399337769, + "learning_rate": 5e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7333896160125732, + "num_tokens": 469768979.0, + "step": 18156 + }, + { + "epoch": 1.9939600263562487, + "grad_norm": 2.13859486579895, + "learning_rate": 5e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7718551754951477, + "num_tokens": 469787102.0, + "step": 18157 + }, + { + "epoch": 1.9940698440588625, + "grad_norm": 1.5998188257217407, + "learning_rate": 5e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7626683712005615, + "num_tokens": 469816565.0, + "step": 18158 + }, + { + "epoch": 1.994179661761476, + "grad_norm": 1.666940689086914, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7407647967338562, + "num_tokens": 469844856.0, + "step": 18159 + }, + { + "epoch": 1.9942894794640895, + "grad_norm": 1.777618169784546, + "learning_rate": 5e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7528975605964661, + "num_tokens": 469868522.0, + "step": 18160 + }, + { + "epoch": 1.9943992971667033, + "grad_norm": 1.72540283203125, + "learning_rate": 5e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7267171740531921, + "num_tokens": 469895774.0, + "step": 18161 + }, + { + "epoch": 1.994509114869317, + "grad_norm": 1.6448907852172852, + "learning_rate": 5e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7288014888763428, + "num_tokens": 469924670.0, + "step": 18162 + }, + { + "epoch": 1.9946189325719306, + "grad_norm": 1.9100383520126343, + "learning_rate": 5e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7471259236335754, + "num_tokens": 469946807.0, + "step": 18163 + }, + { + "epoch": 1.994728750274544, + "grad_norm": 1.8831489086151123, + "learning_rate": 5e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7187467217445374, + "num_tokens": 469971478.0, + "step": 18164 + }, + { + "epoch": 1.9948385679771579, + "grad_norm": 2.121941089630127, + "learning_rate": 5e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7317554950714111, + "num_tokens": 469994328.0, + "step": 18165 + }, + { + "epoch": 1.9949483856797716, + "grad_norm": 1.5659898519515991, + "learning_rate": 5e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7170038223266602, + "num_tokens": 470026601.0, + "step": 18166 + }, + { + "epoch": 1.9950582033823854, + "grad_norm": 1.9822853803634644, + "learning_rate": 5e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7471714615821838, + "num_tokens": 470046865.0, + "step": 18167 + }, + { + "epoch": 1.995168021084999, + "grad_norm": 1.7490547895431519, + "learning_rate": 5e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7307578325271606, + "num_tokens": 470074052.0, + "step": 18168 + }, + { + "epoch": 1.9952778387876124, + "grad_norm": 1.8420331478118896, + "learning_rate": 5e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.708082914352417, + "num_tokens": 470100799.0, + "step": 18169 + }, + { + "epoch": 1.9953876564902262, + "grad_norm": 1.6723121404647827, + "learning_rate": 5e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7385482788085938, + "num_tokens": 470128102.0, + "step": 18170 + }, + { + "epoch": 1.99549747419284, + "grad_norm": 1.803430199623108, + "learning_rate": 5e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7397482395172119, + "num_tokens": 470153810.0, + "step": 18171 + }, + { + "epoch": 1.9956072918954535, + "grad_norm": 1.819059133529663, + "learning_rate": 5e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7562844753265381, + "num_tokens": 470176836.0, + "step": 18172 + }, + { + "epoch": 1.9957171095980673, + "grad_norm": 1.8508517742156982, + "learning_rate": 5e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7444276809692383, + "num_tokens": 470201500.0, + "step": 18173 + }, + { + "epoch": 1.9958269273006808, + "grad_norm": 1.9086793661117554, + "learning_rate": 5e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7400603890419006, + "num_tokens": 470223853.0, + "step": 18174 + }, + { + "epoch": 1.9959367450032945, + "grad_norm": 1.7153620719909668, + "learning_rate": 5e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7422024607658386, + "num_tokens": 470247271.0, + "step": 18175 + }, + { + "epoch": 1.9960465627059083, + "grad_norm": 1.645135521888733, + "learning_rate": 5e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7373101711273193, + "num_tokens": 470275120.0, + "step": 18176 + }, + { + "epoch": 1.9961563804085218, + "grad_norm": 1.4532667398452759, + "learning_rate": 5e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7061030864715576, + "num_tokens": 470315623.0, + "step": 18177 + }, + { + "epoch": 1.9962661981111354, + "grad_norm": 1.7443718910217285, + "learning_rate": 5e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7242226600646973, + "num_tokens": 470340985.0, + "step": 18178 + }, + { + "epoch": 1.9963760158137491, + "grad_norm": 1.7319949865341187, + "learning_rate": 5e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7165822982788086, + "num_tokens": 470370490.0, + "step": 18179 + }, + { + "epoch": 1.9964858335163629, + "grad_norm": 1.8237810134887695, + "learning_rate": 5e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7183614373207092, + "num_tokens": 470395816.0, + "step": 18180 + }, + { + "epoch": 1.9965956512189766, + "grad_norm": 2.021106243133545, + "learning_rate": 5e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7311234474182129, + "num_tokens": 470417519.0, + "step": 18181 + }, + { + "epoch": 1.9967054689215902, + "grad_norm": 1.7909424304962158, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.741451621055603, + "num_tokens": 470444204.0, + "step": 18182 + }, + { + "epoch": 1.9968152866242037, + "grad_norm": 1.8331762552261353, + "learning_rate": 5e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7240005731582642, + "num_tokens": 470468193.0, + "step": 18183 + }, + { + "epoch": 1.9969251043268175, + "grad_norm": 1.7677998542785645, + "learning_rate": 5e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7561709880828857, + "num_tokens": 470493282.0, + "step": 18184 + }, + { + "epoch": 1.9970349220294312, + "grad_norm": 1.7545109987258911, + "learning_rate": 5e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7437297105789185, + "num_tokens": 470517842.0, + "step": 18185 + }, + { + "epoch": 1.9971447397320448, + "grad_norm": 1.8035271167755127, + "learning_rate": 5e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7317513227462769, + "num_tokens": 470544990.0, + "step": 18186 + }, + { + "epoch": 1.9972545574346585, + "grad_norm": 1.698325276374817, + "learning_rate": 5e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7075881361961365, + "num_tokens": 470579114.0, + "step": 18187 + }, + { + "epoch": 1.997364375137272, + "grad_norm": 1.6756998300552368, + "learning_rate": 5e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7390680313110352, + "num_tokens": 470607308.0, + "step": 18188 + }, + { + "epoch": 1.9974741928398858, + "grad_norm": 1.8578683137893677, + "learning_rate": 5e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7026708126068115, + "num_tokens": 470634581.0, + "step": 18189 + }, + { + "epoch": 1.9975840105424996, + "grad_norm": 2.0431933403015137, + "learning_rate": 5e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7403063774108887, + "num_tokens": 470654758.0, + "step": 18190 + }, + { + "epoch": 1.997693828245113, + "grad_norm": 1.6957565546035767, + "learning_rate": 5e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7245112657546997, + "num_tokens": 470682580.0, + "step": 18191 + }, + { + "epoch": 1.9978036459477266, + "grad_norm": 1.5393671989440918, + "learning_rate": 5e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.719671368598938, + "num_tokens": 470716257.0, + "step": 18192 + }, + { + "epoch": 1.9979134636503404, + "grad_norm": 1.8429917097091675, + "learning_rate": 5e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7154654264450073, + "num_tokens": 470740803.0, + "step": 18193 + }, + { + "epoch": 1.9980232813529542, + "grad_norm": 1.5432679653167725, + "learning_rate": 5e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7130914926528931, + "num_tokens": 470772907.0, + "step": 18194 + }, + { + "epoch": 1.998133099055568, + "grad_norm": 1.5377877950668335, + "learning_rate": 5e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7324487566947937, + "num_tokens": 470802295.0, + "step": 18195 + }, + { + "epoch": 1.9982429167581814, + "grad_norm": 1.6053881645202637, + "learning_rate": 5e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7002593278884888, + "num_tokens": 470835337.0, + "step": 18196 + }, + { + "epoch": 1.998352734460795, + "grad_norm": 1.7732747793197632, + "learning_rate": 5e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7310089468955994, + "num_tokens": 470860975.0, + "step": 18197 + }, + { + "epoch": 1.9984625521634087, + "grad_norm": 1.7673777341842651, + "learning_rate": 5e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7522514462471008, + "num_tokens": 470884948.0, + "step": 18198 + }, + { + "epoch": 1.9985723698660225, + "grad_norm": 1.7665283679962158, + "learning_rate": 5e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7371090054512024, + "num_tokens": 470911291.0, + "step": 18199 + }, + { + "epoch": 1.998682187568636, + "grad_norm": 1.9002773761749268, + "learning_rate": 5e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7451429963111877, + "num_tokens": 470932574.0, + "step": 18200 + }, + { + "epoch": 1.9987920052712496, + "grad_norm": 1.6335846185684204, + "learning_rate": 5e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7022932171821594, + "num_tokens": 470962928.0, + "step": 18201 + }, + { + "epoch": 1.9989018229738633, + "grad_norm": 2.0614752769470215, + "learning_rate": 5e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7185927629470825, + "num_tokens": 470984984.0, + "step": 18202 + }, + { + "epoch": 1.999011640676477, + "grad_norm": 1.897869348526001, + "learning_rate": 5e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7417432069778442, + "num_tokens": 471007432.0, + "step": 18203 + }, + { + "epoch": 1.9991214583790908, + "grad_norm": 1.8029109239578247, + "learning_rate": 5e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7105288505554199, + "num_tokens": 471036431.0, + "step": 18204 + }, + { + "epoch": 1.9992312760817044, + "grad_norm": 1.9772628545761108, + "learning_rate": 5e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7343131899833679, + "num_tokens": 471057948.0, + "step": 18205 + }, + { + "epoch": 1.999341093784318, + "grad_norm": 1.9181362390518188, + "learning_rate": 5e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7456376552581787, + "num_tokens": 471078858.0, + "step": 18206 + }, + { + "epoch": 1.9994509114869317, + "grad_norm": 1.80863618850708, + "learning_rate": 5e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7177830934524536, + "num_tokens": 471105951.0, + "step": 18207 + }, + { + "epoch": 1.9995607291895454, + "grad_norm": 1.6638083457946777, + "learning_rate": 5e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7342947721481323, + "num_tokens": 471135188.0, + "step": 18208 + }, + { + "epoch": 1.9996705468921592, + "grad_norm": 1.9777107238769531, + "learning_rate": 5e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7273685932159424, + "num_tokens": 471156600.0, + "step": 18209 + }, + { + "epoch": 1.9997803645947727, + "grad_norm": 1.7309170961380005, + "learning_rate": 5e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7243189215660095, + "num_tokens": 471183625.0, + "step": 18210 + }, + { + "epoch": 1.9998901822973862, + "grad_norm": 1.5360932350158691, + "learning_rate": 5e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7093561887741089, + "num_tokens": 471216461.0, + "step": 18211 + }, + { + "epoch": 2.0, + "grad_norm": 1.969585657119751, + "learning_rate": 5e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7316766977310181, + "num_tokens": 471237531.0, + "step": 18212 + }, + { + "epoch": 2.0001098177026138, + "grad_norm": 1.9024758338928223, + "learning_rate": 5e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7727057337760925, + "num_tokens": 471264215.0, + "step": 18213 + }, + { + "epoch": 2.0002196354052275, + "grad_norm": 1.855823040008545, + "learning_rate": 5e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7588529586791992, + "num_tokens": 471291127.0, + "step": 18214 + }, + { + "epoch": 2.000329453107841, + "grad_norm": 1.9649457931518555, + "learning_rate": 5e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7608415484428406, + "num_tokens": 471315281.0, + "step": 18215 + }, + { + "epoch": 2.0004392708104546, + "grad_norm": 1.8857167959213257, + "learning_rate": 5e-06, + "loss": 0.713, + "mean_token_accuracy": 0.774116039276123, + "num_tokens": 471336925.0, + "step": 18216 + }, + { + "epoch": 2.0005490885130683, + "grad_norm": 1.5421725511550903, + "learning_rate": 5e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7369229197502136, + "num_tokens": 471369182.0, + "step": 18217 + }, + { + "epoch": 2.000658906215682, + "grad_norm": 1.779233455657959, + "learning_rate": 5e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.727141797542572, + "num_tokens": 471402714.0, + "step": 18218 + }, + { + "epoch": 2.0007687239182954, + "grad_norm": 1.8315701484680176, + "learning_rate": 5e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.740087628364563, + "num_tokens": 471434970.0, + "step": 18219 + }, + { + "epoch": 2.000878541620909, + "grad_norm": 1.7040178775787354, + "learning_rate": 5e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7411379218101501, + "num_tokens": 471470035.0, + "step": 18220 + }, + { + "epoch": 2.000988359323523, + "grad_norm": 1.8269805908203125, + "learning_rate": 5e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.7459012269973755, + "num_tokens": 471499601.0, + "step": 18221 + }, + { + "epoch": 2.0010981770261367, + "grad_norm": 2.232793092727661, + "learning_rate": 5e-06, + "loss": 0.6183, + "mean_token_accuracy": 0.7895429134368896, + "num_tokens": 471519260.0, + "step": 18222 + }, + { + "epoch": 2.0012079947287504, + "grad_norm": 2.17402720451355, + "learning_rate": 5e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7696323394775391, + "num_tokens": 471542784.0, + "step": 18223 + }, + { + "epoch": 2.0013178124313638, + "grad_norm": 2.1933815479278564, + "learning_rate": 5e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7592859268188477, + "num_tokens": 471564428.0, + "step": 18224 + }, + { + "epoch": 2.0014276301339775, + "grad_norm": 1.9668205976486206, + "learning_rate": 5e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7531129121780396, + "num_tokens": 471594282.0, + "step": 18225 + }, + { + "epoch": 2.0015374478365913, + "grad_norm": 1.7848986387252808, + "learning_rate": 5e-06, + "loss": 0.7304, + "mean_token_accuracy": 0.7580012083053589, + "num_tokens": 471623822.0, + "step": 18226 + }, + { + "epoch": 2.001647265539205, + "grad_norm": 2.097428560256958, + "learning_rate": 5e-06, + "loss": 0.6705, + "mean_token_accuracy": 0.7749452590942383, + "num_tokens": 471646805.0, + "step": 18227 + }, + { + "epoch": 2.001757083241819, + "grad_norm": 1.747382402420044, + "learning_rate": 5e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7510757446289062, + "num_tokens": 471683529.0, + "step": 18228 + }, + { + "epoch": 2.001866900944432, + "grad_norm": 2.235290050506592, + "learning_rate": 5e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7524241209030151, + "num_tokens": 471708146.0, + "step": 18229 + }, + { + "epoch": 2.001976718647046, + "grad_norm": 2.0313265323638916, + "learning_rate": 5e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.7712565064430237, + "num_tokens": 471733161.0, + "step": 18230 + }, + { + "epoch": 2.0020865363496596, + "grad_norm": 1.9923452138900757, + "learning_rate": 5e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7622736692428589, + "num_tokens": 471762454.0, + "step": 18231 + }, + { + "epoch": 2.0021963540522734, + "grad_norm": 2.221423625946045, + "learning_rate": 5e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7567063570022583, + "num_tokens": 471784738.0, + "step": 18232 + }, + { + "epoch": 2.0023061717548867, + "grad_norm": 2.0299594402313232, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7428044080734253, + "num_tokens": 471815591.0, + "step": 18233 + }, + { + "epoch": 2.0024159894575004, + "grad_norm": 2.0040605068206787, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7668116092681885, + "num_tokens": 471844095.0, + "step": 18234 + }, + { + "epoch": 2.002525807160114, + "grad_norm": 1.87989342212677, + "learning_rate": 5e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7671718001365662, + "num_tokens": 471874815.0, + "step": 18235 + }, + { + "epoch": 2.002635624862728, + "grad_norm": 2.025271415710449, + "learning_rate": 5e-06, + "loss": 0.6131, + "mean_token_accuracy": 0.7958793044090271, + "num_tokens": 471899594.0, + "step": 18236 + }, + { + "epoch": 2.0027454425653417, + "grad_norm": 1.9666972160339355, + "learning_rate": 5e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7605390548706055, + "num_tokens": 471929867.0, + "step": 18237 + }, + { + "epoch": 2.002855260267955, + "grad_norm": 2.1274352073669434, + "learning_rate": 5e-06, + "loss": 0.6711, + "mean_token_accuracy": 0.7786387205123901, + "num_tokens": 471952910.0, + "step": 18238 + }, + { + "epoch": 2.0029650779705688, + "grad_norm": 1.9990164041519165, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7641022205352783, + "num_tokens": 471978303.0, + "step": 18239 + }, + { + "epoch": 2.0030748956731825, + "grad_norm": 1.9047796726226807, + "learning_rate": 5e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8057955503463745, + "num_tokens": 472004110.0, + "step": 18240 + }, + { + "epoch": 2.0031847133757963, + "grad_norm": 2.0332915782928467, + "learning_rate": 5e-06, + "loss": 0.6939, + "mean_token_accuracy": 0.7655367255210876, + "num_tokens": 472028380.0, + "step": 18241 + }, + { + "epoch": 2.00329453107841, + "grad_norm": 1.8610929250717163, + "learning_rate": 5e-06, + "loss": 0.6517, + "mean_token_accuracy": 0.7820259928703308, + "num_tokens": 472056641.0, + "step": 18242 + }, + { + "epoch": 2.0034043487810234, + "grad_norm": 2.145721912384033, + "learning_rate": 5e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7533607482910156, + "num_tokens": 472080795.0, + "step": 18243 + }, + { + "epoch": 2.003514166483637, + "grad_norm": 1.883077621459961, + "learning_rate": 5e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7719200849533081, + "num_tokens": 472109282.0, + "step": 18244 + }, + { + "epoch": 2.003623984186251, + "grad_norm": 1.8792181015014648, + "learning_rate": 5e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.7968137264251709, + "num_tokens": 472138308.0, + "step": 18245 + }, + { + "epoch": 2.0037338018888646, + "grad_norm": 2.2132039070129395, + "learning_rate": 5e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.761241614818573, + "num_tokens": 472162293.0, + "step": 18246 + }, + { + "epoch": 2.003843619591478, + "grad_norm": 1.9658511877059937, + "learning_rate": 5e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7635197043418884, + "num_tokens": 472190533.0, + "step": 18247 + }, + { + "epoch": 2.0039534372940917, + "grad_norm": 2.2641618251800537, + "learning_rate": 5e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.7746299505233765, + "num_tokens": 472210385.0, + "step": 18248 + }, + { + "epoch": 2.0040632549967055, + "grad_norm": 2.113804578781128, + "learning_rate": 5e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.7848185300827026, + "num_tokens": 472234772.0, + "step": 18249 + }, + { + "epoch": 2.004173072699319, + "grad_norm": 1.9951896667480469, + "learning_rate": 5e-06, + "loss": 0.6102, + "mean_token_accuracy": 0.797817587852478, + "num_tokens": 472259469.0, + "step": 18250 + }, + { + "epoch": 2.004282890401933, + "grad_norm": 1.7719519138336182, + "learning_rate": 5e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7771467566490173, + "num_tokens": 472289934.0, + "step": 18251 + }, + { + "epoch": 2.0043927081045463, + "grad_norm": 2.0809850692749023, + "learning_rate": 5e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7587293386459351, + "num_tokens": 472318148.0, + "step": 18252 + }, + { + "epoch": 2.00450252580716, + "grad_norm": 2.055816888809204, + "learning_rate": 5e-06, + "loss": 0.6319, + "mean_token_accuracy": 0.8008073568344116, + "num_tokens": 472341572.0, + "step": 18253 + }, + { + "epoch": 2.004612343509774, + "grad_norm": 2.1584274768829346, + "learning_rate": 5e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7680751085281372, + "num_tokens": 472364750.0, + "step": 18254 + }, + { + "epoch": 2.0047221612123876, + "grad_norm": 1.9507085084915161, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7704305648803711, + "num_tokens": 472389674.0, + "step": 18255 + }, + { + "epoch": 2.0048319789150013, + "grad_norm": 1.7274547815322876, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7704905867576599, + "num_tokens": 472423314.0, + "step": 18256 + }, + { + "epoch": 2.0049417966176146, + "grad_norm": 2.096318244934082, + "learning_rate": 5e-06, + "loss": 0.6736, + "mean_token_accuracy": 0.7861048579216003, + "num_tokens": 472446733.0, + "step": 18257 + }, + { + "epoch": 2.0050516143202284, + "grad_norm": 1.844119906425476, + "learning_rate": 5e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7681841850280762, + "num_tokens": 472475845.0, + "step": 18258 + }, + { + "epoch": 2.005161432022842, + "grad_norm": 1.7926372289657593, + "learning_rate": 5e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7476340532302856, + "num_tokens": 472508703.0, + "step": 18259 + }, + { + "epoch": 2.005271249725456, + "grad_norm": 1.9450652599334717, + "learning_rate": 5e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.7876379489898682, + "num_tokens": 472534897.0, + "step": 18260 + }, + { + "epoch": 2.005381067428069, + "grad_norm": 2.2051284313201904, + "learning_rate": 5e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7669559717178345, + "num_tokens": 472557513.0, + "step": 18261 + }, + { + "epoch": 2.005490885130683, + "grad_norm": 1.9165048599243164, + "learning_rate": 5e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.7549020051956177, + "num_tokens": 472585871.0, + "step": 18262 + }, + { + "epoch": 2.0056007028332967, + "grad_norm": 2.020878314971924, + "learning_rate": 5e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7575106620788574, + "num_tokens": 472612851.0, + "step": 18263 + }, + { + "epoch": 2.0057105205359105, + "grad_norm": 2.084839344024658, + "learning_rate": 5e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7631852626800537, + "num_tokens": 472638483.0, + "step": 18264 + }, + { + "epoch": 2.0058203382385242, + "grad_norm": 1.7576369047164917, + "learning_rate": 5e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7555508017539978, + "num_tokens": 472670836.0, + "step": 18265 + }, + { + "epoch": 2.0059301559411375, + "grad_norm": 2.2591888904571533, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7733322978019714, + "num_tokens": 472691591.0, + "step": 18266 + }, + { + "epoch": 2.0060399736437513, + "grad_norm": 1.985119104385376, + "learning_rate": 5e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7655813694000244, + "num_tokens": 472715540.0, + "step": 18267 + }, + { + "epoch": 2.006149791346365, + "grad_norm": 1.769020915031433, + "learning_rate": 5e-06, + "loss": 0.6327, + "mean_token_accuracy": 0.7908033728599548, + "num_tokens": 472745404.0, + "step": 18268 + }, + { + "epoch": 2.006259609048979, + "grad_norm": 1.8381730318069458, + "learning_rate": 5e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7357113361358643, + "num_tokens": 472777352.0, + "step": 18269 + }, + { + "epoch": 2.0063694267515926, + "grad_norm": 2.3012826442718506, + "learning_rate": 5e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7757100462913513, + "num_tokens": 472799181.0, + "step": 18270 + }, + { + "epoch": 2.006479244454206, + "grad_norm": 2.180351972579956, + "learning_rate": 5e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7582365274429321, + "num_tokens": 472821909.0, + "step": 18271 + }, + { + "epoch": 2.0065890621568196, + "grad_norm": 2.195312023162842, + "learning_rate": 5e-06, + "loss": 0.6426, + "mean_token_accuracy": 0.7836495041847229, + "num_tokens": 472843842.0, + "step": 18272 + }, + { + "epoch": 2.0066988798594334, + "grad_norm": 1.8407455682754517, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.741398274898529, + "num_tokens": 472876085.0, + "step": 18273 + }, + { + "epoch": 2.006808697562047, + "grad_norm": 2.120724678039551, + "learning_rate": 5e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7719283103942871, + "num_tokens": 472899872.0, + "step": 18274 + }, + { + "epoch": 2.0069185152646605, + "grad_norm": 1.7817622423171997, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7619824409484863, + "num_tokens": 472932943.0, + "step": 18275 + }, + { + "epoch": 2.0070283329672742, + "grad_norm": 1.85893714427948, + "learning_rate": 5e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7554227113723755, + "num_tokens": 472963637.0, + "step": 18276 + }, + { + "epoch": 2.007138150669888, + "grad_norm": 2.0879921913146973, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7694065570831299, + "num_tokens": 472987259.0, + "step": 18277 + }, + { + "epoch": 2.0072479683725017, + "grad_norm": 1.855623483657837, + "learning_rate": 5e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7808383703231812, + "num_tokens": 473015611.0, + "step": 18278 + }, + { + "epoch": 2.0073577860751155, + "grad_norm": 2.098145008087158, + "learning_rate": 5e-06, + "loss": 0.6201, + "mean_token_accuracy": 0.7966840863227844, + "num_tokens": 473038304.0, + "step": 18279 + }, + { + "epoch": 2.007467603777729, + "grad_norm": 1.9753196239471436, + "learning_rate": 5e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.7730236053466797, + "num_tokens": 473068471.0, + "step": 18280 + }, + { + "epoch": 2.0075774214803426, + "grad_norm": 2.409273147583008, + "learning_rate": 5e-06, + "loss": 0.6289, + "mean_token_accuracy": 0.7878170013427734, + "num_tokens": 473089076.0, + "step": 18281 + }, + { + "epoch": 2.0076872391829563, + "grad_norm": 2.2857704162597656, + "learning_rate": 5e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7723773121833801, + "num_tokens": 473111570.0, + "step": 18282 + }, + { + "epoch": 2.00779705688557, + "grad_norm": 1.991001009941101, + "learning_rate": 5e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7580337524414062, + "num_tokens": 473139779.0, + "step": 18283 + }, + { + "epoch": 2.0079068745881834, + "grad_norm": 1.8063244819641113, + "learning_rate": 5e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.7446495294570923, + "num_tokens": 473173132.0, + "step": 18284 + }, + { + "epoch": 2.008016692290797, + "grad_norm": 1.9590730667114258, + "learning_rate": 5e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7734666466712952, + "num_tokens": 473200496.0, + "step": 18285 + }, + { + "epoch": 2.008126509993411, + "grad_norm": 1.8923768997192383, + "learning_rate": 5e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.7652034163475037, + "num_tokens": 473229615.0, + "step": 18286 + }, + { + "epoch": 2.0082363276960247, + "grad_norm": 2.0318591594696045, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7509450912475586, + "num_tokens": 473256739.0, + "step": 18287 + }, + { + "epoch": 2.0083461453986384, + "grad_norm": 2.2771689891815186, + "learning_rate": 5e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7661579847335815, + "num_tokens": 473280103.0, + "step": 18288 + }, + { + "epoch": 2.0084559631012517, + "grad_norm": 1.8123719692230225, + "learning_rate": 5e-06, + "loss": 0.6643, + "mean_token_accuracy": 0.784957766532898, + "num_tokens": 473309888.0, + "step": 18289 + }, + { + "epoch": 2.0085657808038655, + "grad_norm": 1.9173328876495361, + "learning_rate": 5e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7547280788421631, + "num_tokens": 473341820.0, + "step": 18290 + }, + { + "epoch": 2.0086755985064793, + "grad_norm": 1.9842950105667114, + "learning_rate": 5e-06, + "loss": 0.5618, + "mean_token_accuracy": 0.8091312646865845, + "num_tokens": 473366021.0, + "step": 18291 + }, + { + "epoch": 2.008785416209093, + "grad_norm": 2.0138378143310547, + "learning_rate": 5e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7738481163978577, + "num_tokens": 473390341.0, + "step": 18292 + }, + { + "epoch": 2.0088952339117068, + "grad_norm": 2.127185106277466, + "learning_rate": 5e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7699308395385742, + "num_tokens": 473415862.0, + "step": 18293 + }, + { + "epoch": 2.00900505161432, + "grad_norm": 1.9999308586120605, + "learning_rate": 5e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.7761422395706177, + "num_tokens": 473441207.0, + "step": 18294 + }, + { + "epoch": 2.009114869316934, + "grad_norm": 2.1455140113830566, + "learning_rate": 5e-06, + "loss": 0.6579, + "mean_token_accuracy": 0.7802584171295166, + "num_tokens": 473464547.0, + "step": 18295 + }, + { + "epoch": 2.0092246870195476, + "grad_norm": 2.0652689933776855, + "learning_rate": 5e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.7974188327789307, + "num_tokens": 473488684.0, + "step": 18296 + }, + { + "epoch": 2.0093345047221614, + "grad_norm": 2.1069343090057373, + "learning_rate": 5e-06, + "loss": 0.6617, + "mean_token_accuracy": 0.7865620851516724, + "num_tokens": 473514567.0, + "step": 18297 + }, + { + "epoch": 2.0094443224247747, + "grad_norm": 1.8353118896484375, + "learning_rate": 5e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7468993067741394, + "num_tokens": 473545236.0, + "step": 18298 + }, + { + "epoch": 2.0095541401273884, + "grad_norm": 2.032127618789673, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7800287008285522, + "num_tokens": 473571627.0, + "step": 18299 + }, + { + "epoch": 2.009663957830002, + "grad_norm": 1.9810490608215332, + "learning_rate": 5e-06, + "loss": 0.6726, + "mean_token_accuracy": 0.7772543430328369, + "num_tokens": 473598227.0, + "step": 18300 + }, + { + "epoch": 2.009773775532616, + "grad_norm": 2.179910659790039, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7633370757102966, + "num_tokens": 473621187.0, + "step": 18301 + }, + { + "epoch": 2.0098835932352297, + "grad_norm": 2.005854606628418, + "learning_rate": 5e-06, + "loss": 0.7304, + "mean_token_accuracy": 0.7549275159835815, + "num_tokens": 473648891.0, + "step": 18302 + }, + { + "epoch": 2.009993410937843, + "grad_norm": 2.014923572540283, + "learning_rate": 5e-06, + "loss": 0.6826, + "mean_token_accuracy": 0.7729940414428711, + "num_tokens": 473675592.0, + "step": 18303 + }, + { + "epoch": 2.0101032286404568, + "grad_norm": 1.9751174449920654, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7515090703964233, + "num_tokens": 473703492.0, + "step": 18304 + }, + { + "epoch": 2.0102130463430705, + "grad_norm": 2.1074979305267334, + "learning_rate": 5e-06, + "loss": 0.579, + "mean_token_accuracy": 0.8009154796600342, + "num_tokens": 473725697.0, + "step": 18305 + }, + { + "epoch": 2.0103228640456843, + "grad_norm": 2.1591060161590576, + "learning_rate": 5e-06, + "loss": 0.6103, + "mean_token_accuracy": 0.791267991065979, + "num_tokens": 473747940.0, + "step": 18306 + }, + { + "epoch": 2.010432681748298, + "grad_norm": 2.0029144287109375, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7527674436569214, + "num_tokens": 473777100.0, + "step": 18307 + }, + { + "epoch": 2.0105424994509113, + "grad_norm": 2.0960206985473633, + "learning_rate": 5e-06, + "loss": 0.6343, + "mean_token_accuracy": 0.8002898693084717, + "num_tokens": 473800833.0, + "step": 18308 + }, + { + "epoch": 2.010652317153525, + "grad_norm": 2.086219549179077, + "learning_rate": 5e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7744843363761902, + "num_tokens": 473825485.0, + "step": 18309 + }, + { + "epoch": 2.010762134856139, + "grad_norm": 2.077131748199463, + "learning_rate": 5e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7633830308914185, + "num_tokens": 473851920.0, + "step": 18310 + }, + { + "epoch": 2.0108719525587526, + "grad_norm": 2.141049861907959, + "learning_rate": 5e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.7729650735855103, + "num_tokens": 473876050.0, + "step": 18311 + }, + { + "epoch": 2.010981770261366, + "grad_norm": 2.0364577770233154, + "learning_rate": 5e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7758990526199341, + "num_tokens": 473901516.0, + "step": 18312 + }, + { + "epoch": 2.0110915879639797, + "grad_norm": 1.9611797332763672, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7645730972290039, + "num_tokens": 473930568.0, + "step": 18313 + }, + { + "epoch": 2.0112014056665934, + "grad_norm": 1.9952342510223389, + "learning_rate": 5e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7578070163726807, + "num_tokens": 473956343.0, + "step": 18314 + }, + { + "epoch": 2.011311223369207, + "grad_norm": 2.0737099647521973, + "learning_rate": 5e-06, + "loss": 0.612, + "mean_token_accuracy": 0.7973545789718628, + "num_tokens": 473979744.0, + "step": 18315 + }, + { + "epoch": 2.011421041071821, + "grad_norm": 1.7156307697296143, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7788791656494141, + "num_tokens": 474016206.0, + "step": 18316 + }, + { + "epoch": 2.0115308587744343, + "grad_norm": 1.9064468145370483, + "learning_rate": 5e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7632380127906799, + "num_tokens": 474043962.0, + "step": 18317 + }, + { + "epoch": 2.011640676477048, + "grad_norm": 1.755626916885376, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7606212496757507, + "num_tokens": 474075271.0, + "step": 18318 + }, + { + "epoch": 2.011750494179662, + "grad_norm": 2.1981754302978516, + "learning_rate": 5e-06, + "loss": 0.6435, + "mean_token_accuracy": 0.7843017578125, + "num_tokens": 474095408.0, + "step": 18319 + }, + { + "epoch": 2.0118603118822755, + "grad_norm": 2.0153634548187256, + "learning_rate": 5e-06, + "loss": 0.6131, + "mean_token_accuracy": 0.7927526235580444, + "num_tokens": 474119463.0, + "step": 18320 + }, + { + "epoch": 2.0119701295848893, + "grad_norm": 2.0665318965911865, + "learning_rate": 5e-06, + "loss": 0.6369, + "mean_token_accuracy": 0.7962876558303833, + "num_tokens": 474140265.0, + "step": 18321 + }, + { + "epoch": 2.0120799472875026, + "grad_norm": 2.097660779953003, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7685948014259338, + "num_tokens": 474163093.0, + "step": 18322 + }, + { + "epoch": 2.0121897649901164, + "grad_norm": 1.9043534994125366, + "learning_rate": 5e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7621688842773438, + "num_tokens": 474192477.0, + "step": 18323 + }, + { + "epoch": 2.01229958269273, + "grad_norm": 2.122178077697754, + "learning_rate": 5e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7715010643005371, + "num_tokens": 474216555.0, + "step": 18324 + }, + { + "epoch": 2.012409400395344, + "grad_norm": 2.2388875484466553, + "learning_rate": 5e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.7629587650299072, + "num_tokens": 474240620.0, + "step": 18325 + }, + { + "epoch": 2.012519218097957, + "grad_norm": 2.5180230140686035, + "learning_rate": 5e-06, + "loss": 0.6423, + "mean_token_accuracy": 0.7924710512161255, + "num_tokens": 474258406.0, + "step": 18326 + }, + { + "epoch": 2.012629035800571, + "grad_norm": 2.0435805320739746, + "learning_rate": 5e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.7634032964706421, + "num_tokens": 474284812.0, + "step": 18327 + }, + { + "epoch": 2.0127388535031847, + "grad_norm": 2.2304751873016357, + "learning_rate": 5e-06, + "loss": 0.6721, + "mean_token_accuracy": 0.7879979610443115, + "num_tokens": 474310910.0, + "step": 18328 + }, + { + "epoch": 2.0128486712057985, + "grad_norm": 1.8980969190597534, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7450838685035706, + "num_tokens": 474343538.0, + "step": 18329 + }, + { + "epoch": 2.0129584889084122, + "grad_norm": 2.043264627456665, + "learning_rate": 5e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7726422548294067, + "num_tokens": 474366604.0, + "step": 18330 + }, + { + "epoch": 2.0130683066110255, + "grad_norm": 1.961724042892456, + "learning_rate": 5e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.7761211395263672, + "num_tokens": 474394799.0, + "step": 18331 + }, + { + "epoch": 2.0131781243136393, + "grad_norm": 1.8532860279083252, + "learning_rate": 5e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7621501684188843, + "num_tokens": 474424478.0, + "step": 18332 + }, + { + "epoch": 2.013287942016253, + "grad_norm": 1.8495986461639404, + "learning_rate": 5e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7691593170166016, + "num_tokens": 474452581.0, + "step": 18333 + }, + { + "epoch": 2.013397759718867, + "grad_norm": 2.084423303604126, + "learning_rate": 5e-06, + "loss": 0.6241, + "mean_token_accuracy": 0.7906590700149536, + "num_tokens": 474475395.0, + "step": 18334 + }, + { + "epoch": 2.01350757742148, + "grad_norm": 2.0932445526123047, + "learning_rate": 5e-06, + "loss": 0.6396, + "mean_token_accuracy": 0.788459300994873, + "num_tokens": 474497622.0, + "step": 18335 + }, + { + "epoch": 2.013617395124094, + "grad_norm": 2.0988082885742188, + "learning_rate": 5e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7633815407752991, + "num_tokens": 474528127.0, + "step": 18336 + }, + { + "epoch": 2.0137272128267076, + "grad_norm": 2.1131043434143066, + "learning_rate": 5e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.7767664790153503, + "num_tokens": 474551136.0, + "step": 18337 + }, + { + "epoch": 2.0138370305293214, + "grad_norm": 2.47550630569458, + "learning_rate": 5e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.7883516550064087, + "num_tokens": 474570851.0, + "step": 18338 + }, + { + "epoch": 2.013946848231935, + "grad_norm": 2.124997854232788, + "learning_rate": 5e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7585673332214355, + "num_tokens": 474597889.0, + "step": 18339 + }, + { + "epoch": 2.0140566659345485, + "grad_norm": 1.8926382064819336, + "learning_rate": 5e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7720659971237183, + "num_tokens": 474629729.0, + "step": 18340 + }, + { + "epoch": 2.014166483637162, + "grad_norm": 2.251002311706543, + "learning_rate": 5e-06, + "loss": 0.6304, + "mean_token_accuracy": 0.7878912687301636, + "num_tokens": 474653600.0, + "step": 18341 + }, + { + "epoch": 2.014276301339776, + "grad_norm": 2.2995800971984863, + "learning_rate": 5e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.776947021484375, + "num_tokens": 474678090.0, + "step": 18342 + }, + { + "epoch": 2.0143861190423897, + "grad_norm": 2.1325442790985107, + "learning_rate": 5e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.7732481956481934, + "num_tokens": 474700167.0, + "step": 18343 + }, + { + "epoch": 2.0144959367450035, + "grad_norm": 2.0083272457122803, + "learning_rate": 5e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7588998079299927, + "num_tokens": 474726593.0, + "step": 18344 + }, + { + "epoch": 2.014605754447617, + "grad_norm": 2.270455837249756, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.763352632522583, + "num_tokens": 474748523.0, + "step": 18345 + }, + { + "epoch": 2.0147155721502306, + "grad_norm": 2.034045696258545, + "learning_rate": 5e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7280550003051758, + "num_tokens": 474780726.0, + "step": 18346 + }, + { + "epoch": 2.0148253898528443, + "grad_norm": 2.0674314498901367, + "learning_rate": 5e-06, + "loss": 0.6241, + "mean_token_accuracy": 0.7911413908004761, + "num_tokens": 474804976.0, + "step": 18347 + }, + { + "epoch": 2.014935207555458, + "grad_norm": 1.9516013860702515, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.757745623588562, + "num_tokens": 474834325.0, + "step": 18348 + }, + { + "epoch": 2.0150450252580714, + "grad_norm": 2.1455554962158203, + "learning_rate": 5e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7721805572509766, + "num_tokens": 474857377.0, + "step": 18349 + }, + { + "epoch": 2.015154842960685, + "grad_norm": 2.01360821723938, + "learning_rate": 5e-06, + "loss": 0.6542, + "mean_token_accuracy": 0.7930656671524048, + "num_tokens": 474883037.0, + "step": 18350 + }, + { + "epoch": 2.015264660663299, + "grad_norm": 2.1772937774658203, + "learning_rate": 5e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7700273394584656, + "num_tokens": 474907731.0, + "step": 18351 + }, + { + "epoch": 2.0153744783659127, + "grad_norm": 2.52203631401062, + "learning_rate": 5e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.765512228012085, + "num_tokens": 474930079.0, + "step": 18352 + }, + { + "epoch": 2.0154842960685264, + "grad_norm": 2.294001579284668, + "learning_rate": 5e-06, + "loss": 0.6533, + "mean_token_accuracy": 0.7828980684280396, + "num_tokens": 474952111.0, + "step": 18353 + }, + { + "epoch": 2.0155941137711397, + "grad_norm": 2.157757520675659, + "learning_rate": 5e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.7750430703163147, + "num_tokens": 474976511.0, + "step": 18354 + }, + { + "epoch": 2.0157039314737535, + "grad_norm": 1.9914263486862183, + "learning_rate": 5e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7570908069610596, + "num_tokens": 475004657.0, + "step": 18355 + }, + { + "epoch": 2.0158137491763672, + "grad_norm": 1.931439757347107, + "learning_rate": 5e-06, + "loss": 0.5808, + "mean_token_accuracy": 0.8036776781082153, + "num_tokens": 475029911.0, + "step": 18356 + }, + { + "epoch": 2.015923566878981, + "grad_norm": 1.8051462173461914, + "learning_rate": 5e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7694696187973022, + "num_tokens": 475059681.0, + "step": 18357 + }, + { + "epoch": 2.0160333845815948, + "grad_norm": 2.1745986938476562, + "learning_rate": 5e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7748805284500122, + "num_tokens": 475084701.0, + "step": 18358 + }, + { + "epoch": 2.016143202284208, + "grad_norm": 2.183664321899414, + "learning_rate": 5e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7591241598129272, + "num_tokens": 475110457.0, + "step": 18359 + }, + { + "epoch": 2.016253019986822, + "grad_norm": 2.0708911418914795, + "learning_rate": 5e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.7843241095542908, + "num_tokens": 475135323.0, + "step": 18360 + }, + { + "epoch": 2.0163628376894356, + "grad_norm": 2.1475625038146973, + "learning_rate": 5e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7667521238327026, + "num_tokens": 475160915.0, + "step": 18361 + }, + { + "epoch": 2.0164726553920493, + "grad_norm": 2.0906081199645996, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7801469564437866, + "num_tokens": 475189037.0, + "step": 18362 + }, + { + "epoch": 2.0165824730946627, + "grad_norm": 2.1175343990325928, + "learning_rate": 5e-06, + "loss": 0.6575, + "mean_token_accuracy": 0.783484935760498, + "num_tokens": 475212128.0, + "step": 18363 + }, + { + "epoch": 2.0166922907972764, + "grad_norm": 2.207476854324341, + "learning_rate": 5e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7767319679260254, + "num_tokens": 475237399.0, + "step": 18364 + }, + { + "epoch": 2.01680210849989, + "grad_norm": 1.9887285232543945, + "learning_rate": 5e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7700375914573669, + "num_tokens": 475264648.0, + "step": 18365 + }, + { + "epoch": 2.016911926202504, + "grad_norm": 1.6964889764785767, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7670108079910278, + "num_tokens": 475298851.0, + "step": 18366 + }, + { + "epoch": 2.0170217439051177, + "grad_norm": 2.3126633167266846, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7671997547149658, + "num_tokens": 475320829.0, + "step": 18367 + }, + { + "epoch": 2.017131561607731, + "grad_norm": 2.4892282485961914, + "learning_rate": 5e-06, + "loss": 0.6085, + "mean_token_accuracy": 0.7969990372657776, + "num_tokens": 475340919.0, + "step": 18368 + }, + { + "epoch": 2.0172413793103448, + "grad_norm": 1.8843015432357788, + "learning_rate": 5e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7602332830429077, + "num_tokens": 475369214.0, + "step": 18369 + }, + { + "epoch": 2.0173511970129585, + "grad_norm": 2.0754547119140625, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7587912082672119, + "num_tokens": 475394886.0, + "step": 18370 + }, + { + "epoch": 2.0174610147155723, + "grad_norm": 1.9643093347549438, + "learning_rate": 5e-06, + "loss": 0.5945, + "mean_token_accuracy": 0.7999703884124756, + "num_tokens": 475420224.0, + "step": 18371 + }, + { + "epoch": 2.017570832418186, + "grad_norm": 2.080564260482788, + "learning_rate": 5e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7719230651855469, + "num_tokens": 475447781.0, + "step": 18372 + }, + { + "epoch": 2.0176806501207993, + "grad_norm": 2.1704494953155518, + "learning_rate": 5e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7639586925506592, + "num_tokens": 475473105.0, + "step": 18373 + }, + { + "epoch": 2.017790467823413, + "grad_norm": 2.3499107360839844, + "learning_rate": 5e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.780994176864624, + "num_tokens": 475494711.0, + "step": 18374 + }, + { + "epoch": 2.017900285526027, + "grad_norm": 2.0333447456359863, + "learning_rate": 5e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.7736138105392456, + "num_tokens": 475519565.0, + "step": 18375 + }, + { + "epoch": 2.0180101032286406, + "grad_norm": 2.265544891357422, + "learning_rate": 5e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.7933600544929504, + "num_tokens": 475542651.0, + "step": 18376 + }, + { + "epoch": 2.018119920931254, + "grad_norm": 2.2063887119293213, + "learning_rate": 5e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7629580497741699, + "num_tokens": 475570443.0, + "step": 18377 + }, + { + "epoch": 2.0182297386338677, + "grad_norm": 2.0570483207702637, + "learning_rate": 5e-06, + "loss": 0.6762, + "mean_token_accuracy": 0.7712393999099731, + "num_tokens": 475592862.0, + "step": 18378 + }, + { + "epoch": 2.0183395563364814, + "grad_norm": 1.838926076889038, + "learning_rate": 5e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7526661157608032, + "num_tokens": 475625367.0, + "step": 18379 + }, + { + "epoch": 2.018449374039095, + "grad_norm": 2.0955145359039307, + "learning_rate": 5e-06, + "loss": 0.6324, + "mean_token_accuracy": 0.7906852960586548, + "num_tokens": 475649963.0, + "step": 18380 + }, + { + "epoch": 2.018559191741709, + "grad_norm": 1.9633164405822754, + "learning_rate": 5e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7542640566825867, + "num_tokens": 475681646.0, + "step": 18381 + }, + { + "epoch": 2.0186690094443223, + "grad_norm": 1.7563070058822632, + "learning_rate": 5e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.768595278263092, + "num_tokens": 475715436.0, + "step": 18382 + }, + { + "epoch": 2.018778827146936, + "grad_norm": 1.8786187171936035, + "learning_rate": 5e-06, + "loss": 0.701, + "mean_token_accuracy": 0.7710605263710022, + "num_tokens": 475744934.0, + "step": 18383 + }, + { + "epoch": 2.0188886448495498, + "grad_norm": 2.1328623294830322, + "learning_rate": 5e-06, + "loss": 0.5908, + "mean_token_accuracy": 0.799144446849823, + "num_tokens": 475767588.0, + "step": 18384 + }, + { + "epoch": 2.0189984625521635, + "grad_norm": 1.9582575559616089, + "learning_rate": 5e-06, + "loss": 0.7304, + "mean_token_accuracy": 0.7633397579193115, + "num_tokens": 475797652.0, + "step": 18385 + }, + { + "epoch": 2.0191082802547773, + "grad_norm": 2.296603202819824, + "learning_rate": 5e-06, + "loss": 0.6787, + "mean_token_accuracy": 0.7748613953590393, + "num_tokens": 475821944.0, + "step": 18386 + }, + { + "epoch": 2.0192180979573906, + "grad_norm": 2.1120152473449707, + "learning_rate": 5e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7813112139701843, + "num_tokens": 475847389.0, + "step": 18387 + }, + { + "epoch": 2.0193279156600044, + "grad_norm": 1.927190899848938, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7738978862762451, + "num_tokens": 475873162.0, + "step": 18388 + }, + { + "epoch": 2.019437733362618, + "grad_norm": 2.2051234245300293, + "learning_rate": 5e-06, + "loss": 0.6981, + "mean_token_accuracy": 0.7779605388641357, + "num_tokens": 475896633.0, + "step": 18389 + }, + { + "epoch": 2.019547551065232, + "grad_norm": 2.05257511138916, + "learning_rate": 5e-06, + "loss": 0.639, + "mean_token_accuracy": 0.7900595664978027, + "num_tokens": 475921547.0, + "step": 18390 + }, + { + "epoch": 2.019657368767845, + "grad_norm": 1.86606764793396, + "learning_rate": 5e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7465313673019409, + "num_tokens": 475951508.0, + "step": 18391 + }, + { + "epoch": 2.019767186470459, + "grad_norm": 2.0565178394317627, + "learning_rate": 5e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.7821906208992004, + "num_tokens": 475976263.0, + "step": 18392 + }, + { + "epoch": 2.0198770041730727, + "grad_norm": 2.1112453937530518, + "learning_rate": 5e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.775615930557251, + "num_tokens": 476002328.0, + "step": 18393 + }, + { + "epoch": 2.0199868218756865, + "grad_norm": 2.0722649097442627, + "learning_rate": 5e-06, + "loss": 0.5455, + "mean_token_accuracy": 0.8164928555488586, + "num_tokens": 476026413.0, + "step": 18394 + }, + { + "epoch": 2.0200966395783, + "grad_norm": 2.1502699851989746, + "learning_rate": 5e-06, + "loss": 0.6649, + "mean_token_accuracy": 0.7777417898178101, + "num_tokens": 476050796.0, + "step": 18395 + }, + { + "epoch": 2.0202064572809135, + "grad_norm": 2.2658350467681885, + "learning_rate": 5e-06, + "loss": 0.6734, + "mean_token_accuracy": 0.775603175163269, + "num_tokens": 476075841.0, + "step": 18396 + }, + { + "epoch": 2.0203162749835273, + "grad_norm": 1.9763718843460083, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7717126607894897, + "num_tokens": 476102691.0, + "step": 18397 + }, + { + "epoch": 2.020426092686141, + "grad_norm": 2.209435224533081, + "learning_rate": 5e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7693018317222595, + "num_tokens": 476129898.0, + "step": 18398 + }, + { + "epoch": 2.020535910388755, + "grad_norm": 2.019242763519287, + "learning_rate": 5e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7552899718284607, + "num_tokens": 476157918.0, + "step": 18399 + }, + { + "epoch": 2.020645728091368, + "grad_norm": 1.9875259399414062, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7577530741691589, + "num_tokens": 476183854.0, + "step": 18400 + }, + { + "epoch": 2.020755545793982, + "grad_norm": 2.441312789916992, + "learning_rate": 5e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.7879853844642639, + "num_tokens": 476204934.0, + "step": 18401 + }, + { + "epoch": 2.0208653634965956, + "grad_norm": 1.9677393436431885, + "learning_rate": 5e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.775321900844574, + "num_tokens": 476231562.0, + "step": 18402 + }, + { + "epoch": 2.0209751811992094, + "grad_norm": 2.2414426803588867, + "learning_rate": 5e-06, + "loss": 0.6425, + "mean_token_accuracy": 0.781448483467102, + "num_tokens": 476253331.0, + "step": 18403 + }, + { + "epoch": 2.021084998901823, + "grad_norm": 1.901720643043518, + "learning_rate": 5e-06, + "loss": 0.6643, + "mean_token_accuracy": 0.7815520763397217, + "num_tokens": 476281091.0, + "step": 18404 + }, + { + "epoch": 2.0211948166044365, + "grad_norm": 2.1204657554626465, + "learning_rate": 5e-06, + "loss": 0.6335, + "mean_token_accuracy": 0.7871826887130737, + "num_tokens": 476304736.0, + "step": 18405 + }, + { + "epoch": 2.02130463430705, + "grad_norm": 2.107912540435791, + "learning_rate": 5e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7536947727203369, + "num_tokens": 476330299.0, + "step": 18406 + }, + { + "epoch": 2.021414452009664, + "grad_norm": 2.120563507080078, + "learning_rate": 5e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.7837511897087097, + "num_tokens": 476353406.0, + "step": 18407 + }, + { + "epoch": 2.0215242697122777, + "grad_norm": 2.0516574382781982, + "learning_rate": 5e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.7814203500747681, + "num_tokens": 476378049.0, + "step": 18408 + }, + { + "epoch": 2.0216340874148915, + "grad_norm": 1.9881256818771362, + "learning_rate": 5e-06, + "loss": 0.6824, + "mean_token_accuracy": 0.7747653126716614, + "num_tokens": 476403860.0, + "step": 18409 + }, + { + "epoch": 2.021743905117505, + "grad_norm": 2.1014564037323, + "learning_rate": 5e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7602037191390991, + "num_tokens": 476428522.0, + "step": 18410 + }, + { + "epoch": 2.0218537228201185, + "grad_norm": 2.066847324371338, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7519893646240234, + "num_tokens": 476453883.0, + "step": 18411 + }, + { + "epoch": 2.0219635405227323, + "grad_norm": 1.9666643142700195, + "learning_rate": 5e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7693973183631897, + "num_tokens": 476483041.0, + "step": 18412 + }, + { + "epoch": 2.022073358225346, + "grad_norm": 2.1839852333068848, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7693917751312256, + "num_tokens": 476507467.0, + "step": 18413 + }, + { + "epoch": 2.0221831759279594, + "grad_norm": 2.2318592071533203, + "learning_rate": 5e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.7835579514503479, + "num_tokens": 476529751.0, + "step": 18414 + }, + { + "epoch": 2.022292993630573, + "grad_norm": 1.9251272678375244, + "learning_rate": 5e-06, + "loss": 0.646, + "mean_token_accuracy": 0.7899131774902344, + "num_tokens": 476557224.0, + "step": 18415 + }, + { + "epoch": 2.022402811333187, + "grad_norm": 2.0439646244049072, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7729916572570801, + "num_tokens": 476583010.0, + "step": 18416 + }, + { + "epoch": 2.0225126290358006, + "grad_norm": 2.031708240509033, + "learning_rate": 5e-06, + "loss": 0.6428, + "mean_token_accuracy": 0.7849009037017822, + "num_tokens": 476607648.0, + "step": 18417 + }, + { + "epoch": 2.0226224467384144, + "grad_norm": 2.261483907699585, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7863930463790894, + "num_tokens": 476628811.0, + "step": 18418 + }, + { + "epoch": 2.0227322644410277, + "grad_norm": 2.18166184425354, + "learning_rate": 5e-06, + "loss": 0.659, + "mean_token_accuracy": 0.7793686985969543, + "num_tokens": 476651710.0, + "step": 18419 + }, + { + "epoch": 2.0228420821436415, + "grad_norm": 1.9369933605194092, + "learning_rate": 5e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7664750218391418, + "num_tokens": 476679296.0, + "step": 18420 + }, + { + "epoch": 2.0229518998462552, + "grad_norm": 1.9979090690612793, + "learning_rate": 5e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7760804891586304, + "num_tokens": 476703872.0, + "step": 18421 + }, + { + "epoch": 2.023061717548869, + "grad_norm": 2.112870931625366, + "learning_rate": 5e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7709126472473145, + "num_tokens": 476728059.0, + "step": 18422 + }, + { + "epoch": 2.0231715352514827, + "grad_norm": 1.8550478219985962, + "learning_rate": 5e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7660806179046631, + "num_tokens": 476759903.0, + "step": 18423 + }, + { + "epoch": 2.023281352954096, + "grad_norm": 2.000863790512085, + "learning_rate": 5e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.776563823223114, + "num_tokens": 476786799.0, + "step": 18424 + }, + { + "epoch": 2.02339117065671, + "grad_norm": 2.1589601039886475, + "learning_rate": 5e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.7742742300033569, + "num_tokens": 476811774.0, + "step": 18425 + }, + { + "epoch": 2.0235009883593236, + "grad_norm": 2.0638303756713867, + "learning_rate": 5e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7646402716636658, + "num_tokens": 476837825.0, + "step": 18426 + }, + { + "epoch": 2.0236108060619373, + "grad_norm": 2.0850517749786377, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7709104418754578, + "num_tokens": 476861609.0, + "step": 18427 + }, + { + "epoch": 2.0237206237645506, + "grad_norm": 2.03075909614563, + "learning_rate": 5e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7504692673683167, + "num_tokens": 476889983.0, + "step": 18428 + }, + { + "epoch": 2.0238304414671644, + "grad_norm": 2.1650021076202393, + "learning_rate": 5e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7764701247215271, + "num_tokens": 476915110.0, + "step": 18429 + }, + { + "epoch": 2.023940259169778, + "grad_norm": 1.8821607828140259, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7619876861572266, + "num_tokens": 476944747.0, + "step": 18430 + }, + { + "epoch": 2.024050076872392, + "grad_norm": 2.18868088722229, + "learning_rate": 5e-06, + "loss": 0.6366, + "mean_token_accuracy": 0.7889477014541626, + "num_tokens": 476966940.0, + "step": 18431 + }, + { + "epoch": 2.0241598945750057, + "grad_norm": 2.0147171020507812, + "learning_rate": 5e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.825560986995697, + "num_tokens": 476989169.0, + "step": 18432 + }, + { + "epoch": 2.024269712277619, + "grad_norm": 2.1634488105773926, + "learning_rate": 5e-06, + "loss": 0.6345, + "mean_token_accuracy": 0.7892919182777405, + "num_tokens": 477011258.0, + "step": 18433 + }, + { + "epoch": 2.0243795299802327, + "grad_norm": 2.0900354385375977, + "learning_rate": 5e-06, + "loss": 0.6688, + "mean_token_accuracy": 0.7842850089073181, + "num_tokens": 477034666.0, + "step": 18434 + }, + { + "epoch": 2.0244893476828465, + "grad_norm": 2.0793230533599854, + "learning_rate": 5e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7700971364974976, + "num_tokens": 477058528.0, + "step": 18435 + }, + { + "epoch": 2.0245991653854603, + "grad_norm": 1.896066665649414, + "learning_rate": 5e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7711864709854126, + "num_tokens": 477087826.0, + "step": 18436 + }, + { + "epoch": 2.024708983088074, + "grad_norm": 2.045466899871826, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7704862952232361, + "num_tokens": 477113927.0, + "step": 18437 + }, + { + "epoch": 2.0248188007906873, + "grad_norm": 2.268270492553711, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7659668922424316, + "num_tokens": 477135882.0, + "step": 18438 + }, + { + "epoch": 2.024928618493301, + "grad_norm": 2.172865390777588, + "learning_rate": 5e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7615147829055786, + "num_tokens": 477162024.0, + "step": 18439 + }, + { + "epoch": 2.025038436195915, + "grad_norm": 2.031118631362915, + "learning_rate": 5e-06, + "loss": 0.6378, + "mean_token_accuracy": 0.792995810508728, + "num_tokens": 477186406.0, + "step": 18440 + }, + { + "epoch": 2.0251482538985286, + "grad_norm": 2.0153329372406006, + "learning_rate": 5e-06, + "loss": 0.6466, + "mean_token_accuracy": 0.7828222513198853, + "num_tokens": 477210886.0, + "step": 18441 + }, + { + "epoch": 2.025258071601142, + "grad_norm": 2.505143880844116, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7797119617462158, + "num_tokens": 477229601.0, + "step": 18442 + }, + { + "epoch": 2.0253678893037557, + "grad_norm": 1.983865737915039, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7413681149482727, + "num_tokens": 477258172.0, + "step": 18443 + }, + { + "epoch": 2.0254777070063694, + "grad_norm": 1.967518925666809, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7640924453735352, + "num_tokens": 477286521.0, + "step": 18444 + }, + { + "epoch": 2.025587524708983, + "grad_norm": 1.896155834197998, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7729172110557556, + "num_tokens": 477318059.0, + "step": 18445 + }, + { + "epoch": 2.025697342411597, + "grad_norm": 1.8607975244522095, + "learning_rate": 5e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.759852409362793, + "num_tokens": 477346164.0, + "step": 18446 + }, + { + "epoch": 2.0258071601142102, + "grad_norm": 2.010094165802002, + "learning_rate": 5e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7526311874389648, + "num_tokens": 477372702.0, + "step": 18447 + }, + { + "epoch": 2.025916977816824, + "grad_norm": 2.2994184494018555, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.775424063205719, + "num_tokens": 477394806.0, + "step": 18448 + }, + { + "epoch": 2.0260267955194378, + "grad_norm": 2.0431790351867676, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7574688196182251, + "num_tokens": 477421101.0, + "step": 18449 + }, + { + "epoch": 2.0261366132220515, + "grad_norm": 2.02541184425354, + "learning_rate": 5e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7551950812339783, + "num_tokens": 477449843.0, + "step": 18450 + }, + { + "epoch": 2.0262464309246653, + "grad_norm": 1.8732775449752808, + "learning_rate": 5e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.767047643661499, + "num_tokens": 477479255.0, + "step": 18451 + }, + { + "epoch": 2.0263562486272786, + "grad_norm": 2.257230043411255, + "learning_rate": 5e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7741707563400269, + "num_tokens": 477501044.0, + "step": 18452 + }, + { + "epoch": 2.0264660663298923, + "grad_norm": 2.1402409076690674, + "learning_rate": 5e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7754640579223633, + "num_tokens": 477525459.0, + "step": 18453 + }, + { + "epoch": 2.026575884032506, + "grad_norm": 2.1203064918518066, + "learning_rate": 5e-06, + "loss": 0.6826, + "mean_token_accuracy": 0.78029465675354, + "num_tokens": 477548684.0, + "step": 18454 + }, + { + "epoch": 2.02668570173512, + "grad_norm": 1.748986005783081, + "learning_rate": 5e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7639631628990173, + "num_tokens": 477583388.0, + "step": 18455 + }, + { + "epoch": 2.026795519437733, + "grad_norm": 2.262716054916382, + "learning_rate": 5e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7844598293304443, + "num_tokens": 477605706.0, + "step": 18456 + }, + { + "epoch": 2.026905337140347, + "grad_norm": 2.1231682300567627, + "learning_rate": 5e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7609186172485352, + "num_tokens": 477631531.0, + "step": 18457 + }, + { + "epoch": 2.0270151548429607, + "grad_norm": 2.071254014968872, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7771425843238831, + "num_tokens": 477655704.0, + "step": 18458 + }, + { + "epoch": 2.0271249725455744, + "grad_norm": 1.9426145553588867, + "learning_rate": 5e-06, + "loss": 0.6571, + "mean_token_accuracy": 0.7797914743423462, + "num_tokens": 477682606.0, + "step": 18459 + }, + { + "epoch": 2.027234790248188, + "grad_norm": 2.330665111541748, + "learning_rate": 5e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7605106234550476, + "num_tokens": 477707328.0, + "step": 18460 + }, + { + "epoch": 2.0273446079508015, + "grad_norm": 2.261941432952881, + "learning_rate": 5e-06, + "loss": 0.6523, + "mean_token_accuracy": 0.7854166030883789, + "num_tokens": 477729305.0, + "step": 18461 + }, + { + "epoch": 2.0274544256534153, + "grad_norm": 2.0103600025177, + "learning_rate": 5e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.73613440990448, + "num_tokens": 477758145.0, + "step": 18462 + }, + { + "epoch": 2.027564243356029, + "grad_norm": 2.044299364089966, + "learning_rate": 5e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7684340476989746, + "num_tokens": 477783405.0, + "step": 18463 + }, + { + "epoch": 2.027674061058643, + "grad_norm": 2.0040738582611084, + "learning_rate": 5e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7750082015991211, + "num_tokens": 477808955.0, + "step": 18464 + }, + { + "epoch": 2.027783878761256, + "grad_norm": 1.9973031282424927, + "learning_rate": 5e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7641292810440063, + "num_tokens": 477837108.0, + "step": 18465 + }, + { + "epoch": 2.02789369646387, + "grad_norm": 1.9708811044692993, + "learning_rate": 5e-06, + "loss": 0.6782, + "mean_token_accuracy": 0.7765268087387085, + "num_tokens": 477864703.0, + "step": 18466 + }, + { + "epoch": 2.0280035141664836, + "grad_norm": 2.1265146732330322, + "learning_rate": 5e-06, + "loss": 0.6687, + "mean_token_accuracy": 0.7831122875213623, + "num_tokens": 477888462.0, + "step": 18467 + }, + { + "epoch": 2.0281133318690974, + "grad_norm": 1.871160626411438, + "learning_rate": 5e-06, + "loss": 0.675, + "mean_token_accuracy": 0.7708954811096191, + "num_tokens": 477917397.0, + "step": 18468 + }, + { + "epoch": 2.028223149571711, + "grad_norm": 2.1922707557678223, + "learning_rate": 5e-06, + "loss": 0.6302, + "mean_token_accuracy": 0.7905504703521729, + "num_tokens": 477938708.0, + "step": 18469 + }, + { + "epoch": 2.0283329672743244, + "grad_norm": 2.2314767837524414, + "learning_rate": 5e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.7795196175575256, + "num_tokens": 477960960.0, + "step": 18470 + }, + { + "epoch": 2.028442784976938, + "grad_norm": 2.0333058834075928, + "learning_rate": 5e-06, + "loss": 0.6116, + "mean_token_accuracy": 0.7985101938247681, + "num_tokens": 477985564.0, + "step": 18471 + }, + { + "epoch": 2.028552602679552, + "grad_norm": 1.8711671829223633, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7721906304359436, + "num_tokens": 478013574.0, + "step": 18472 + }, + { + "epoch": 2.0286624203821657, + "grad_norm": 2.141085386276245, + "learning_rate": 5e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8092899918556213, + "num_tokens": 478036034.0, + "step": 18473 + }, + { + "epoch": 2.0287722380847795, + "grad_norm": 2.1569950580596924, + "learning_rate": 5e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.7761439085006714, + "num_tokens": 478060068.0, + "step": 18474 + }, + { + "epoch": 2.028882055787393, + "grad_norm": 2.0668246746063232, + "learning_rate": 5e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7591208219528198, + "num_tokens": 478087526.0, + "step": 18475 + }, + { + "epoch": 2.0289918734900065, + "grad_norm": 2.1202266216278076, + "learning_rate": 5e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7660301327705383, + "num_tokens": 478111731.0, + "step": 18476 + }, + { + "epoch": 2.0291016911926203, + "grad_norm": 2.06848406791687, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7561830878257751, + "num_tokens": 478138424.0, + "step": 18477 + }, + { + "epoch": 2.029211508895234, + "grad_norm": 2.254436492919922, + "learning_rate": 5e-06, + "loss": 0.6345, + "mean_token_accuracy": 0.7895191311836243, + "num_tokens": 478158706.0, + "step": 18478 + }, + { + "epoch": 2.0293213265978474, + "grad_norm": 2.351097345352173, + "learning_rate": 5e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.7985881567001343, + "num_tokens": 478177522.0, + "step": 18479 + }, + { + "epoch": 2.029431144300461, + "grad_norm": 2.055253028869629, + "learning_rate": 5e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.7883645296096802, + "num_tokens": 478202900.0, + "step": 18480 + }, + { + "epoch": 2.029540962003075, + "grad_norm": 2.0657477378845215, + "learning_rate": 5e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.8004302978515625, + "num_tokens": 478225888.0, + "step": 18481 + }, + { + "epoch": 2.0296507797056886, + "grad_norm": 1.9749281406402588, + "learning_rate": 5e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7569913864135742, + "num_tokens": 478252229.0, + "step": 18482 + }, + { + "epoch": 2.0297605974083024, + "grad_norm": 2.2753329277038574, + "learning_rate": 5e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7590891122817993, + "num_tokens": 478276855.0, + "step": 18483 + }, + { + "epoch": 2.0298704151109157, + "grad_norm": 2.203350067138672, + "learning_rate": 5e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.7763432860374451, + "num_tokens": 478300545.0, + "step": 18484 + }, + { + "epoch": 2.0299802328135295, + "grad_norm": 2.209786891937256, + "learning_rate": 5e-06, + "loss": 0.6949, + "mean_token_accuracy": 0.767654538154602, + "num_tokens": 478325960.0, + "step": 18485 + }, + { + "epoch": 2.030090050516143, + "grad_norm": 2.3558924198150635, + "learning_rate": 5e-06, + "loss": 0.6709, + "mean_token_accuracy": 0.7836084365844727, + "num_tokens": 478346066.0, + "step": 18486 + }, + { + "epoch": 2.030199868218757, + "grad_norm": 1.867132544517517, + "learning_rate": 5e-06, + "loss": 0.6955, + "mean_token_accuracy": 0.7690297961235046, + "num_tokens": 478376748.0, + "step": 18487 + }, + { + "epoch": 2.0303096859213707, + "grad_norm": 2.109400987625122, + "learning_rate": 5e-06, + "loss": 0.6375, + "mean_token_accuracy": 0.789385199546814, + "num_tokens": 478399394.0, + "step": 18488 + }, + { + "epoch": 2.030419503623984, + "grad_norm": 2.1729416847229004, + "learning_rate": 5e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.8145575523376465, + "num_tokens": 478419292.0, + "step": 18489 + }, + { + "epoch": 2.030529321326598, + "grad_norm": 1.9518496990203857, + "learning_rate": 5e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7742480039596558, + "num_tokens": 478447869.0, + "step": 18490 + }, + { + "epoch": 2.0306391390292116, + "grad_norm": 1.9564025402069092, + "learning_rate": 5e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7625846862792969, + "num_tokens": 478476728.0, + "step": 18491 + }, + { + "epoch": 2.0307489567318253, + "grad_norm": 2.8221275806427, + "learning_rate": 5e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.7798742055892944, + "num_tokens": 478492737.0, + "step": 18492 + }, + { + "epoch": 2.0308587744344386, + "grad_norm": 1.9972319602966309, + "learning_rate": 5e-06, + "loss": 0.6472, + "mean_token_accuracy": 0.778963565826416, + "num_tokens": 478518665.0, + "step": 18493 + }, + { + "epoch": 2.0309685921370524, + "grad_norm": 2.246737241744995, + "learning_rate": 5e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.7792723774909973, + "num_tokens": 478542039.0, + "step": 18494 + }, + { + "epoch": 2.031078409839666, + "grad_norm": 1.8232200145721436, + "learning_rate": 5e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7555125951766968, + "num_tokens": 478574487.0, + "step": 18495 + }, + { + "epoch": 2.03118822754228, + "grad_norm": 2.132702589035034, + "learning_rate": 5e-06, + "loss": 0.6327, + "mean_token_accuracy": 0.7844235301017761, + "num_tokens": 478598306.0, + "step": 18496 + }, + { + "epoch": 2.0312980452448937, + "grad_norm": 1.9303969144821167, + "learning_rate": 5e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.782191812992096, + "num_tokens": 478623384.0, + "step": 18497 + }, + { + "epoch": 2.031407862947507, + "grad_norm": 1.8968209028244019, + "learning_rate": 5e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.7653258442878723, + "num_tokens": 478652203.0, + "step": 18498 + }, + { + "epoch": 2.0315176806501207, + "grad_norm": 1.908482551574707, + "learning_rate": 5e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7446730136871338, + "num_tokens": 478683703.0, + "step": 18499 + }, + { + "epoch": 2.0316274983527345, + "grad_norm": 2.0165867805480957, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7615303993225098, + "num_tokens": 478711251.0, + "step": 18500 + }, + { + "epoch": 2.0317373160553482, + "grad_norm": 1.9846616983413696, + "learning_rate": 5e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7481563091278076, + "num_tokens": 478738571.0, + "step": 18501 + }, + { + "epoch": 2.031847133757962, + "grad_norm": 1.8198988437652588, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7483267784118652, + "num_tokens": 478770753.0, + "step": 18502 + }, + { + "epoch": 2.0319569514605753, + "grad_norm": 2.0787065029144287, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7783788442611694, + "num_tokens": 478794609.0, + "step": 18503 + }, + { + "epoch": 2.032066769163189, + "grad_norm": 2.2661962509155273, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.770327091217041, + "num_tokens": 478818131.0, + "step": 18504 + }, + { + "epoch": 2.032176586865803, + "grad_norm": 2.080535650253296, + "learning_rate": 5e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7712881565093994, + "num_tokens": 478843648.0, + "step": 18505 + }, + { + "epoch": 2.0322864045684166, + "grad_norm": 1.988858938217163, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7632633447647095, + "num_tokens": 478874237.0, + "step": 18506 + }, + { + "epoch": 2.03239622227103, + "grad_norm": 2.3156144618988037, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7677375078201294, + "num_tokens": 478894237.0, + "step": 18507 + }, + { + "epoch": 2.0325060399736437, + "grad_norm": 1.770090937614441, + "learning_rate": 5e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7676366567611694, + "num_tokens": 478924054.0, + "step": 18508 + }, + { + "epoch": 2.0326158576762574, + "grad_norm": 1.9707565307617188, + "learning_rate": 5e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.7763165831565857, + "num_tokens": 478953076.0, + "step": 18509 + }, + { + "epoch": 2.032725675378871, + "grad_norm": 2.069610595703125, + "learning_rate": 5e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7582477331161499, + "num_tokens": 478978187.0, + "step": 18510 + }, + { + "epoch": 2.032835493081485, + "grad_norm": 2.1558046340942383, + "learning_rate": 5e-06, + "loss": 0.672, + "mean_token_accuracy": 0.7761964797973633, + "num_tokens": 479002633.0, + "step": 18511 + }, + { + "epoch": 2.0329453107840982, + "grad_norm": 2.0404748916625977, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7561007142066956, + "num_tokens": 479030287.0, + "step": 18512 + }, + { + "epoch": 2.033055128486712, + "grad_norm": 2.4941024780273438, + "learning_rate": 5e-06, + "loss": 0.6108, + "mean_token_accuracy": 0.7954416275024414, + "num_tokens": 479049122.0, + "step": 18513 + }, + { + "epoch": 2.0331649461893258, + "grad_norm": 2.0916152000427246, + "learning_rate": 5e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7663228511810303, + "num_tokens": 479072445.0, + "step": 18514 + }, + { + "epoch": 2.0332747638919395, + "grad_norm": 2.1609950065612793, + "learning_rate": 5e-06, + "loss": 0.6448, + "mean_token_accuracy": 0.7843695878982544, + "num_tokens": 479095437.0, + "step": 18515 + }, + { + "epoch": 2.033384581594553, + "grad_norm": 1.9235798120498657, + "learning_rate": 5e-06, + "loss": 0.6521, + "mean_token_accuracy": 0.7845932245254517, + "num_tokens": 479122195.0, + "step": 18516 + }, + { + "epoch": 2.0334943992971666, + "grad_norm": 2.325491428375244, + "learning_rate": 5e-06, + "loss": 0.6497, + "mean_token_accuracy": 0.7895135879516602, + "num_tokens": 479142948.0, + "step": 18517 + }, + { + "epoch": 2.0336042169997803, + "grad_norm": 2.0278472900390625, + "learning_rate": 5e-06, + "loss": 0.701, + "mean_token_accuracy": 0.7688103914260864, + "num_tokens": 479172927.0, + "step": 18518 + }, + { + "epoch": 2.033714034702394, + "grad_norm": 2.239614486694336, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.770046591758728, + "num_tokens": 479196197.0, + "step": 18519 + }, + { + "epoch": 2.033823852405008, + "grad_norm": 2.0869088172912598, + "learning_rate": 5e-06, + "loss": 0.6571, + "mean_token_accuracy": 0.7803881764411926, + "num_tokens": 479219346.0, + "step": 18520 + }, + { + "epoch": 2.033933670107621, + "grad_norm": 1.9203429222106934, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7544565796852112, + "num_tokens": 479247936.0, + "step": 18521 + }, + { + "epoch": 2.034043487810235, + "grad_norm": 2.4426262378692627, + "learning_rate": 5e-06, + "loss": 0.5619, + "mean_token_accuracy": 0.8113179206848145, + "num_tokens": 479265458.0, + "step": 18522 + }, + { + "epoch": 2.0341533055128487, + "grad_norm": 2.102567434310913, + "learning_rate": 5e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8072336316108704, + "num_tokens": 479290514.0, + "step": 18523 + }, + { + "epoch": 2.0342631232154624, + "grad_norm": 2.147083044052124, + "learning_rate": 5e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7813979983329773, + "num_tokens": 479316287.0, + "step": 18524 + }, + { + "epoch": 2.034372940918076, + "grad_norm": 2.0240936279296875, + "learning_rate": 5e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7730686664581299, + "num_tokens": 479342775.0, + "step": 18525 + }, + { + "epoch": 2.0344827586206895, + "grad_norm": 1.9237983226776123, + "learning_rate": 5e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7655639052391052, + "num_tokens": 479370498.0, + "step": 18526 + }, + { + "epoch": 2.0345925763233033, + "grad_norm": 2.324418783187866, + "learning_rate": 5e-06, + "loss": 0.6631, + "mean_token_accuracy": 0.7846909761428833, + "num_tokens": 479392119.0, + "step": 18527 + }, + { + "epoch": 2.034702394025917, + "grad_norm": 2.048100709915161, + "learning_rate": 5e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7609126567840576, + "num_tokens": 479420712.0, + "step": 18528 + }, + { + "epoch": 2.0348122117285308, + "grad_norm": 2.0848069190979004, + "learning_rate": 5e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7664088010787964, + "num_tokens": 479447913.0, + "step": 18529 + }, + { + "epoch": 2.034922029431144, + "grad_norm": 1.979858636856079, + "learning_rate": 5e-06, + "loss": 0.6629, + "mean_token_accuracy": 0.7895129919052124, + "num_tokens": 479473283.0, + "step": 18530 + }, + { + "epoch": 2.035031847133758, + "grad_norm": 1.8830068111419678, + "learning_rate": 5e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7483972311019897, + "num_tokens": 479502303.0, + "step": 18531 + }, + { + "epoch": 2.0351416648363716, + "grad_norm": 1.894898772239685, + "learning_rate": 5e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.7746824622154236, + "num_tokens": 479530472.0, + "step": 18532 + }, + { + "epoch": 2.0352514825389854, + "grad_norm": 2.0254995822906494, + "learning_rate": 5e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.757926344871521, + "num_tokens": 479559003.0, + "step": 18533 + }, + { + "epoch": 2.035361300241599, + "grad_norm": 2.162883996963501, + "learning_rate": 5e-06, + "loss": 0.6099, + "mean_token_accuracy": 0.795905590057373, + "num_tokens": 479583271.0, + "step": 18534 + }, + { + "epoch": 2.0354711179442124, + "grad_norm": 2.013730049133301, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7661991119384766, + "num_tokens": 479612716.0, + "step": 18535 + }, + { + "epoch": 2.035580935646826, + "grad_norm": 1.9470864534378052, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7605383992195129, + "num_tokens": 479643493.0, + "step": 18536 + }, + { + "epoch": 2.03569075334944, + "grad_norm": 2.0187408924102783, + "learning_rate": 5e-06, + "loss": 0.6667, + "mean_token_accuracy": 0.781078577041626, + "num_tokens": 479668192.0, + "step": 18537 + }, + { + "epoch": 2.0358005710520537, + "grad_norm": 1.967523217201233, + "learning_rate": 5e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7785906791687012, + "num_tokens": 479699695.0, + "step": 18538 + }, + { + "epoch": 2.0359103887546675, + "grad_norm": 1.9305431842803955, + "learning_rate": 5e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7803370952606201, + "num_tokens": 479725342.0, + "step": 18539 + }, + { + "epoch": 2.0360202064572808, + "grad_norm": 2.1830742359161377, + "learning_rate": 5e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.7708195447921753, + "num_tokens": 479749421.0, + "step": 18540 + }, + { + "epoch": 2.0361300241598945, + "grad_norm": 2.1637308597564697, + "learning_rate": 5e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.7674877047538757, + "num_tokens": 479775002.0, + "step": 18541 + }, + { + "epoch": 2.0362398418625083, + "grad_norm": 2.133293390274048, + "learning_rate": 5e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.7672705054283142, + "num_tokens": 479798179.0, + "step": 18542 + }, + { + "epoch": 2.036349659565122, + "grad_norm": 2.014286518096924, + "learning_rate": 5e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7780055403709412, + "num_tokens": 479826416.0, + "step": 18543 + }, + { + "epoch": 2.0364594772677354, + "grad_norm": 2.61979079246521, + "learning_rate": 5e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.795491635799408, + "num_tokens": 479844111.0, + "step": 18544 + }, + { + "epoch": 2.036569294970349, + "grad_norm": 2.3217527866363525, + "learning_rate": 5e-06, + "loss": 0.6381, + "mean_token_accuracy": 0.7960922718048096, + "num_tokens": 479863308.0, + "step": 18545 + }, + { + "epoch": 2.036679112672963, + "grad_norm": 2.1939079761505127, + "learning_rate": 5e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7775088548660278, + "num_tokens": 479888650.0, + "step": 18546 + }, + { + "epoch": 2.0367889303755766, + "grad_norm": 2.16550350189209, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7725008726119995, + "num_tokens": 479913655.0, + "step": 18547 + }, + { + "epoch": 2.0368987480781904, + "grad_norm": 2.05462646484375, + "learning_rate": 5e-06, + "loss": 0.6411, + "mean_token_accuracy": 0.7890419960021973, + "num_tokens": 479936239.0, + "step": 18548 + }, + { + "epoch": 2.0370085657808037, + "grad_norm": 2.1778242588043213, + "learning_rate": 5e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7557174563407898, + "num_tokens": 479963241.0, + "step": 18549 + }, + { + "epoch": 2.0371183834834174, + "grad_norm": 2.170663833618164, + "learning_rate": 5e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7787869572639465, + "num_tokens": 479987244.0, + "step": 18550 + }, + { + "epoch": 2.037228201186031, + "grad_norm": 1.9781992435455322, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.76686692237854, + "num_tokens": 480017544.0, + "step": 18551 + }, + { + "epoch": 2.037338018888645, + "grad_norm": 2.0804905891418457, + "learning_rate": 5e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7591807246208191, + "num_tokens": 480044714.0, + "step": 18552 + }, + { + "epoch": 2.0374478365912587, + "grad_norm": 2.068199396133423, + "learning_rate": 5e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7718129754066467, + "num_tokens": 480072172.0, + "step": 18553 + }, + { + "epoch": 2.037557654293872, + "grad_norm": 2.5203018188476562, + "learning_rate": 5e-06, + "loss": 0.631, + "mean_token_accuracy": 0.7873477935791016, + "num_tokens": 480093851.0, + "step": 18554 + }, + { + "epoch": 2.037667471996486, + "grad_norm": 1.9068852663040161, + "learning_rate": 5e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7670726776123047, + "num_tokens": 480122312.0, + "step": 18555 + }, + { + "epoch": 2.0377772896990995, + "grad_norm": 2.115335702896118, + "learning_rate": 5e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7642561197280884, + "num_tokens": 480147180.0, + "step": 18556 + }, + { + "epoch": 2.0378871074017133, + "grad_norm": 1.9395990371704102, + "learning_rate": 5e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.784420371055603, + "num_tokens": 480174964.0, + "step": 18557 + }, + { + "epoch": 2.0379969251043266, + "grad_norm": 1.7956446409225464, + "learning_rate": 5e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7765108346939087, + "num_tokens": 480209069.0, + "step": 18558 + }, + { + "epoch": 2.0381067428069404, + "grad_norm": 1.9699530601501465, + "learning_rate": 5e-06, + "loss": 0.6754, + "mean_token_accuracy": 0.7775915861129761, + "num_tokens": 480233650.0, + "step": 18559 + }, + { + "epoch": 2.038216560509554, + "grad_norm": 1.833185076713562, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7604734897613525, + "num_tokens": 480264143.0, + "step": 18560 + }, + { + "epoch": 2.038326378212168, + "grad_norm": 2.013151168823242, + "learning_rate": 5e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.7805115580558777, + "num_tokens": 480292906.0, + "step": 18561 + }, + { + "epoch": 2.0384361959147816, + "grad_norm": 2.1229991912841797, + "learning_rate": 5e-06, + "loss": 0.6528, + "mean_token_accuracy": 0.7819374203681946, + "num_tokens": 480318067.0, + "step": 18562 + }, + { + "epoch": 2.038546013617395, + "grad_norm": 2.205975294113159, + "learning_rate": 5e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.767524003982544, + "num_tokens": 480342305.0, + "step": 18563 + }, + { + "epoch": 2.0386558313200087, + "grad_norm": 2.0231781005859375, + "learning_rate": 5e-06, + "loss": 0.6432, + "mean_token_accuracy": 0.7844721078872681, + "num_tokens": 480368187.0, + "step": 18564 + }, + { + "epoch": 2.0387656490226225, + "grad_norm": 1.9463146924972534, + "learning_rate": 5e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.7856841087341309, + "num_tokens": 480394429.0, + "step": 18565 + }, + { + "epoch": 2.0388754667252362, + "grad_norm": 2.061373710632324, + "learning_rate": 5e-06, + "loss": 0.6667, + "mean_token_accuracy": 0.7875304222106934, + "num_tokens": 480417917.0, + "step": 18566 + }, + { + "epoch": 2.03898528442785, + "grad_norm": 2.6278536319732666, + "learning_rate": 5e-06, + "loss": 0.6182, + "mean_token_accuracy": 0.7903963327407837, + "num_tokens": 480436780.0, + "step": 18567 + }, + { + "epoch": 2.0390951021304633, + "grad_norm": 2.0679948329925537, + "learning_rate": 5e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7659343481063843, + "num_tokens": 480462339.0, + "step": 18568 + }, + { + "epoch": 2.039204919833077, + "grad_norm": 2.0594441890716553, + "learning_rate": 5e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7689175605773926, + "num_tokens": 480488698.0, + "step": 18569 + }, + { + "epoch": 2.039314737535691, + "grad_norm": 2.1936559677124023, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7598848938941956, + "num_tokens": 480512673.0, + "step": 18570 + }, + { + "epoch": 2.0394245552383046, + "grad_norm": 1.8502229452133179, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7419685125350952, + "num_tokens": 480547320.0, + "step": 18571 + }, + { + "epoch": 2.039534372940918, + "grad_norm": 2.13655686378479, + "learning_rate": 5e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8058773875236511, + "num_tokens": 480568992.0, + "step": 18572 + }, + { + "epoch": 2.0396441906435316, + "grad_norm": 2.141885757446289, + "learning_rate": 5e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.7571455836296082, + "num_tokens": 480594541.0, + "step": 18573 + }, + { + "epoch": 2.0397540083461454, + "grad_norm": 2.1857314109802246, + "learning_rate": 5e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7507862448692322, + "num_tokens": 480619584.0, + "step": 18574 + }, + { + "epoch": 2.039863826048759, + "grad_norm": 1.8126869201660156, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7716107368469238, + "num_tokens": 480648454.0, + "step": 18575 + }, + { + "epoch": 2.039973643751373, + "grad_norm": 2.2359752655029297, + "learning_rate": 5e-06, + "loss": 0.6156, + "mean_token_accuracy": 0.795111358165741, + "num_tokens": 480669681.0, + "step": 18576 + }, + { + "epoch": 2.0400834614539862, + "grad_norm": 2.010315418243408, + "learning_rate": 5e-06, + "loss": 0.6959, + "mean_token_accuracy": 0.7721461653709412, + "num_tokens": 480694879.0, + "step": 18577 + }, + { + "epoch": 2.0401932791566, + "grad_norm": 2.1296868324279785, + "learning_rate": 5e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.7792526483535767, + "num_tokens": 480720283.0, + "step": 18578 + }, + { + "epoch": 2.0403030968592137, + "grad_norm": 2.078622817993164, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7840358018875122, + "num_tokens": 480745541.0, + "step": 18579 + }, + { + "epoch": 2.0404129145618275, + "grad_norm": 2.188533306121826, + "learning_rate": 5e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7599766850471497, + "num_tokens": 480767883.0, + "step": 18580 + }, + { + "epoch": 2.0405227322644413, + "grad_norm": 1.9972126483917236, + "learning_rate": 5e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7607552409172058, + "num_tokens": 480798408.0, + "step": 18581 + }, + { + "epoch": 2.0406325499670546, + "grad_norm": 2.0310864448547363, + "learning_rate": 5e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7652127742767334, + "num_tokens": 480826723.0, + "step": 18582 + }, + { + "epoch": 2.0407423676696683, + "grad_norm": 2.372593879699707, + "learning_rate": 5e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7791730165481567, + "num_tokens": 480847596.0, + "step": 18583 + }, + { + "epoch": 2.040852185372282, + "grad_norm": 2.1549017429351807, + "learning_rate": 5e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.7707687616348267, + "num_tokens": 480871986.0, + "step": 18584 + }, + { + "epoch": 2.040962003074896, + "grad_norm": 2.11948823928833, + "learning_rate": 5e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.7800748348236084, + "num_tokens": 480894129.0, + "step": 18585 + }, + { + "epoch": 2.041071820777509, + "grad_norm": 2.0352959632873535, + "learning_rate": 5e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7561451196670532, + "num_tokens": 480921525.0, + "step": 18586 + }, + { + "epoch": 2.041181638480123, + "grad_norm": 1.910237431526184, + "learning_rate": 5e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7721706628799438, + "num_tokens": 480951117.0, + "step": 18587 + }, + { + "epoch": 2.0412914561827367, + "grad_norm": 2.1004600524902344, + "learning_rate": 5e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7583924531936646, + "num_tokens": 480980982.0, + "step": 18588 + }, + { + "epoch": 2.0414012738853504, + "grad_norm": 1.8039870262145996, + "learning_rate": 5e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7342907190322876, + "num_tokens": 481012841.0, + "step": 18589 + }, + { + "epoch": 2.041511091587964, + "grad_norm": 2.311366558074951, + "learning_rate": 5e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7728396654129028, + "num_tokens": 481035419.0, + "step": 18590 + }, + { + "epoch": 2.0416209092905775, + "grad_norm": 1.9728187322616577, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7685730457305908, + "num_tokens": 481063734.0, + "step": 18591 + }, + { + "epoch": 2.0417307269931912, + "grad_norm": 1.8896125555038452, + "learning_rate": 5e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7620567083358765, + "num_tokens": 481093646.0, + "step": 18592 + }, + { + "epoch": 2.041840544695805, + "grad_norm": 2.300243377685547, + "learning_rate": 5e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7587114572525024, + "num_tokens": 481118135.0, + "step": 18593 + }, + { + "epoch": 2.0419503623984188, + "grad_norm": 2.0562572479248047, + "learning_rate": 5e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7554805874824524, + "num_tokens": 481146754.0, + "step": 18594 + }, + { + "epoch": 2.042060180101032, + "grad_norm": 2.099207878112793, + "learning_rate": 5e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.7774664163589478, + "num_tokens": 481171130.0, + "step": 18595 + }, + { + "epoch": 2.042169997803646, + "grad_norm": 1.832846760749817, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7575494647026062, + "num_tokens": 481203224.0, + "step": 18596 + }, + { + "epoch": 2.0422798155062596, + "grad_norm": 1.8798279762268066, + "learning_rate": 5e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7373784780502319, + "num_tokens": 481238949.0, + "step": 18597 + }, + { + "epoch": 2.0423896332088733, + "grad_norm": 2.0450398921966553, + "learning_rate": 5e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7691329121589661, + "num_tokens": 481264675.0, + "step": 18598 + }, + { + "epoch": 2.042499450911487, + "grad_norm": 1.8138155937194824, + "learning_rate": 5e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.7744139432907104, + "num_tokens": 481297585.0, + "step": 18599 + }, + { + "epoch": 2.0426092686141004, + "grad_norm": 1.8749642372131348, + "learning_rate": 5e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7703449726104736, + "num_tokens": 481328684.0, + "step": 18600 + }, + { + "epoch": 2.042719086316714, + "grad_norm": 2.0329339504241943, + "learning_rate": 5e-06, + "loss": 0.6811, + "mean_token_accuracy": 0.774703860282898, + "num_tokens": 481354685.0, + "step": 18601 + }, + { + "epoch": 2.042828904019328, + "grad_norm": 2.1358163356781006, + "learning_rate": 5e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7817211151123047, + "num_tokens": 481379940.0, + "step": 18602 + }, + { + "epoch": 2.0429387217219417, + "grad_norm": 2.163867235183716, + "learning_rate": 5e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7593163251876831, + "num_tokens": 481404194.0, + "step": 18603 + }, + { + "epoch": 2.0430485394245554, + "grad_norm": 2.1369338035583496, + "learning_rate": 5e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7611854076385498, + "num_tokens": 481430834.0, + "step": 18604 + }, + { + "epoch": 2.0431583571271688, + "grad_norm": 2.543194055557251, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.769170880317688, + "num_tokens": 481449917.0, + "step": 18605 + }, + { + "epoch": 2.0432681748297825, + "grad_norm": 1.9078096151351929, + "learning_rate": 5e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.7443423271179199, + "num_tokens": 481484344.0, + "step": 18606 + }, + { + "epoch": 2.0433779925323963, + "grad_norm": 1.8848702907562256, + "learning_rate": 5e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.74957674741745, + "num_tokens": 481519356.0, + "step": 18607 + }, + { + "epoch": 2.04348781023501, + "grad_norm": 2.1311235427856445, + "learning_rate": 5e-06, + "loss": 0.6692, + "mean_token_accuracy": 0.7842944264411926, + "num_tokens": 481542220.0, + "step": 18608 + }, + { + "epoch": 2.0435976279376233, + "grad_norm": 1.9408208131790161, + "learning_rate": 5e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7598838806152344, + "num_tokens": 481572210.0, + "step": 18609 + }, + { + "epoch": 2.043707445640237, + "grad_norm": 2.1617164611816406, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7782236337661743, + "num_tokens": 481595755.0, + "step": 18610 + }, + { + "epoch": 2.043817263342851, + "grad_norm": 2.2584946155548096, + "learning_rate": 5e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7714133262634277, + "num_tokens": 481618628.0, + "step": 18611 + }, + { + "epoch": 2.0439270810454646, + "grad_norm": 2.0818700790405273, + "learning_rate": 5e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7686657905578613, + "num_tokens": 481644140.0, + "step": 18612 + }, + { + "epoch": 2.0440368987480784, + "grad_norm": 2.4521100521087646, + "learning_rate": 5e-06, + "loss": 0.6475, + "mean_token_accuracy": 0.7872165441513062, + "num_tokens": 481664889.0, + "step": 18613 + }, + { + "epoch": 2.0441467164506917, + "grad_norm": 1.987630009651184, + "learning_rate": 5e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.779949426651001, + "num_tokens": 481691722.0, + "step": 18614 + }, + { + "epoch": 2.0442565341533054, + "grad_norm": 2.04307222366333, + "learning_rate": 5e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8025728464126587, + "num_tokens": 481715147.0, + "step": 18615 + }, + { + "epoch": 2.044366351855919, + "grad_norm": 2.2325985431671143, + "learning_rate": 5e-06, + "loss": 0.6289, + "mean_token_accuracy": 0.7961679100990295, + "num_tokens": 481734130.0, + "step": 18616 + }, + { + "epoch": 2.044476169558533, + "grad_norm": 2.0910723209381104, + "learning_rate": 5e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7666079998016357, + "num_tokens": 481759148.0, + "step": 18617 + }, + { + "epoch": 2.0445859872611467, + "grad_norm": 1.8833181858062744, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7437458038330078, + "num_tokens": 481789775.0, + "step": 18618 + }, + { + "epoch": 2.04469580496376, + "grad_norm": 2.3047115802764893, + "learning_rate": 5e-06, + "loss": 0.5732, + "mean_token_accuracy": 0.808696448802948, + "num_tokens": 481809154.0, + "step": 18619 + }, + { + "epoch": 2.044805622666374, + "grad_norm": 1.9638686180114746, + "learning_rate": 5e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.768481433391571, + "num_tokens": 481839689.0, + "step": 18620 + }, + { + "epoch": 2.0449154403689875, + "grad_norm": 2.1129815578460693, + "learning_rate": 5e-06, + "loss": 0.6316, + "mean_token_accuracy": 0.7908402681350708, + "num_tokens": 481863286.0, + "step": 18621 + }, + { + "epoch": 2.0450252580716013, + "grad_norm": 2.183798313140869, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7877939343452454, + "num_tokens": 481887360.0, + "step": 18622 + }, + { + "epoch": 2.0451350757742146, + "grad_norm": 1.9986848831176758, + "learning_rate": 5e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.7783815264701843, + "num_tokens": 481914262.0, + "step": 18623 + }, + { + "epoch": 2.0452448934768284, + "grad_norm": 2.05122447013855, + "learning_rate": 5e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7473834753036499, + "num_tokens": 481944510.0, + "step": 18624 + }, + { + "epoch": 2.045354711179442, + "grad_norm": 2.043513536453247, + "learning_rate": 5e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.7951323986053467, + "num_tokens": 481970647.0, + "step": 18625 + }, + { + "epoch": 2.045464528882056, + "grad_norm": 2.1244418621063232, + "learning_rate": 5e-06, + "loss": 0.6596, + "mean_token_accuracy": 0.7870699167251587, + "num_tokens": 481994500.0, + "step": 18626 + }, + { + "epoch": 2.0455743465846696, + "grad_norm": 2.3175652027130127, + "learning_rate": 5e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.7811832427978516, + "num_tokens": 482015844.0, + "step": 18627 + }, + { + "epoch": 2.045684164287283, + "grad_norm": 2.0735418796539307, + "learning_rate": 5e-06, + "loss": 0.6522, + "mean_token_accuracy": 0.7882387638092041, + "num_tokens": 482041525.0, + "step": 18628 + }, + { + "epoch": 2.0457939819898967, + "grad_norm": 1.941904902458191, + "learning_rate": 5e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7699202299118042, + "num_tokens": 482073012.0, + "step": 18629 + }, + { + "epoch": 2.0459037996925105, + "grad_norm": 2.024589776992798, + "learning_rate": 5e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.7839632034301758, + "num_tokens": 482099281.0, + "step": 18630 + }, + { + "epoch": 2.046013617395124, + "grad_norm": 2.1138391494750977, + "learning_rate": 5e-06, + "loss": 0.659, + "mean_token_accuracy": 0.7808600664138794, + "num_tokens": 482121780.0, + "step": 18631 + }, + { + "epoch": 2.046123435097738, + "grad_norm": 2.1655991077423096, + "learning_rate": 5e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7641316652297974, + "num_tokens": 482148452.0, + "step": 18632 + }, + { + "epoch": 2.0462332528003513, + "grad_norm": 1.992901086807251, + "learning_rate": 5e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7635852098464966, + "num_tokens": 482176975.0, + "step": 18633 + }, + { + "epoch": 2.046343070502965, + "grad_norm": 1.8478819131851196, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7562379240989685, + "num_tokens": 482209050.0, + "step": 18634 + }, + { + "epoch": 2.046452888205579, + "grad_norm": 2.2701470851898193, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7678929567337036, + "num_tokens": 482233882.0, + "step": 18635 + }, + { + "epoch": 2.0465627059081926, + "grad_norm": 1.9669667482376099, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7775777578353882, + "num_tokens": 482260962.0, + "step": 18636 + }, + { + "epoch": 2.046672523610806, + "grad_norm": 2.16554594039917, + "learning_rate": 5e-06, + "loss": 0.6762, + "mean_token_accuracy": 0.7826602458953857, + "num_tokens": 482284961.0, + "step": 18637 + }, + { + "epoch": 2.0467823413134196, + "grad_norm": 2.165411949157715, + "learning_rate": 5e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.7691823840141296, + "num_tokens": 482308257.0, + "step": 18638 + }, + { + "epoch": 2.0468921590160334, + "grad_norm": 2.086371421813965, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7746819853782654, + "num_tokens": 482333516.0, + "step": 18639 + }, + { + "epoch": 2.047001976718647, + "grad_norm": 2.313706636428833, + "learning_rate": 5e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.7835516333580017, + "num_tokens": 482356883.0, + "step": 18640 + }, + { + "epoch": 2.047111794421261, + "grad_norm": 2.081794261932373, + "learning_rate": 5e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7671827077865601, + "num_tokens": 482382233.0, + "step": 18641 + }, + { + "epoch": 2.047221612123874, + "grad_norm": 2.050020694732666, + "learning_rate": 5e-06, + "loss": 0.6021, + "mean_token_accuracy": 0.7997646331787109, + "num_tokens": 482409406.0, + "step": 18642 + }, + { + "epoch": 2.047331429826488, + "grad_norm": 2.056835174560547, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7616375684738159, + "num_tokens": 482437430.0, + "step": 18643 + }, + { + "epoch": 2.0474412475291017, + "grad_norm": 1.95663583278656, + "learning_rate": 5e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7745358347892761, + "num_tokens": 482465903.0, + "step": 18644 + }, + { + "epoch": 2.0475510652317155, + "grad_norm": 2.215085983276367, + "learning_rate": 5e-06, + "loss": 0.6466, + "mean_token_accuracy": 0.7933935523033142, + "num_tokens": 482487405.0, + "step": 18645 + }, + { + "epoch": 2.047660882934329, + "grad_norm": 2.398329973220825, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7669111490249634, + "num_tokens": 482508544.0, + "step": 18646 + }, + { + "epoch": 2.0477707006369426, + "grad_norm": 1.9132862091064453, + "learning_rate": 5e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7493805885314941, + "num_tokens": 482538773.0, + "step": 18647 + }, + { + "epoch": 2.0478805183395563, + "grad_norm": 2.3889670372009277, + "learning_rate": 5e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.7846783399581909, + "num_tokens": 482563213.0, + "step": 18648 + }, + { + "epoch": 2.04799033604217, + "grad_norm": 2.022611141204834, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7658822536468506, + "num_tokens": 482589954.0, + "step": 18649 + }, + { + "epoch": 2.048100153744784, + "grad_norm": 2.322103500366211, + "learning_rate": 5e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7643999457359314, + "num_tokens": 482613040.0, + "step": 18650 + }, + { + "epoch": 2.048209971447397, + "grad_norm": 2.1459481716156006, + "learning_rate": 5e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7690807580947876, + "num_tokens": 482637624.0, + "step": 18651 + }, + { + "epoch": 2.048319789150011, + "grad_norm": 1.8859502077102661, + "learning_rate": 5e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7574158310890198, + "num_tokens": 482669725.0, + "step": 18652 + }, + { + "epoch": 2.0484296068526247, + "grad_norm": 1.9208608865737915, + "learning_rate": 5e-06, + "loss": 0.6646, + "mean_token_accuracy": 0.772994875907898, + "num_tokens": 482697730.0, + "step": 18653 + }, + { + "epoch": 2.0485394245552384, + "grad_norm": 2.0190863609313965, + "learning_rate": 5e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.778444230556488, + "num_tokens": 482723879.0, + "step": 18654 + }, + { + "epoch": 2.048649242257852, + "grad_norm": 1.9569180011749268, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7550241947174072, + "num_tokens": 482753104.0, + "step": 18655 + }, + { + "epoch": 2.0487590599604655, + "grad_norm": 2.34375, + "learning_rate": 5e-06, + "loss": 0.6562, + "mean_token_accuracy": 0.7822402715682983, + "num_tokens": 482775464.0, + "step": 18656 + }, + { + "epoch": 2.0488688776630792, + "grad_norm": 1.9865556955337524, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7639951705932617, + "num_tokens": 482804135.0, + "step": 18657 + }, + { + "epoch": 2.048978695365693, + "grad_norm": 2.207498550415039, + "learning_rate": 5e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7733404636383057, + "num_tokens": 482829725.0, + "step": 18658 + }, + { + "epoch": 2.0490885130683067, + "grad_norm": 1.9515960216522217, + "learning_rate": 5e-06, + "loss": 0.6199, + "mean_token_accuracy": 0.792914628982544, + "num_tokens": 482857339.0, + "step": 18659 + }, + { + "epoch": 2.04919833077092, + "grad_norm": 1.855976939201355, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7399308085441589, + "num_tokens": 482892058.0, + "step": 18660 + }, + { + "epoch": 2.049308148473534, + "grad_norm": 2.062718629837036, + "learning_rate": 5e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7617214918136597, + "num_tokens": 482917961.0, + "step": 18661 + }, + { + "epoch": 2.0494179661761476, + "grad_norm": 2.099064350128174, + "learning_rate": 5e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7632445096969604, + "num_tokens": 482942175.0, + "step": 18662 + }, + { + "epoch": 2.0495277838787613, + "grad_norm": 2.0450191497802734, + "learning_rate": 5e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7605729103088379, + "num_tokens": 482968870.0, + "step": 18663 + }, + { + "epoch": 2.049637601581375, + "grad_norm": 2.163027763366699, + "learning_rate": 5e-06, + "loss": 0.6824, + "mean_token_accuracy": 0.7740566730499268, + "num_tokens": 482993607.0, + "step": 18664 + }, + { + "epoch": 2.0497474192839884, + "grad_norm": 2.287423849105835, + "learning_rate": 5e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.7790026664733887, + "num_tokens": 483014039.0, + "step": 18665 + }, + { + "epoch": 2.049857236986602, + "grad_norm": 2.0025761127471924, + "learning_rate": 5e-06, + "loss": 0.6424, + "mean_token_accuracy": 0.7837260961532593, + "num_tokens": 483039772.0, + "step": 18666 + }, + { + "epoch": 2.049967054689216, + "grad_norm": 1.984721302986145, + "learning_rate": 5e-06, + "loss": 0.5982, + "mean_token_accuracy": 0.7980906367301941, + "num_tokens": 483063653.0, + "step": 18667 + }, + { + "epoch": 2.0500768723918297, + "grad_norm": 1.9049257040023804, + "learning_rate": 5e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7793222665786743, + "num_tokens": 483092764.0, + "step": 18668 + }, + { + "epoch": 2.0501866900944434, + "grad_norm": 2.163604736328125, + "learning_rate": 5e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.7869261503219604, + "num_tokens": 483116354.0, + "step": 18669 + }, + { + "epoch": 2.0502965077970567, + "grad_norm": 2.036046028137207, + "learning_rate": 5e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.7799121141433716, + "num_tokens": 483143324.0, + "step": 18670 + }, + { + "epoch": 2.0504063254996705, + "grad_norm": 1.9322761297225952, + "learning_rate": 5e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7692688703536987, + "num_tokens": 483173254.0, + "step": 18671 + }, + { + "epoch": 2.0505161432022843, + "grad_norm": 1.9825714826583862, + "learning_rate": 5e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7649334073066711, + "num_tokens": 483202872.0, + "step": 18672 + }, + { + "epoch": 2.050625960904898, + "grad_norm": 2.286407947540283, + "learning_rate": 5e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8181903958320618, + "num_tokens": 483223341.0, + "step": 18673 + }, + { + "epoch": 2.0507357786075113, + "grad_norm": 1.965580940246582, + "learning_rate": 5e-06, + "loss": 0.8081, + "mean_token_accuracy": 0.7457695603370667, + "num_tokens": 483253134.0, + "step": 18674 + }, + { + "epoch": 2.050845596310125, + "grad_norm": 2.0858535766601562, + "learning_rate": 5e-06, + "loss": 0.652, + "mean_token_accuracy": 0.7799219489097595, + "num_tokens": 483277042.0, + "step": 18675 + }, + { + "epoch": 2.050955414012739, + "grad_norm": 2.5489797592163086, + "learning_rate": 5e-06, + "loss": 0.6127, + "mean_token_accuracy": 0.7957015037536621, + "num_tokens": 483295616.0, + "step": 18676 + }, + { + "epoch": 2.0510652317153526, + "grad_norm": 2.2901270389556885, + "learning_rate": 5e-06, + "loss": 0.6417, + "mean_token_accuracy": 0.7814173102378845, + "num_tokens": 483315953.0, + "step": 18677 + }, + { + "epoch": 2.0511750494179664, + "grad_norm": 2.1164114475250244, + "learning_rate": 5e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.755416750907898, + "num_tokens": 483340154.0, + "step": 18678 + }, + { + "epoch": 2.0512848671205797, + "grad_norm": 2.0666797161102295, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.772492527961731, + "num_tokens": 483366226.0, + "step": 18679 + }, + { + "epoch": 2.0513946848231934, + "grad_norm": 2.102792978286743, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7638646364212036, + "num_tokens": 483391878.0, + "step": 18680 + }, + { + "epoch": 2.051504502525807, + "grad_norm": 2.0291035175323486, + "learning_rate": 5e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7725127339363098, + "num_tokens": 483420246.0, + "step": 18681 + }, + { + "epoch": 2.051614320228421, + "grad_norm": 1.9355435371398926, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7573297619819641, + "num_tokens": 483450217.0, + "step": 18682 + }, + { + "epoch": 2.0517241379310347, + "grad_norm": 1.9084810018539429, + "learning_rate": 5e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7467562556266785, + "num_tokens": 483479518.0, + "step": 18683 + }, + { + "epoch": 2.051833955633648, + "grad_norm": 2.006398916244507, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7789616584777832, + "num_tokens": 483504757.0, + "step": 18684 + }, + { + "epoch": 2.0519437733362618, + "grad_norm": 2.029433250427246, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7515848278999329, + "num_tokens": 483530701.0, + "step": 18685 + }, + { + "epoch": 2.0520535910388755, + "grad_norm": 2.180288791656494, + "learning_rate": 5e-06, + "loss": 0.5779, + "mean_token_accuracy": 0.8058321475982666, + "num_tokens": 483551482.0, + "step": 18686 + }, + { + "epoch": 2.0521634087414893, + "grad_norm": 1.9154473543167114, + "learning_rate": 5e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7601726055145264, + "num_tokens": 483580848.0, + "step": 18687 + }, + { + "epoch": 2.0522732264441026, + "grad_norm": 2.146425485610962, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7748217582702637, + "num_tokens": 483604826.0, + "step": 18688 + }, + { + "epoch": 2.0523830441467164, + "grad_norm": 2.078423261642456, + "learning_rate": 5e-06, + "loss": 0.6701, + "mean_token_accuracy": 0.7746066451072693, + "num_tokens": 483630160.0, + "step": 18689 + }, + { + "epoch": 2.05249286184933, + "grad_norm": 2.122260808944702, + "learning_rate": 5e-06, + "loss": 0.6199, + "mean_token_accuracy": 0.7986798882484436, + "num_tokens": 483654815.0, + "step": 18690 + }, + { + "epoch": 2.052602679551944, + "grad_norm": 2.0495049953460693, + "learning_rate": 5e-06, + "loss": 0.6705, + "mean_token_accuracy": 0.7743597030639648, + "num_tokens": 483681300.0, + "step": 18691 + }, + { + "epoch": 2.0527124972545576, + "grad_norm": 2.13217830657959, + "learning_rate": 5e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7654284834861755, + "num_tokens": 483705707.0, + "step": 18692 + }, + { + "epoch": 2.052822314957171, + "grad_norm": 2.5659663677215576, + "learning_rate": 5e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.8023830056190491, + "num_tokens": 483723648.0, + "step": 18693 + }, + { + "epoch": 2.0529321326597847, + "grad_norm": 2.221776008605957, + "learning_rate": 5e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.7889781594276428, + "num_tokens": 483747066.0, + "step": 18694 + }, + { + "epoch": 2.0530419503623984, + "grad_norm": 2.3910303115844727, + "learning_rate": 5e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.7827630639076233, + "num_tokens": 483770103.0, + "step": 18695 + }, + { + "epoch": 2.053151768065012, + "grad_norm": 2.170447826385498, + "learning_rate": 5e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.790785551071167, + "num_tokens": 483794188.0, + "step": 18696 + }, + { + "epoch": 2.0532615857676255, + "grad_norm": 1.7596417665481567, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7596635222434998, + "num_tokens": 483830094.0, + "step": 18697 + }, + { + "epoch": 2.0533714034702393, + "grad_norm": 1.9590725898742676, + "learning_rate": 5e-06, + "loss": 0.5954, + "mean_token_accuracy": 0.8020215630531311, + "num_tokens": 483853599.0, + "step": 18698 + }, + { + "epoch": 2.053481221172853, + "grad_norm": 2.0309712886810303, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7520277500152588, + "num_tokens": 483883487.0, + "step": 18699 + }, + { + "epoch": 2.053591038875467, + "grad_norm": 1.9877254962921143, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7549514770507812, + "num_tokens": 483914542.0, + "step": 18700 + }, + { + "epoch": 2.0537008565780805, + "grad_norm": 2.1879401206970215, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7677527666091919, + "num_tokens": 483938821.0, + "step": 18701 + }, + { + "epoch": 2.053810674280694, + "grad_norm": 1.8008676767349243, + "learning_rate": 5e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.789004921913147, + "num_tokens": 483972511.0, + "step": 18702 + }, + { + "epoch": 2.0539204919833076, + "grad_norm": 2.17423939704895, + "learning_rate": 5e-06, + "loss": 0.6638, + "mean_token_accuracy": 0.7753608822822571, + "num_tokens": 483997440.0, + "step": 18703 + }, + { + "epoch": 2.0540303096859214, + "grad_norm": 2.183769702911377, + "learning_rate": 5e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.8031039237976074, + "num_tokens": 484018077.0, + "step": 18704 + }, + { + "epoch": 2.054140127388535, + "grad_norm": 1.8187131881713867, + "learning_rate": 5e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7663836479187012, + "num_tokens": 484050832.0, + "step": 18705 + }, + { + "epoch": 2.054249945091149, + "grad_norm": 1.7552767992019653, + "learning_rate": 5e-06, + "loss": 0.6707, + "mean_token_accuracy": 0.7771680951118469, + "num_tokens": 484078684.0, + "step": 18706 + }, + { + "epoch": 2.054359762793762, + "grad_norm": 1.9874022006988525, + "learning_rate": 5e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7457199096679688, + "num_tokens": 484107758.0, + "step": 18707 + }, + { + "epoch": 2.054469580496376, + "grad_norm": 2.0354344844818115, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7751156687736511, + "num_tokens": 484134076.0, + "step": 18708 + }, + { + "epoch": 2.0545793981989897, + "grad_norm": 2.240255355834961, + "learning_rate": 5e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.773219645023346, + "num_tokens": 484157292.0, + "step": 18709 + }, + { + "epoch": 2.0546892159016035, + "grad_norm": 1.988511323928833, + "learning_rate": 5e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7274532318115234, + "num_tokens": 484188450.0, + "step": 18710 + }, + { + "epoch": 2.054799033604217, + "grad_norm": 2.1473422050476074, + "learning_rate": 5e-06, + "loss": 0.6245, + "mean_token_accuracy": 0.791191041469574, + "num_tokens": 484213772.0, + "step": 18711 + }, + { + "epoch": 2.0549088513068305, + "grad_norm": 1.9270164966583252, + "learning_rate": 5e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7535704374313354, + "num_tokens": 484244796.0, + "step": 18712 + }, + { + "epoch": 2.0550186690094443, + "grad_norm": 2.2992684841156006, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7723698019981384, + "num_tokens": 484266256.0, + "step": 18713 + }, + { + "epoch": 2.055128486712058, + "grad_norm": 2.0061585903167725, + "learning_rate": 5e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7495526075363159, + "num_tokens": 484292825.0, + "step": 18714 + }, + { + "epoch": 2.055238304414672, + "grad_norm": 2.0988929271698, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7779628038406372, + "num_tokens": 484319305.0, + "step": 18715 + }, + { + "epoch": 2.055348122117285, + "grad_norm": 1.9775162935256958, + "learning_rate": 5e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.7983402013778687, + "num_tokens": 484346541.0, + "step": 18716 + }, + { + "epoch": 2.055457939819899, + "grad_norm": 2.036778688430786, + "learning_rate": 5e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7691996693611145, + "num_tokens": 484373675.0, + "step": 18717 + }, + { + "epoch": 2.0555677575225126, + "grad_norm": 2.1419677734375, + "learning_rate": 5e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7616592645645142, + "num_tokens": 484398791.0, + "step": 18718 + }, + { + "epoch": 2.0556775752251264, + "grad_norm": 2.187058448791504, + "learning_rate": 5e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.7966589331626892, + "num_tokens": 484420102.0, + "step": 18719 + }, + { + "epoch": 2.05578739292774, + "grad_norm": 2.0680463314056396, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7664685249328613, + "num_tokens": 484448056.0, + "step": 18720 + }, + { + "epoch": 2.0558972106303535, + "grad_norm": 2.013744592666626, + "learning_rate": 5e-06, + "loss": 0.6364, + "mean_token_accuracy": 0.7892454862594604, + "num_tokens": 484473876.0, + "step": 18721 + }, + { + "epoch": 2.0560070283329672, + "grad_norm": 2.094533920288086, + "learning_rate": 5e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7625765204429626, + "num_tokens": 484499493.0, + "step": 18722 + }, + { + "epoch": 2.056116846035581, + "grad_norm": 2.5917558670043945, + "learning_rate": 5e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.777464747428894, + "num_tokens": 484518902.0, + "step": 18723 + }, + { + "epoch": 2.0562266637381947, + "grad_norm": 2.0749685764312744, + "learning_rate": 5e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.7771586179733276, + "num_tokens": 484543060.0, + "step": 18724 + }, + { + "epoch": 2.056336481440808, + "grad_norm": 1.9698312282562256, + "learning_rate": 5e-06, + "loss": 0.711, + "mean_token_accuracy": 0.7664921283721924, + "num_tokens": 484573819.0, + "step": 18725 + }, + { + "epoch": 2.056446299143422, + "grad_norm": 2.577981948852539, + "learning_rate": 5e-06, + "loss": 0.6842, + "mean_token_accuracy": 0.7745989561080933, + "num_tokens": 484594041.0, + "step": 18726 + }, + { + "epoch": 2.0565561168460356, + "grad_norm": 2.0723304748535156, + "learning_rate": 5e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7616100907325745, + "num_tokens": 484622072.0, + "step": 18727 + }, + { + "epoch": 2.0566659345486493, + "grad_norm": 2.30845046043396, + "learning_rate": 5e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7659397125244141, + "num_tokens": 484645653.0, + "step": 18728 + }, + { + "epoch": 2.056775752251263, + "grad_norm": 2.3664355278015137, + "learning_rate": 5e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7613287568092346, + "num_tokens": 484669720.0, + "step": 18729 + }, + { + "epoch": 2.0568855699538764, + "grad_norm": 2.026118755340576, + "learning_rate": 5e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.7835899591445923, + "num_tokens": 484698147.0, + "step": 18730 + }, + { + "epoch": 2.05699538765649, + "grad_norm": 1.9475373029708862, + "learning_rate": 5e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7666442394256592, + "num_tokens": 484728578.0, + "step": 18731 + }, + { + "epoch": 2.057105205359104, + "grad_norm": 2.045414686203003, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7834073305130005, + "num_tokens": 484755582.0, + "step": 18732 + }, + { + "epoch": 2.0572150230617177, + "grad_norm": 2.001448154449463, + "learning_rate": 5e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7640305757522583, + "num_tokens": 484782791.0, + "step": 18733 + }, + { + "epoch": 2.0573248407643314, + "grad_norm": 2.0646839141845703, + "learning_rate": 5e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.7867507338523865, + "num_tokens": 484808058.0, + "step": 18734 + }, + { + "epoch": 2.0574346584669447, + "grad_norm": 2.1174752712249756, + "learning_rate": 5e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7661755084991455, + "num_tokens": 484834009.0, + "step": 18735 + }, + { + "epoch": 2.0575444761695585, + "grad_norm": 1.8937785625457764, + "learning_rate": 5e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7539486885070801, + "num_tokens": 484867755.0, + "step": 18736 + }, + { + "epoch": 2.0576542938721722, + "grad_norm": 2.032711982727051, + "learning_rate": 5e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7568365931510925, + "num_tokens": 484895729.0, + "step": 18737 + }, + { + "epoch": 2.057764111574786, + "grad_norm": 2.08389949798584, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7510488629341125, + "num_tokens": 484927204.0, + "step": 18738 + }, + { + "epoch": 2.0578739292773993, + "grad_norm": 2.172966241836548, + "learning_rate": 5e-06, + "loss": 0.6569, + "mean_token_accuracy": 0.7822173833847046, + "num_tokens": 484950751.0, + "step": 18739 + }, + { + "epoch": 2.057983746980013, + "grad_norm": 1.7887173891067505, + "learning_rate": 5e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7665959596633911, + "num_tokens": 484982805.0, + "step": 18740 + }, + { + "epoch": 2.058093564682627, + "grad_norm": 2.3834612369537354, + "learning_rate": 5e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.7764987945556641, + "num_tokens": 485003808.0, + "step": 18741 + }, + { + "epoch": 2.0582033823852406, + "grad_norm": 2.4313743114471436, + "learning_rate": 5e-06, + "loss": 0.6508, + "mean_token_accuracy": 0.7873191237449646, + "num_tokens": 485023836.0, + "step": 18742 + }, + { + "epoch": 2.0583132000878543, + "grad_norm": 2.044233560562134, + "learning_rate": 5e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7526877522468567, + "num_tokens": 485051633.0, + "step": 18743 + }, + { + "epoch": 2.0584230177904677, + "grad_norm": 2.1855108737945557, + "learning_rate": 5e-06, + "loss": 0.6866, + "mean_token_accuracy": 0.7745018005371094, + "num_tokens": 485075338.0, + "step": 18744 + }, + { + "epoch": 2.0585328354930814, + "grad_norm": 2.531190872192383, + "learning_rate": 5e-06, + "loss": 0.6349, + "mean_token_accuracy": 0.789315402507782, + "num_tokens": 485095204.0, + "step": 18745 + }, + { + "epoch": 2.058642653195695, + "grad_norm": 2.070019006729126, + "learning_rate": 5e-06, + "loss": 0.6408, + "mean_token_accuracy": 0.7870898246765137, + "num_tokens": 485119057.0, + "step": 18746 + }, + { + "epoch": 2.058752470898309, + "grad_norm": 2.107707977294922, + "learning_rate": 5e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.7923974394798279, + "num_tokens": 485144215.0, + "step": 18747 + }, + { + "epoch": 2.0588622886009227, + "grad_norm": 1.8167543411254883, + "learning_rate": 5e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.7794983386993408, + "num_tokens": 485172984.0, + "step": 18748 + }, + { + "epoch": 2.058972106303536, + "grad_norm": 2.3011534214019775, + "learning_rate": 5e-06, + "loss": 0.6185, + "mean_token_accuracy": 0.7973844408988953, + "num_tokens": 485193483.0, + "step": 18749 + }, + { + "epoch": 2.0590819240061498, + "grad_norm": 2.201833724975586, + "learning_rate": 5e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.7838860750198364, + "num_tokens": 485218348.0, + "step": 18750 + }, + { + "epoch": 2.0591917417087635, + "grad_norm": 2.2809317111968994, + "learning_rate": 5e-06, + "loss": 0.6324, + "mean_token_accuracy": 0.7895331978797913, + "num_tokens": 485240514.0, + "step": 18751 + }, + { + "epoch": 2.0593015594113773, + "grad_norm": 1.9622714519500732, + "learning_rate": 5e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7722587585449219, + "num_tokens": 485267751.0, + "step": 18752 + }, + { + "epoch": 2.0594113771139906, + "grad_norm": 2.1353161334991455, + "learning_rate": 5e-06, + "loss": 0.6241, + "mean_token_accuracy": 0.7909363508224487, + "num_tokens": 485293338.0, + "step": 18753 + }, + { + "epoch": 2.0595211948166043, + "grad_norm": 1.8296525478363037, + "learning_rate": 5e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.7902385592460632, + "num_tokens": 485320888.0, + "step": 18754 + }, + { + "epoch": 2.059631012519218, + "grad_norm": 1.9638031721115112, + "learning_rate": 5e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7460349798202515, + "num_tokens": 485349430.0, + "step": 18755 + }, + { + "epoch": 2.059740830221832, + "grad_norm": 2.220243453979492, + "learning_rate": 5e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7685836553573608, + "num_tokens": 485372497.0, + "step": 18756 + }, + { + "epoch": 2.0598506479244456, + "grad_norm": 2.360529661178589, + "learning_rate": 5e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.7924814820289612, + "num_tokens": 485393658.0, + "step": 18757 + }, + { + "epoch": 2.059960465627059, + "grad_norm": 1.8079378604888916, + "learning_rate": 5e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7597942352294922, + "num_tokens": 485425679.0, + "step": 18758 + }, + { + "epoch": 2.0600702833296727, + "grad_norm": 2.0717105865478516, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7533572316169739, + "num_tokens": 485455878.0, + "step": 18759 + }, + { + "epoch": 2.0601801010322864, + "grad_norm": 2.0476434230804443, + "learning_rate": 5e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7745134830474854, + "num_tokens": 485484375.0, + "step": 18760 + }, + { + "epoch": 2.0602899187349, + "grad_norm": 2.1880455017089844, + "learning_rate": 5e-06, + "loss": 0.676, + "mean_token_accuracy": 0.7768645882606506, + "num_tokens": 485511178.0, + "step": 18761 + }, + { + "epoch": 2.060399736437514, + "grad_norm": 1.843396544456482, + "learning_rate": 5e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7640905976295471, + "num_tokens": 485543764.0, + "step": 18762 + }, + { + "epoch": 2.0605095541401273, + "grad_norm": 2.043095588684082, + "learning_rate": 5e-06, + "loss": 0.672, + "mean_token_accuracy": 0.7813636064529419, + "num_tokens": 485569640.0, + "step": 18763 + }, + { + "epoch": 2.060619371842741, + "grad_norm": 1.7003856897354126, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7672735452651978, + "num_tokens": 485605680.0, + "step": 18764 + }, + { + "epoch": 2.060729189545355, + "grad_norm": 2.3432459831237793, + "learning_rate": 5e-06, + "loss": 0.6693, + "mean_token_accuracy": 0.777080774307251, + "num_tokens": 485627862.0, + "step": 18765 + }, + { + "epoch": 2.0608390072479685, + "grad_norm": 2.026265859603882, + "learning_rate": 5e-06, + "loss": 0.6575, + "mean_token_accuracy": 0.7848454117774963, + "num_tokens": 485653814.0, + "step": 18766 + }, + { + "epoch": 2.060948824950582, + "grad_norm": 1.8518961668014526, + "learning_rate": 5e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7713716626167297, + "num_tokens": 485684837.0, + "step": 18767 + }, + { + "epoch": 2.0610586426531956, + "grad_norm": 2.2191221714019775, + "learning_rate": 5e-06, + "loss": 0.6575, + "mean_token_accuracy": 0.7891467809677124, + "num_tokens": 485708729.0, + "step": 18768 + }, + { + "epoch": 2.0611684603558094, + "grad_norm": 1.9299263954162598, + "learning_rate": 5e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7772924900054932, + "num_tokens": 485732807.0, + "step": 18769 + }, + { + "epoch": 2.061278278058423, + "grad_norm": 2.077345371246338, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7685348987579346, + "num_tokens": 485759144.0, + "step": 18770 + }, + { + "epoch": 2.061388095761037, + "grad_norm": 1.8894948959350586, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7600066065788269, + "num_tokens": 485789826.0, + "step": 18771 + }, + { + "epoch": 2.06149791346365, + "grad_norm": 1.9150232076644897, + "learning_rate": 5e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7555467486381531, + "num_tokens": 485821209.0, + "step": 18772 + }, + { + "epoch": 2.061607731166264, + "grad_norm": 2.0647358894348145, + "learning_rate": 5e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7655534744262695, + "num_tokens": 485848771.0, + "step": 18773 + }, + { + "epoch": 2.0617175488688777, + "grad_norm": 2.1778972148895264, + "learning_rate": 5e-06, + "loss": 0.6466, + "mean_token_accuracy": 0.7782608270645142, + "num_tokens": 485871698.0, + "step": 18774 + }, + { + "epoch": 2.0618273665714915, + "grad_norm": 2.1166775226593018, + "learning_rate": 5e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7745070457458496, + "num_tokens": 485897336.0, + "step": 18775 + }, + { + "epoch": 2.0619371842741048, + "grad_norm": 2.0550243854522705, + "learning_rate": 5e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7748470902442932, + "num_tokens": 485924470.0, + "step": 18776 + }, + { + "epoch": 2.0620470019767185, + "grad_norm": 2.144388437271118, + "learning_rate": 5e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7583217024803162, + "num_tokens": 485946998.0, + "step": 18777 + }, + { + "epoch": 2.0621568196793323, + "grad_norm": 2.045916795730591, + "learning_rate": 5e-06, + "loss": 0.668, + "mean_token_accuracy": 0.7722460031509399, + "num_tokens": 485973391.0, + "step": 18778 + }, + { + "epoch": 2.062266637381946, + "grad_norm": 1.9889636039733887, + "learning_rate": 5e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.743653416633606, + "num_tokens": 486001694.0, + "step": 18779 + }, + { + "epoch": 2.06237645508456, + "grad_norm": 2.0749685764312744, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7613841891288757, + "num_tokens": 486029939.0, + "step": 18780 + }, + { + "epoch": 2.062486272787173, + "grad_norm": 2.019233226776123, + "learning_rate": 5e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.8037430047988892, + "num_tokens": 486054198.0, + "step": 18781 + }, + { + "epoch": 2.062596090489787, + "grad_norm": 2.1252217292785645, + "learning_rate": 5e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7590745687484741, + "num_tokens": 486079843.0, + "step": 18782 + }, + { + "epoch": 2.0627059081924006, + "grad_norm": 2.373434066772461, + "learning_rate": 5e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.7732836008071899, + "num_tokens": 486099678.0, + "step": 18783 + }, + { + "epoch": 2.0628157258950144, + "grad_norm": 1.912529706954956, + "learning_rate": 5e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7635040283203125, + "num_tokens": 486127840.0, + "step": 18784 + }, + { + "epoch": 2.062925543597628, + "grad_norm": 1.933173656463623, + "learning_rate": 5e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7625576257705688, + "num_tokens": 486155117.0, + "step": 18785 + }, + { + "epoch": 2.0630353613002415, + "grad_norm": 2.0378308296203613, + "learning_rate": 5e-06, + "loss": 0.6946, + "mean_token_accuracy": 0.7684156894683838, + "num_tokens": 486183018.0, + "step": 18786 + }, + { + "epoch": 2.063145179002855, + "grad_norm": 1.9187309741973877, + "learning_rate": 5e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.7679418325424194, + "num_tokens": 486211770.0, + "step": 18787 + }, + { + "epoch": 2.063254996705469, + "grad_norm": 2.3398361206054688, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7781943678855896, + "num_tokens": 486232829.0, + "step": 18788 + }, + { + "epoch": 2.0633648144080827, + "grad_norm": 2.029494047164917, + "learning_rate": 5e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7843846678733826, + "num_tokens": 486258660.0, + "step": 18789 + }, + { + "epoch": 2.063474632110696, + "grad_norm": 2.2382895946502686, + "learning_rate": 5e-06, + "loss": 0.6504, + "mean_token_accuracy": 0.7917962670326233, + "num_tokens": 486279931.0, + "step": 18790 + }, + { + "epoch": 2.06358444981331, + "grad_norm": 1.9512743949890137, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7519772052764893, + "num_tokens": 486308066.0, + "step": 18791 + }, + { + "epoch": 2.0636942675159236, + "grad_norm": 2.3494691848754883, + "learning_rate": 5e-06, + "loss": 0.5921, + "mean_token_accuracy": 0.7987978458404541, + "num_tokens": 486327812.0, + "step": 18792 + }, + { + "epoch": 2.0638040852185373, + "grad_norm": 2.302008628845215, + "learning_rate": 5e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.7786921858787537, + "num_tokens": 486351176.0, + "step": 18793 + }, + { + "epoch": 2.063913902921151, + "grad_norm": 2.138489007949829, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.750636100769043, + "num_tokens": 486378195.0, + "step": 18794 + }, + { + "epoch": 2.0640237206237644, + "grad_norm": 2.085413694381714, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7693726420402527, + "num_tokens": 486403518.0, + "step": 18795 + }, + { + "epoch": 2.064133538326378, + "grad_norm": 1.9450446367263794, + "learning_rate": 5e-06, + "loss": 0.6529, + "mean_token_accuracy": 0.7822155952453613, + "num_tokens": 486428676.0, + "step": 18796 + }, + { + "epoch": 2.064243356028992, + "grad_norm": 1.9970632791519165, + "learning_rate": 5e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.7694886326789856, + "num_tokens": 486455266.0, + "step": 18797 + }, + { + "epoch": 2.0643531737316057, + "grad_norm": 2.0787768363952637, + "learning_rate": 5e-06, + "loss": 0.678, + "mean_token_accuracy": 0.7762870788574219, + "num_tokens": 486481996.0, + "step": 18798 + }, + { + "epoch": 2.0644629914342194, + "grad_norm": 1.8932909965515137, + "learning_rate": 5e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7628259658813477, + "num_tokens": 486510592.0, + "step": 18799 + }, + { + "epoch": 2.0645728091368327, + "grad_norm": 1.8203473091125488, + "learning_rate": 5e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7720767259597778, + "num_tokens": 486542017.0, + "step": 18800 + }, + { + "epoch": 2.0646826268394465, + "grad_norm": 2.3565073013305664, + "learning_rate": 5e-06, + "loss": 0.6574, + "mean_token_accuracy": 0.7778453826904297, + "num_tokens": 486561849.0, + "step": 18801 + }, + { + "epoch": 2.0647924445420602, + "grad_norm": 2.1982076168060303, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7751303911209106, + "num_tokens": 486588233.0, + "step": 18802 + }, + { + "epoch": 2.064902262244674, + "grad_norm": 2.1976993083953857, + "learning_rate": 5e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7634198665618896, + "num_tokens": 486612261.0, + "step": 18803 + }, + { + "epoch": 2.0650120799472873, + "grad_norm": 1.8869777917861938, + "learning_rate": 5e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7569244503974915, + "num_tokens": 486644103.0, + "step": 18804 + }, + { + "epoch": 2.065121897649901, + "grad_norm": 2.095566511154175, + "learning_rate": 5e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.788169264793396, + "num_tokens": 486669140.0, + "step": 18805 + }, + { + "epoch": 2.065231715352515, + "grad_norm": 1.9048577547073364, + "learning_rate": 5e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7591167092323303, + "num_tokens": 486698878.0, + "step": 18806 + }, + { + "epoch": 2.0653415330551286, + "grad_norm": 2.3408758640289307, + "learning_rate": 5e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.7809264659881592, + "num_tokens": 486719549.0, + "step": 18807 + }, + { + "epoch": 2.0654513507577423, + "grad_norm": 1.9507163763046265, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7645617723464966, + "num_tokens": 486750793.0, + "step": 18808 + }, + { + "epoch": 2.0655611684603556, + "grad_norm": 1.9311847686767578, + "learning_rate": 5e-06, + "loss": 0.715, + "mean_token_accuracy": 0.757952868938446, + "num_tokens": 486779118.0, + "step": 18809 + }, + { + "epoch": 2.0656709861629694, + "grad_norm": 1.9532219171524048, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.763462483882904, + "num_tokens": 486811452.0, + "step": 18810 + }, + { + "epoch": 2.065780803865583, + "grad_norm": 2.0667638778686523, + "learning_rate": 5e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7786446809768677, + "num_tokens": 486838153.0, + "step": 18811 + }, + { + "epoch": 2.065890621568197, + "grad_norm": 2.1712324619293213, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7752997279167175, + "num_tokens": 486861893.0, + "step": 18812 + }, + { + "epoch": 2.0660004392708107, + "grad_norm": 1.9203283786773682, + "learning_rate": 5e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7474613189697266, + "num_tokens": 486894488.0, + "step": 18813 + }, + { + "epoch": 2.066110256973424, + "grad_norm": 2.5811355113983154, + "learning_rate": 5e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.7885948419570923, + "num_tokens": 486913041.0, + "step": 18814 + }, + { + "epoch": 2.0662200746760377, + "grad_norm": 1.9445810317993164, + "learning_rate": 5e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.778084933757782, + "num_tokens": 486941262.0, + "step": 18815 + }, + { + "epoch": 2.0663298923786515, + "grad_norm": 2.063556432723999, + "learning_rate": 5e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7637974619865417, + "num_tokens": 486968134.0, + "step": 18816 + }, + { + "epoch": 2.0664397100812653, + "grad_norm": 2.2665317058563232, + "learning_rate": 5e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.7794110178947449, + "num_tokens": 486991436.0, + "step": 18817 + }, + { + "epoch": 2.0665495277838786, + "grad_norm": 1.9263205528259277, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7773932218551636, + "num_tokens": 487020653.0, + "step": 18818 + }, + { + "epoch": 2.0666593454864923, + "grad_norm": 2.1339962482452393, + "learning_rate": 5e-06, + "loss": 0.6639, + "mean_token_accuracy": 0.7858849763870239, + "num_tokens": 487044284.0, + "step": 18819 + }, + { + "epoch": 2.066769163189106, + "grad_norm": 1.9677376747131348, + "learning_rate": 5e-06, + "loss": 0.6628, + "mean_token_accuracy": 0.789137065410614, + "num_tokens": 487068130.0, + "step": 18820 + }, + { + "epoch": 2.06687898089172, + "grad_norm": 2.0255544185638428, + "learning_rate": 5e-06, + "loss": 0.6711, + "mean_token_accuracy": 0.7764148712158203, + "num_tokens": 487094697.0, + "step": 18821 + }, + { + "epoch": 2.0669887985943336, + "grad_norm": 2.11122465133667, + "learning_rate": 5e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7736700773239136, + "num_tokens": 487119070.0, + "step": 18822 + }, + { + "epoch": 2.067098616296947, + "grad_norm": 2.1366379261016846, + "learning_rate": 5e-06, + "loss": 0.6354, + "mean_token_accuracy": 0.7835662364959717, + "num_tokens": 487143461.0, + "step": 18823 + }, + { + "epoch": 2.0672084339995607, + "grad_norm": 2.050342082977295, + "learning_rate": 5e-06, + "loss": 0.588, + "mean_token_accuracy": 0.8044522404670715, + "num_tokens": 487166384.0, + "step": 18824 + }, + { + "epoch": 2.0673182517021744, + "grad_norm": 2.405447483062744, + "learning_rate": 5e-06, + "loss": 0.6366, + "mean_token_accuracy": 0.7831581830978394, + "num_tokens": 487187906.0, + "step": 18825 + }, + { + "epoch": 2.067428069404788, + "grad_norm": 1.8035626411437988, + "learning_rate": 5e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7377293109893799, + "num_tokens": 487220469.0, + "step": 18826 + }, + { + "epoch": 2.0675378871074015, + "grad_norm": 1.9554774761199951, + "learning_rate": 5e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7458484172821045, + "num_tokens": 487249217.0, + "step": 18827 + }, + { + "epoch": 2.0676477048100153, + "grad_norm": 2.053093671798706, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7645246982574463, + "num_tokens": 487277058.0, + "step": 18828 + }, + { + "epoch": 2.067757522512629, + "grad_norm": 2.040907859802246, + "learning_rate": 5e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.7777892351150513, + "num_tokens": 487301538.0, + "step": 18829 + }, + { + "epoch": 2.0678673402152428, + "grad_norm": 1.9850337505340576, + "learning_rate": 5e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7703200578689575, + "num_tokens": 487326686.0, + "step": 18830 + }, + { + "epoch": 2.0679771579178565, + "grad_norm": 2.1273794174194336, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.7723541259765625, + "num_tokens": 487351703.0, + "step": 18831 + }, + { + "epoch": 2.06808697562047, + "grad_norm": 2.2770869731903076, + "learning_rate": 5e-06, + "loss": 0.6529, + "mean_token_accuracy": 0.789686381816864, + "num_tokens": 487371739.0, + "step": 18832 + }, + { + "epoch": 2.0681967933230836, + "grad_norm": 2.0377180576324463, + "learning_rate": 5e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7732374668121338, + "num_tokens": 487401069.0, + "step": 18833 + }, + { + "epoch": 2.0683066110256974, + "grad_norm": 2.193094253540039, + "learning_rate": 5e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7700073719024658, + "num_tokens": 487425683.0, + "step": 18834 + }, + { + "epoch": 2.068416428728311, + "grad_norm": 2.1068155765533447, + "learning_rate": 5e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.7558244466781616, + "num_tokens": 487450553.0, + "step": 18835 + }, + { + "epoch": 2.068526246430925, + "grad_norm": 2.0241315364837646, + "learning_rate": 5e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7552225589752197, + "num_tokens": 487477439.0, + "step": 18836 + }, + { + "epoch": 2.068636064133538, + "grad_norm": 1.9917181730270386, + "learning_rate": 5e-06, + "loss": 0.6822, + "mean_token_accuracy": 0.7744033336639404, + "num_tokens": 487506908.0, + "step": 18837 + }, + { + "epoch": 2.068745881836152, + "grad_norm": 2.549173593521118, + "learning_rate": 5e-06, + "loss": 0.6826, + "mean_token_accuracy": 0.7823857069015503, + "num_tokens": 487528226.0, + "step": 18838 + }, + { + "epoch": 2.0688556995387657, + "grad_norm": 2.089679479598999, + "learning_rate": 5e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.7597533464431763, + "num_tokens": 487554777.0, + "step": 18839 + }, + { + "epoch": 2.0689655172413794, + "grad_norm": 2.031744956970215, + "learning_rate": 5e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.7675911784172058, + "num_tokens": 487582373.0, + "step": 18840 + }, + { + "epoch": 2.0690753349439928, + "grad_norm": 2.1659018993377686, + "learning_rate": 5e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.7688637971878052, + "num_tokens": 487605791.0, + "step": 18841 + }, + { + "epoch": 2.0691851526466065, + "grad_norm": 2.0334279537200928, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7821624279022217, + "num_tokens": 487631712.0, + "step": 18842 + }, + { + "epoch": 2.0692949703492203, + "grad_norm": 1.8582428693771362, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7677499651908875, + "num_tokens": 487659745.0, + "step": 18843 + }, + { + "epoch": 2.069404788051834, + "grad_norm": 2.2101826667785645, + "learning_rate": 5e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.7989347577095032, + "num_tokens": 487682011.0, + "step": 18844 + }, + { + "epoch": 2.069514605754448, + "grad_norm": 2.168109178543091, + "learning_rate": 5e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7694685459136963, + "num_tokens": 487707148.0, + "step": 18845 + }, + { + "epoch": 2.069624423457061, + "grad_norm": 2.2102856636047363, + "learning_rate": 5e-06, + "loss": 0.641, + "mean_token_accuracy": 0.7853696346282959, + "num_tokens": 487731150.0, + "step": 18846 + }, + { + "epoch": 2.069734241159675, + "grad_norm": 1.8990062475204468, + "learning_rate": 5e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7710117101669312, + "num_tokens": 487766564.0, + "step": 18847 + }, + { + "epoch": 2.0698440588622886, + "grad_norm": 1.912307858467102, + "learning_rate": 5e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7756750583648682, + "num_tokens": 487798681.0, + "step": 18848 + }, + { + "epoch": 2.0699538765649024, + "grad_norm": 2.052069664001465, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7736223340034485, + "num_tokens": 487827814.0, + "step": 18849 + }, + { + "epoch": 2.070063694267516, + "grad_norm": 1.9497545957565308, + "learning_rate": 5e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7594354152679443, + "num_tokens": 487854655.0, + "step": 18850 + }, + { + "epoch": 2.0701735119701294, + "grad_norm": 1.895643949508667, + "learning_rate": 5e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7551229000091553, + "num_tokens": 487885924.0, + "step": 18851 + }, + { + "epoch": 2.070283329672743, + "grad_norm": 2.167781352996826, + "learning_rate": 5e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.7971446514129639, + "num_tokens": 487907387.0, + "step": 18852 + }, + { + "epoch": 2.070393147375357, + "grad_norm": 1.9320327043533325, + "learning_rate": 5e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7580252885818481, + "num_tokens": 487938665.0, + "step": 18853 + }, + { + "epoch": 2.0705029650779707, + "grad_norm": 2.416569948196411, + "learning_rate": 5e-06, + "loss": 0.6564, + "mean_token_accuracy": 0.7786370515823364, + "num_tokens": 487959008.0, + "step": 18854 + }, + { + "epoch": 2.070612782780584, + "grad_norm": 2.5576109886169434, + "learning_rate": 5e-06, + "loss": 0.639, + "mean_token_accuracy": 0.78378826379776, + "num_tokens": 487976650.0, + "step": 18855 + }, + { + "epoch": 2.070722600483198, + "grad_norm": 1.998745083808899, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7765498757362366, + "num_tokens": 488003784.0, + "step": 18856 + }, + { + "epoch": 2.0708324181858115, + "grad_norm": 2.208249807357788, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7697696685791016, + "num_tokens": 488027476.0, + "step": 18857 + }, + { + "epoch": 2.0709422358884253, + "grad_norm": 2.0218865871429443, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7805455923080444, + "num_tokens": 488052194.0, + "step": 18858 + }, + { + "epoch": 2.071052053591039, + "grad_norm": 2.2000856399536133, + "learning_rate": 5e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7710890173912048, + "num_tokens": 488075098.0, + "step": 18859 + }, + { + "epoch": 2.0711618712936524, + "grad_norm": 1.9447587728500366, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7778743505477905, + "num_tokens": 488103011.0, + "step": 18860 + }, + { + "epoch": 2.071271688996266, + "grad_norm": 2.3975064754486084, + "learning_rate": 5e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7764818072319031, + "num_tokens": 488122749.0, + "step": 18861 + }, + { + "epoch": 2.07138150669888, + "grad_norm": 2.156388282775879, + "learning_rate": 5e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7750757336616516, + "num_tokens": 488149282.0, + "step": 18862 + }, + { + "epoch": 2.0714913244014936, + "grad_norm": 2.3600263595581055, + "learning_rate": 5e-06, + "loss": 0.6756, + "mean_token_accuracy": 0.77960205078125, + "num_tokens": 488171009.0, + "step": 18863 + }, + { + "epoch": 2.0716011421041074, + "grad_norm": 2.001746416091919, + "learning_rate": 5e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7491979598999023, + "num_tokens": 488197614.0, + "step": 18864 + }, + { + "epoch": 2.0717109598067207, + "grad_norm": 2.1209487915039062, + "learning_rate": 5e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.7823959589004517, + "num_tokens": 488220416.0, + "step": 18865 + }, + { + "epoch": 2.0718207775093345, + "grad_norm": 2.078784942626953, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7413918972015381, + "num_tokens": 488249255.0, + "step": 18866 + }, + { + "epoch": 2.0719305952119482, + "grad_norm": 2.1006593704223633, + "learning_rate": 5e-06, + "loss": 0.6905, + "mean_token_accuracy": 0.7740823030471802, + "num_tokens": 488274930.0, + "step": 18867 + }, + { + "epoch": 2.072040412914562, + "grad_norm": 2.3306968212127686, + "learning_rate": 5e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.7734931707382202, + "num_tokens": 488295939.0, + "step": 18868 + }, + { + "epoch": 2.0721502306171753, + "grad_norm": 2.0948357582092285, + "learning_rate": 5e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.7684900760650635, + "num_tokens": 488322738.0, + "step": 18869 + }, + { + "epoch": 2.072260048319789, + "grad_norm": 2.004621982574463, + "learning_rate": 5e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7607508897781372, + "num_tokens": 488352086.0, + "step": 18870 + }, + { + "epoch": 2.072369866022403, + "grad_norm": 2.207953453063965, + "learning_rate": 5e-06, + "loss": 0.6358, + "mean_token_accuracy": 0.7900421619415283, + "num_tokens": 488373728.0, + "step": 18871 + }, + { + "epoch": 2.0724796837250166, + "grad_norm": 2.120851755142212, + "learning_rate": 5e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7563909292221069, + "num_tokens": 488399780.0, + "step": 18872 + }, + { + "epoch": 2.0725895014276303, + "grad_norm": 1.8966013193130493, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7618913650512695, + "num_tokens": 488428756.0, + "step": 18873 + }, + { + "epoch": 2.0726993191302436, + "grad_norm": 2.129636764526367, + "learning_rate": 5e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7689058780670166, + "num_tokens": 488453955.0, + "step": 18874 + }, + { + "epoch": 2.0728091368328574, + "grad_norm": 2.4285123348236084, + "learning_rate": 5e-06, + "loss": 0.6627, + "mean_token_accuracy": 0.7754117846488953, + "num_tokens": 488474527.0, + "step": 18875 + }, + { + "epoch": 2.072918954535471, + "grad_norm": 2.2813940048217773, + "learning_rate": 5e-06, + "loss": 0.6298, + "mean_token_accuracy": 0.7894232273101807, + "num_tokens": 488494771.0, + "step": 18876 + }, + { + "epoch": 2.073028772238085, + "grad_norm": 2.1603355407714844, + "learning_rate": 5e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7593770027160645, + "num_tokens": 488521944.0, + "step": 18877 + }, + { + "epoch": 2.073138589940698, + "grad_norm": 2.4815967082977295, + "learning_rate": 5e-06, + "loss": 0.6443, + "mean_token_accuracy": 0.7915741205215454, + "num_tokens": 488540910.0, + "step": 18878 + }, + { + "epoch": 2.073248407643312, + "grad_norm": 2.014955997467041, + "learning_rate": 5e-06, + "loss": 0.5872, + "mean_token_accuracy": 0.7972791790962219, + "num_tokens": 488564909.0, + "step": 18879 + }, + { + "epoch": 2.0733582253459257, + "grad_norm": 2.0849013328552246, + "learning_rate": 5e-06, + "loss": 0.6295, + "mean_token_accuracy": 0.8001276850700378, + "num_tokens": 488589347.0, + "step": 18880 + }, + { + "epoch": 2.0734680430485395, + "grad_norm": 2.0595595836639404, + "learning_rate": 5e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7525941729545593, + "num_tokens": 488616690.0, + "step": 18881 + }, + { + "epoch": 2.0735778607511532, + "grad_norm": 2.3671538829803467, + "learning_rate": 5e-06, + "loss": 0.661, + "mean_token_accuracy": 0.7815272212028503, + "num_tokens": 488638147.0, + "step": 18882 + }, + { + "epoch": 2.0736876784537666, + "grad_norm": 1.9546364545822144, + "learning_rate": 5e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7645750045776367, + "num_tokens": 488668104.0, + "step": 18883 + }, + { + "epoch": 2.0737974961563803, + "grad_norm": 2.2420365810394287, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7814943790435791, + "num_tokens": 488690230.0, + "step": 18884 + }, + { + "epoch": 2.073907313858994, + "grad_norm": 2.304043769836426, + "learning_rate": 5e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.766367495059967, + "num_tokens": 488714412.0, + "step": 18885 + }, + { + "epoch": 2.074017131561608, + "grad_norm": 1.9601213932037354, + "learning_rate": 5e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7525568604469299, + "num_tokens": 488744004.0, + "step": 18886 + }, + { + "epoch": 2.0741269492642216, + "grad_norm": 1.9489392042160034, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7604171633720398, + "num_tokens": 488771821.0, + "step": 18887 + }, + { + "epoch": 2.074236766966835, + "grad_norm": 2.1386237144470215, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7748286128044128, + "num_tokens": 488795412.0, + "step": 18888 + }, + { + "epoch": 2.0743465846694487, + "grad_norm": 1.9907060861587524, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7754062414169312, + "num_tokens": 488824941.0, + "step": 18889 + }, + { + "epoch": 2.0744564023720624, + "grad_norm": 2.1731629371643066, + "learning_rate": 5e-06, + "loss": 0.6236, + "mean_token_accuracy": 0.7901383638381958, + "num_tokens": 488847971.0, + "step": 18890 + }, + { + "epoch": 2.074566220074676, + "grad_norm": 2.1730093955993652, + "learning_rate": 5e-06, + "loss": 0.6407, + "mean_token_accuracy": 0.7877130508422852, + "num_tokens": 488870283.0, + "step": 18891 + }, + { + "epoch": 2.07467603777729, + "grad_norm": 1.914528250694275, + "learning_rate": 5e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7689613103866577, + "num_tokens": 488899260.0, + "step": 18892 + }, + { + "epoch": 2.0747858554799032, + "grad_norm": 2.365616798400879, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7607810497283936, + "num_tokens": 488923456.0, + "step": 18893 + }, + { + "epoch": 2.074895673182517, + "grad_norm": 2.0438690185546875, + "learning_rate": 5e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7715959548950195, + "num_tokens": 488948175.0, + "step": 18894 + }, + { + "epoch": 2.0750054908851308, + "grad_norm": 2.1986494064331055, + "learning_rate": 5e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.7756842374801636, + "num_tokens": 488969939.0, + "step": 18895 + }, + { + "epoch": 2.0751153085877445, + "grad_norm": 2.0441012382507324, + "learning_rate": 5e-06, + "loss": 0.6366, + "mean_token_accuracy": 0.7888131737709045, + "num_tokens": 488995891.0, + "step": 18896 + }, + { + "epoch": 2.075225126290358, + "grad_norm": 2.0583178997039795, + "learning_rate": 5e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7438023686408997, + "num_tokens": 489023944.0, + "step": 18897 + }, + { + "epoch": 2.0753349439929716, + "grad_norm": 2.1550493240356445, + "learning_rate": 5e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.7931418418884277, + "num_tokens": 489047762.0, + "step": 18898 + }, + { + "epoch": 2.0754447616955853, + "grad_norm": 2.0332789421081543, + "learning_rate": 5e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.7777280807495117, + "num_tokens": 489073091.0, + "step": 18899 + }, + { + "epoch": 2.075554579398199, + "grad_norm": 2.1817703247070312, + "learning_rate": 5e-06, + "loss": 0.6281, + "mean_token_accuracy": 0.7907849550247192, + "num_tokens": 489097033.0, + "step": 18900 + }, + { + "epoch": 2.075664397100813, + "grad_norm": 1.9325600862503052, + "learning_rate": 5e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7527844905853271, + "num_tokens": 489126976.0, + "step": 18901 + }, + { + "epoch": 2.075774214803426, + "grad_norm": 1.9716647863388062, + "learning_rate": 5e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7647383809089661, + "num_tokens": 489154359.0, + "step": 18902 + }, + { + "epoch": 2.07588403250604, + "grad_norm": 1.9116156101226807, + "learning_rate": 5e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7448536157608032, + "num_tokens": 489184966.0, + "step": 18903 + }, + { + "epoch": 2.0759938502086537, + "grad_norm": 2.00203275680542, + "learning_rate": 5e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7513636350631714, + "num_tokens": 489213376.0, + "step": 18904 + }, + { + "epoch": 2.0761036679112674, + "grad_norm": 2.062648296356201, + "learning_rate": 5e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7737636566162109, + "num_tokens": 489240904.0, + "step": 18905 + }, + { + "epoch": 2.0762134856138807, + "grad_norm": 1.9419103860855103, + "learning_rate": 5e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.7627227306365967, + "num_tokens": 489270304.0, + "step": 18906 + }, + { + "epoch": 2.0763233033164945, + "grad_norm": 2.1512694358825684, + "learning_rate": 5e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7743339538574219, + "num_tokens": 489294874.0, + "step": 18907 + }, + { + "epoch": 2.0764331210191083, + "grad_norm": 2.4445929527282715, + "learning_rate": 5e-06, + "loss": 0.5882, + "mean_token_accuracy": 0.8074631094932556, + "num_tokens": 489313603.0, + "step": 18908 + }, + { + "epoch": 2.076542938721722, + "grad_norm": 1.9910309314727783, + "learning_rate": 5e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.74032062292099, + "num_tokens": 489346320.0, + "step": 18909 + }, + { + "epoch": 2.0766527564243358, + "grad_norm": 1.8506629467010498, + "learning_rate": 5e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7786630988121033, + "num_tokens": 489377240.0, + "step": 18910 + }, + { + "epoch": 2.076762574126949, + "grad_norm": 2.0055482387542725, + "learning_rate": 5e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7602746486663818, + "num_tokens": 489404264.0, + "step": 18911 + }, + { + "epoch": 2.076872391829563, + "grad_norm": 1.9372626543045044, + "learning_rate": 5e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7572663426399231, + "num_tokens": 489433723.0, + "step": 18912 + }, + { + "epoch": 2.0769822095321766, + "grad_norm": 1.9895671606063843, + "learning_rate": 5e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.789328396320343, + "num_tokens": 489459981.0, + "step": 18913 + }, + { + "epoch": 2.0770920272347904, + "grad_norm": 2.2189483642578125, + "learning_rate": 5e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7582656741142273, + "num_tokens": 489483515.0, + "step": 18914 + }, + { + "epoch": 2.077201844937404, + "grad_norm": 1.8773607015609741, + "learning_rate": 5e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7625261545181274, + "num_tokens": 489514662.0, + "step": 18915 + }, + { + "epoch": 2.0773116626400174, + "grad_norm": 2.0106892585754395, + "learning_rate": 5e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.7805026769638062, + "num_tokens": 489540364.0, + "step": 18916 + }, + { + "epoch": 2.077421480342631, + "grad_norm": 2.300962209701538, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7754476070404053, + "num_tokens": 489563939.0, + "step": 18917 + }, + { + "epoch": 2.077531298045245, + "grad_norm": 2.188838005065918, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7714880704879761, + "num_tokens": 489587092.0, + "step": 18918 + }, + { + "epoch": 2.0776411157478587, + "grad_norm": 2.1181764602661133, + "learning_rate": 5e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.7542250156402588, + "num_tokens": 489615847.0, + "step": 18919 + }, + { + "epoch": 2.077750933450472, + "grad_norm": 1.9822742938995361, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7725259065628052, + "num_tokens": 489644116.0, + "step": 18920 + }, + { + "epoch": 2.0778607511530858, + "grad_norm": 1.8516656160354614, + "learning_rate": 5e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.7935993075370789, + "num_tokens": 489671966.0, + "step": 18921 + }, + { + "epoch": 2.0779705688556995, + "grad_norm": 2.068413257598877, + "learning_rate": 5e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7701817750930786, + "num_tokens": 489698568.0, + "step": 18922 + }, + { + "epoch": 2.0780803865583133, + "grad_norm": 1.9461685419082642, + "learning_rate": 5e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7672253847122192, + "num_tokens": 489727344.0, + "step": 18923 + }, + { + "epoch": 2.078190204260927, + "grad_norm": 2.197662115097046, + "learning_rate": 5e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.7768656611442566, + "num_tokens": 489752068.0, + "step": 18924 + }, + { + "epoch": 2.0783000219635404, + "grad_norm": 2.4184775352478027, + "learning_rate": 5e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.767006516456604, + "num_tokens": 489774219.0, + "step": 18925 + }, + { + "epoch": 2.078409839666154, + "grad_norm": 2.1980769634246826, + "learning_rate": 5e-06, + "loss": 0.6367, + "mean_token_accuracy": 0.7939524054527283, + "num_tokens": 489795883.0, + "step": 18926 + }, + { + "epoch": 2.078519657368768, + "grad_norm": 1.8918635845184326, + "learning_rate": 5e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.7907141447067261, + "num_tokens": 489825488.0, + "step": 18927 + }, + { + "epoch": 2.0786294750713816, + "grad_norm": 2.3585140705108643, + "learning_rate": 5e-06, + "loss": 0.63, + "mean_token_accuracy": 0.7885571122169495, + "num_tokens": 489845564.0, + "step": 18928 + }, + { + "epoch": 2.078739292773995, + "grad_norm": 2.014138698577881, + "learning_rate": 5e-06, + "loss": 0.6779, + "mean_token_accuracy": 0.7734018564224243, + "num_tokens": 489872465.0, + "step": 18929 + }, + { + "epoch": 2.0788491104766087, + "grad_norm": 1.9990590810775757, + "learning_rate": 5e-06, + "loss": 0.6661, + "mean_token_accuracy": 0.7804941534996033, + "num_tokens": 489900561.0, + "step": 18930 + }, + { + "epoch": 2.0789589281792225, + "grad_norm": 2.0751986503601074, + "learning_rate": 5e-06, + "loss": 0.6161, + "mean_token_accuracy": 0.790895938873291, + "num_tokens": 489923490.0, + "step": 18931 + }, + { + "epoch": 2.079068745881836, + "grad_norm": 2.0198562145233154, + "learning_rate": 5e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7647855281829834, + "num_tokens": 489950217.0, + "step": 18932 + }, + { + "epoch": 2.07917856358445, + "grad_norm": 1.9427151679992676, + "learning_rate": 5e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.764764130115509, + "num_tokens": 489979418.0, + "step": 18933 + }, + { + "epoch": 2.0792883812870633, + "grad_norm": 2.03718900680542, + "learning_rate": 5e-06, + "loss": 0.6865, + "mean_token_accuracy": 0.7724568247795105, + "num_tokens": 490005642.0, + "step": 18934 + }, + { + "epoch": 2.079398198989677, + "grad_norm": 2.016514778137207, + "learning_rate": 5e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7658898830413818, + "num_tokens": 490031505.0, + "step": 18935 + }, + { + "epoch": 2.079508016692291, + "grad_norm": 1.9438841342926025, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7543954849243164, + "num_tokens": 490060583.0, + "step": 18936 + }, + { + "epoch": 2.0796178343949046, + "grad_norm": 2.1661264896392822, + "learning_rate": 5e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7735819220542908, + "num_tokens": 490084909.0, + "step": 18937 + }, + { + "epoch": 2.0797276520975183, + "grad_norm": 2.0487005710601807, + "learning_rate": 5e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7667427062988281, + "num_tokens": 490112690.0, + "step": 18938 + }, + { + "epoch": 2.0798374698001316, + "grad_norm": 1.8685855865478516, + "learning_rate": 5e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.7784809470176697, + "num_tokens": 490141537.0, + "step": 18939 + }, + { + "epoch": 2.0799472875027454, + "grad_norm": 2.036638021469116, + "learning_rate": 5e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.7977105379104614, + "num_tokens": 490167867.0, + "step": 18940 + }, + { + "epoch": 2.080057105205359, + "grad_norm": 2.001868963241577, + "learning_rate": 5e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7621693015098572, + "num_tokens": 490194006.0, + "step": 18941 + }, + { + "epoch": 2.080166922907973, + "grad_norm": 2.5001654624938965, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7721633911132812, + "num_tokens": 490212760.0, + "step": 18942 + }, + { + "epoch": 2.0802767406105866, + "grad_norm": 2.0043702125549316, + "learning_rate": 5e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7719926238059998, + "num_tokens": 490240071.0, + "step": 18943 + }, + { + "epoch": 2.0803865583132, + "grad_norm": 1.9447705745697021, + "learning_rate": 5e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.7891594171524048, + "num_tokens": 490267187.0, + "step": 18944 + }, + { + "epoch": 2.0804963760158137, + "grad_norm": 2.0158705711364746, + "learning_rate": 5e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.7787287831306458, + "num_tokens": 490294381.0, + "step": 18945 + }, + { + "epoch": 2.0806061937184275, + "grad_norm": 2.195844888687134, + "learning_rate": 5e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.7893587946891785, + "num_tokens": 490316509.0, + "step": 18946 + }, + { + "epoch": 2.0807160114210412, + "grad_norm": 1.97517991065979, + "learning_rate": 5e-06, + "loss": 0.6426, + "mean_token_accuracy": 0.7844815850257874, + "num_tokens": 490343071.0, + "step": 18947 + }, + { + "epoch": 2.0808258291236545, + "grad_norm": 2.118180513381958, + "learning_rate": 5e-06, + "loss": 0.6464, + "mean_token_accuracy": 0.7853081226348877, + "num_tokens": 490367294.0, + "step": 18948 + }, + { + "epoch": 2.0809356468262683, + "grad_norm": 1.9916762113571167, + "learning_rate": 5e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7604562044143677, + "num_tokens": 490396462.0, + "step": 18949 + }, + { + "epoch": 2.081045464528882, + "grad_norm": 1.9637365341186523, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.760324239730835, + "num_tokens": 490426133.0, + "step": 18950 + }, + { + "epoch": 2.081155282231496, + "grad_norm": 2.180885076522827, + "learning_rate": 5e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7501181960105896, + "num_tokens": 490455206.0, + "step": 18951 + }, + { + "epoch": 2.0812650999341096, + "grad_norm": 2.2951748371124268, + "learning_rate": 5e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7712355256080627, + "num_tokens": 490477151.0, + "step": 18952 + }, + { + "epoch": 2.081374917636723, + "grad_norm": 2.2031757831573486, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7745442390441895, + "num_tokens": 490501222.0, + "step": 18953 + }, + { + "epoch": 2.0814847353393366, + "grad_norm": 2.6661853790283203, + "learning_rate": 5e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.7847635746002197, + "num_tokens": 490518557.0, + "step": 18954 + }, + { + "epoch": 2.0815945530419504, + "grad_norm": 1.7828316688537598, + "learning_rate": 5e-06, + "loss": 0.674, + "mean_token_accuracy": 0.775916576385498, + "num_tokens": 490550531.0, + "step": 18955 + }, + { + "epoch": 2.081704370744564, + "grad_norm": 1.9584017992019653, + "learning_rate": 5e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7437717318534851, + "num_tokens": 490582045.0, + "step": 18956 + }, + { + "epoch": 2.0818141884471775, + "grad_norm": 1.9167709350585938, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7597599029541016, + "num_tokens": 490612007.0, + "step": 18957 + }, + { + "epoch": 2.0819240061497912, + "grad_norm": 2.1916043758392334, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7594334483146667, + "num_tokens": 490637314.0, + "step": 18958 + }, + { + "epoch": 2.082033823852405, + "grad_norm": 2.0636093616485596, + "learning_rate": 5e-06, + "loss": 0.6612, + "mean_token_accuracy": 0.7829816341400146, + "num_tokens": 490662183.0, + "step": 18959 + }, + { + "epoch": 2.0821436415550187, + "grad_norm": 1.955911636352539, + "learning_rate": 5e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7786722183227539, + "num_tokens": 490691771.0, + "step": 18960 + }, + { + "epoch": 2.0822534592576325, + "grad_norm": 2.142035484313965, + "learning_rate": 5e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7720198631286621, + "num_tokens": 490714558.0, + "step": 18961 + }, + { + "epoch": 2.082363276960246, + "grad_norm": 1.9724897146224976, + "learning_rate": 5e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7775362133979797, + "num_tokens": 490741848.0, + "step": 18962 + }, + { + "epoch": 2.0824730946628596, + "grad_norm": 2.001319646835327, + "learning_rate": 5e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.78678297996521, + "num_tokens": 490770840.0, + "step": 18963 + }, + { + "epoch": 2.0825829123654733, + "grad_norm": 1.877137541770935, + "learning_rate": 5e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7664787173271179, + "num_tokens": 490802393.0, + "step": 18964 + }, + { + "epoch": 2.082692730068087, + "grad_norm": 1.9051319360733032, + "learning_rate": 5e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7618995904922485, + "num_tokens": 490832073.0, + "step": 18965 + }, + { + "epoch": 2.082802547770701, + "grad_norm": 1.84151291847229, + "learning_rate": 5e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.750582218170166, + "num_tokens": 490860720.0, + "step": 18966 + }, + { + "epoch": 2.082912365473314, + "grad_norm": 2.156022071838379, + "learning_rate": 5e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7670712471008301, + "num_tokens": 490888530.0, + "step": 18967 + }, + { + "epoch": 2.083022183175928, + "grad_norm": 2.375394582748413, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7679015398025513, + "num_tokens": 490909379.0, + "step": 18968 + }, + { + "epoch": 2.0831320008785417, + "grad_norm": 2.4033944606781006, + "learning_rate": 5e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.7979840636253357, + "num_tokens": 490928752.0, + "step": 18969 + }, + { + "epoch": 2.0832418185811554, + "grad_norm": 1.7290620803833008, + "learning_rate": 5e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.765457034111023, + "num_tokens": 490965645.0, + "step": 18970 + }, + { + "epoch": 2.0833516362837687, + "grad_norm": 2.4434947967529297, + "learning_rate": 5e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.7768657803535461, + "num_tokens": 490987418.0, + "step": 18971 + }, + { + "epoch": 2.0834614539863825, + "grad_norm": 1.8946218490600586, + "learning_rate": 5e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7567138671875, + "num_tokens": 491017227.0, + "step": 18972 + }, + { + "epoch": 2.0835712716889963, + "grad_norm": 1.8412396907806396, + "learning_rate": 5e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7724354863166809, + "num_tokens": 491046520.0, + "step": 18973 + }, + { + "epoch": 2.08368108939161, + "grad_norm": 2.038339853286743, + "learning_rate": 5e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7737967371940613, + "num_tokens": 491073645.0, + "step": 18974 + }, + { + "epoch": 2.0837909070942238, + "grad_norm": 1.9108208417892456, + "learning_rate": 5e-06, + "loss": 0.647, + "mean_token_accuracy": 0.7840214967727661, + "num_tokens": 491099320.0, + "step": 18975 + }, + { + "epoch": 2.083900724796837, + "grad_norm": 2.1384692192077637, + "learning_rate": 5e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7534717321395874, + "num_tokens": 491125416.0, + "step": 18976 + }, + { + "epoch": 2.084010542499451, + "grad_norm": 2.106083631515503, + "learning_rate": 5e-06, + "loss": 0.7397, + "mean_token_accuracy": 0.7606042623519897, + "num_tokens": 491150871.0, + "step": 18977 + }, + { + "epoch": 2.0841203602020646, + "grad_norm": 2.1160972118377686, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7705543041229248, + "num_tokens": 491174243.0, + "step": 18978 + }, + { + "epoch": 2.0842301779046783, + "grad_norm": 1.929060697555542, + "learning_rate": 5e-06, + "loss": 0.6405, + "mean_token_accuracy": 0.7844645380973816, + "num_tokens": 491202241.0, + "step": 18979 + }, + { + "epoch": 2.084339995607292, + "grad_norm": 1.8643581867218018, + "learning_rate": 5e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7711280584335327, + "num_tokens": 491233214.0, + "step": 18980 + }, + { + "epoch": 2.0844498133099054, + "grad_norm": 2.073488712310791, + "learning_rate": 5e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7574601769447327, + "num_tokens": 491259531.0, + "step": 18981 + }, + { + "epoch": 2.084559631012519, + "grad_norm": 1.7821747064590454, + "learning_rate": 5e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.7790131568908691, + "num_tokens": 491290267.0, + "step": 18982 + }, + { + "epoch": 2.084669448715133, + "grad_norm": 2.024601936340332, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7472656965255737, + "num_tokens": 491319992.0, + "step": 18983 + }, + { + "epoch": 2.0847792664177467, + "grad_norm": 2.206322193145752, + "learning_rate": 5e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7670748233795166, + "num_tokens": 491343285.0, + "step": 18984 + }, + { + "epoch": 2.08488908412036, + "grad_norm": 2.161968231201172, + "learning_rate": 5e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7751439213752747, + "num_tokens": 491366920.0, + "step": 18985 + }, + { + "epoch": 2.0849989018229738, + "grad_norm": 1.954336166381836, + "learning_rate": 5e-06, + "loss": 0.6776, + "mean_token_accuracy": 0.78242027759552, + "num_tokens": 491396772.0, + "step": 18986 + }, + { + "epoch": 2.0851087195255875, + "grad_norm": 2.2106258869171143, + "learning_rate": 5e-06, + "loss": 0.6232, + "mean_token_accuracy": 0.7888941764831543, + "num_tokens": 491418218.0, + "step": 18987 + }, + { + "epoch": 2.0852185372282013, + "grad_norm": 1.82503342628479, + "learning_rate": 5e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7587138414382935, + "num_tokens": 491450938.0, + "step": 18988 + }, + { + "epoch": 2.085328354930815, + "grad_norm": 2.2705612182617188, + "learning_rate": 5e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.7803766131401062, + "num_tokens": 491473640.0, + "step": 18989 + }, + { + "epoch": 2.0854381726334283, + "grad_norm": 2.313100576400757, + "learning_rate": 5e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.7683911323547363, + "num_tokens": 491494572.0, + "step": 18990 + }, + { + "epoch": 2.085547990336042, + "grad_norm": 2.1166412830352783, + "learning_rate": 5e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.7974974513053894, + "num_tokens": 491519941.0, + "step": 18991 + }, + { + "epoch": 2.085657808038656, + "grad_norm": 2.496974229812622, + "learning_rate": 5e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.7781844139099121, + "num_tokens": 491539977.0, + "step": 18992 + }, + { + "epoch": 2.0857676257412696, + "grad_norm": 2.2067720890045166, + "learning_rate": 5e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.7910375595092773, + "num_tokens": 491561806.0, + "step": 18993 + }, + { + "epoch": 2.0858774434438834, + "grad_norm": 1.8826746940612793, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7569649815559387, + "num_tokens": 491591055.0, + "step": 18994 + }, + { + "epoch": 2.0859872611464967, + "grad_norm": 2.344304323196411, + "learning_rate": 5e-06, + "loss": 0.6939, + "mean_token_accuracy": 0.7641241550445557, + "num_tokens": 491612509.0, + "step": 18995 + }, + { + "epoch": 2.0860970788491104, + "grad_norm": 2.1462321281433105, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7677108645439148, + "num_tokens": 491636112.0, + "step": 18996 + }, + { + "epoch": 2.086206896551724, + "grad_norm": 1.9342594146728516, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7671207785606384, + "num_tokens": 491665834.0, + "step": 18997 + }, + { + "epoch": 2.086316714254338, + "grad_norm": 2.154778003692627, + "learning_rate": 5e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7739671468734741, + "num_tokens": 491689237.0, + "step": 18998 + }, + { + "epoch": 2.0864265319569513, + "grad_norm": 2.2656993865966797, + "learning_rate": 5e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.7695021629333496, + "num_tokens": 491712975.0, + "step": 18999 + }, + { + "epoch": 2.086536349659565, + "grad_norm": 2.0173890590667725, + "learning_rate": 5e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7371568083763123, + "num_tokens": 491741769.0, + "step": 19000 + }, + { + "epoch": 2.086646167362179, + "grad_norm": 1.787157416343689, + "learning_rate": 5e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7488915920257568, + "num_tokens": 491775240.0, + "step": 19001 + }, + { + "epoch": 2.0867559850647925, + "grad_norm": 1.9579511880874634, + "learning_rate": 5e-06, + "loss": 0.6659, + "mean_token_accuracy": 0.7758886814117432, + "num_tokens": 491802827.0, + "step": 19002 + }, + { + "epoch": 2.0868658027674063, + "grad_norm": 1.9546657800674438, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7633534669876099, + "num_tokens": 491834079.0, + "step": 19003 + }, + { + "epoch": 2.0869756204700196, + "grad_norm": 1.9313957691192627, + "learning_rate": 5e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.7701250314712524, + "num_tokens": 491863704.0, + "step": 19004 + }, + { + "epoch": 2.0870854381726334, + "grad_norm": 2.1762843132019043, + "learning_rate": 5e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7717985510826111, + "num_tokens": 491886511.0, + "step": 19005 + }, + { + "epoch": 2.087195255875247, + "grad_norm": 1.8342739343643188, + "learning_rate": 5e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7507933974266052, + "num_tokens": 491922537.0, + "step": 19006 + }, + { + "epoch": 2.087305073577861, + "grad_norm": 2.1502525806427, + "learning_rate": 5e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7374314069747925, + "num_tokens": 491950982.0, + "step": 19007 + }, + { + "epoch": 2.087414891280474, + "grad_norm": 2.319791316986084, + "learning_rate": 5e-06, + "loss": 0.6022, + "mean_token_accuracy": 0.7971144318580627, + "num_tokens": 491970280.0, + "step": 19008 + }, + { + "epoch": 2.087524708983088, + "grad_norm": 2.0176942348480225, + "learning_rate": 5e-06, + "loss": 0.6048, + "mean_token_accuracy": 0.7963643074035645, + "num_tokens": 491993592.0, + "step": 19009 + }, + { + "epoch": 2.0876345266857017, + "grad_norm": 1.9811474084854126, + "learning_rate": 5e-06, + "loss": 0.6922, + "mean_token_accuracy": 0.7768235802650452, + "num_tokens": 492020487.0, + "step": 19010 + }, + { + "epoch": 2.0877443443883155, + "grad_norm": 1.8365041017532349, + "learning_rate": 5e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7447452545166016, + "num_tokens": 492056375.0, + "step": 19011 + }, + { + "epoch": 2.087854162090929, + "grad_norm": 2.0863211154937744, + "learning_rate": 5e-06, + "loss": 0.6348, + "mean_token_accuracy": 0.7875234484672546, + "num_tokens": 492083586.0, + "step": 19012 + }, + { + "epoch": 2.0879639797935425, + "grad_norm": 2.035426616668701, + "learning_rate": 5e-06, + "loss": 0.6521, + "mean_token_accuracy": 0.781067967414856, + "num_tokens": 492108467.0, + "step": 19013 + }, + { + "epoch": 2.0880737974961563, + "grad_norm": 1.9324805736541748, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7740302681922913, + "num_tokens": 492138919.0, + "step": 19014 + }, + { + "epoch": 2.08818361519877, + "grad_norm": 2.203401803970337, + "learning_rate": 5e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7732812762260437, + "num_tokens": 492162404.0, + "step": 19015 + }, + { + "epoch": 2.088293432901384, + "grad_norm": 1.9539638757705688, + "learning_rate": 5e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.7766690254211426, + "num_tokens": 492191213.0, + "step": 19016 + }, + { + "epoch": 2.0884032506039976, + "grad_norm": 2.5747904777526855, + "learning_rate": 5e-06, + "loss": 0.612, + "mean_token_accuracy": 0.7945449352264404, + "num_tokens": 492209872.0, + "step": 19017 + }, + { + "epoch": 2.088513068306611, + "grad_norm": 1.9128164052963257, + "learning_rate": 5e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7773785591125488, + "num_tokens": 492238154.0, + "step": 19018 + }, + { + "epoch": 2.0886228860092246, + "grad_norm": 2.2063779830932617, + "learning_rate": 5e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.7874976396560669, + "num_tokens": 492263298.0, + "step": 19019 + }, + { + "epoch": 2.0887327037118384, + "grad_norm": 2.118363380432129, + "learning_rate": 5e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7708752751350403, + "num_tokens": 492289839.0, + "step": 19020 + }, + { + "epoch": 2.088842521414452, + "grad_norm": 2.1995840072631836, + "learning_rate": 5e-06, + "loss": 0.6403, + "mean_token_accuracy": 0.7855209112167358, + "num_tokens": 492312244.0, + "step": 19021 + }, + { + "epoch": 2.0889523391170655, + "grad_norm": 2.1858222484588623, + "learning_rate": 5e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7598370313644409, + "num_tokens": 492337978.0, + "step": 19022 + }, + { + "epoch": 2.089062156819679, + "grad_norm": 2.2847018241882324, + "learning_rate": 5e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7465495467185974, + "num_tokens": 492362710.0, + "step": 19023 + }, + { + "epoch": 2.089171974522293, + "grad_norm": 2.0063796043395996, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7636423110961914, + "num_tokens": 492391674.0, + "step": 19024 + }, + { + "epoch": 2.0892817922249067, + "grad_norm": 1.982683777809143, + "learning_rate": 5e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7652794122695923, + "num_tokens": 492423741.0, + "step": 19025 + }, + { + "epoch": 2.0893916099275205, + "grad_norm": 2.175278425216675, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7801008224487305, + "num_tokens": 492446538.0, + "step": 19026 + }, + { + "epoch": 2.089501427630134, + "grad_norm": 2.0797922611236572, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7705850005149841, + "num_tokens": 492474911.0, + "step": 19027 + }, + { + "epoch": 2.0896112453327476, + "grad_norm": 2.363860607147217, + "learning_rate": 5e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7717834711074829, + "num_tokens": 492495803.0, + "step": 19028 + }, + { + "epoch": 2.0897210630353613, + "grad_norm": 2.6178581714630127, + "learning_rate": 5e-06, + "loss": 0.5819, + "mean_token_accuracy": 0.8004733324050903, + "num_tokens": 492513659.0, + "step": 19029 + }, + { + "epoch": 2.089830880737975, + "grad_norm": 1.9224140644073486, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7558680176734924, + "num_tokens": 492543961.0, + "step": 19030 + }, + { + "epoch": 2.089940698440589, + "grad_norm": 2.2835400104522705, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7805563807487488, + "num_tokens": 492566818.0, + "step": 19031 + }, + { + "epoch": 2.090050516143202, + "grad_norm": 2.087843656539917, + "learning_rate": 5e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.7739861607551575, + "num_tokens": 492591119.0, + "step": 19032 + }, + { + "epoch": 2.090160333845816, + "grad_norm": 1.9564409255981445, + "learning_rate": 5e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7680213451385498, + "num_tokens": 492621571.0, + "step": 19033 + }, + { + "epoch": 2.0902701515484297, + "grad_norm": 1.9363205432891846, + "learning_rate": 5e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7628819942474365, + "num_tokens": 492649677.0, + "step": 19034 + }, + { + "epoch": 2.0903799692510434, + "grad_norm": 1.946494698524475, + "learning_rate": 5e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7485467195510864, + "num_tokens": 492680525.0, + "step": 19035 + }, + { + "epoch": 2.0904897869536567, + "grad_norm": 1.9369755983352661, + "learning_rate": 5e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7699007987976074, + "num_tokens": 492706481.0, + "step": 19036 + }, + { + "epoch": 2.0905996046562705, + "grad_norm": 1.883748173713684, + "learning_rate": 5e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7662205696105957, + "num_tokens": 492736716.0, + "step": 19037 + }, + { + "epoch": 2.0907094223588842, + "grad_norm": 1.9709266424179077, + "learning_rate": 5e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7606170773506165, + "num_tokens": 492766199.0, + "step": 19038 + }, + { + "epoch": 2.090819240061498, + "grad_norm": 2.118405818939209, + "learning_rate": 5e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7685798406600952, + "num_tokens": 492792592.0, + "step": 19039 + }, + { + "epoch": 2.0909290577641118, + "grad_norm": 2.041001558303833, + "learning_rate": 5e-06, + "loss": 0.638, + "mean_token_accuracy": 0.7902142405509949, + "num_tokens": 492818522.0, + "step": 19040 + }, + { + "epoch": 2.091038875466725, + "grad_norm": 1.952019453048706, + "learning_rate": 5e-06, + "loss": 0.6527, + "mean_token_accuracy": 0.783963680267334, + "num_tokens": 492845151.0, + "step": 19041 + }, + { + "epoch": 2.091148693169339, + "grad_norm": 2.344735860824585, + "learning_rate": 5e-06, + "loss": 0.5694, + "mean_token_accuracy": 0.8082118034362793, + "num_tokens": 492864121.0, + "step": 19042 + }, + { + "epoch": 2.0912585108719526, + "grad_norm": 1.9555842876434326, + "learning_rate": 5e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7585248351097107, + "num_tokens": 492893400.0, + "step": 19043 + }, + { + "epoch": 2.0913683285745663, + "grad_norm": 1.9140827655792236, + "learning_rate": 5e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.7867534756660461, + "num_tokens": 492922041.0, + "step": 19044 + }, + { + "epoch": 2.09147814627718, + "grad_norm": 2.167011022567749, + "learning_rate": 5e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7714613676071167, + "num_tokens": 492949867.0, + "step": 19045 + }, + { + "epoch": 2.0915879639797934, + "grad_norm": 2.1080925464630127, + "learning_rate": 5e-06, + "loss": 0.5994, + "mean_token_accuracy": 0.7986976504325867, + "num_tokens": 492971732.0, + "step": 19046 + }, + { + "epoch": 2.091697781682407, + "grad_norm": 2.049858331680298, + "learning_rate": 5e-06, + "loss": 0.6204, + "mean_token_accuracy": 0.7906583547592163, + "num_tokens": 492996459.0, + "step": 19047 + }, + { + "epoch": 2.091807599385021, + "grad_norm": 2.1647017002105713, + "learning_rate": 5e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7465949058532715, + "num_tokens": 493019905.0, + "step": 19048 + }, + { + "epoch": 2.0919174170876347, + "grad_norm": 2.0839552879333496, + "learning_rate": 5e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7594226002693176, + "num_tokens": 493047567.0, + "step": 19049 + }, + { + "epoch": 2.092027234790248, + "grad_norm": 2.1016581058502197, + "learning_rate": 5e-06, + "loss": 0.6639, + "mean_token_accuracy": 0.79155433177948, + "num_tokens": 493072497.0, + "step": 19050 + }, + { + "epoch": 2.0921370524928617, + "grad_norm": 2.1674184799194336, + "learning_rate": 5e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.7996470928192139, + "num_tokens": 493094195.0, + "step": 19051 + }, + { + "epoch": 2.0922468701954755, + "grad_norm": 2.22965669631958, + "learning_rate": 5e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.7813639640808105, + "num_tokens": 493116312.0, + "step": 19052 + }, + { + "epoch": 2.0923566878980893, + "grad_norm": 2.162184000015259, + "learning_rate": 5e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.7863027453422546, + "num_tokens": 493139272.0, + "step": 19053 + }, + { + "epoch": 2.092466505600703, + "grad_norm": 2.006721258163452, + "learning_rate": 5e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.7552182674407959, + "num_tokens": 493169026.0, + "step": 19054 + }, + { + "epoch": 2.0925763233033163, + "grad_norm": 2.020559549331665, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7465753555297852, + "num_tokens": 493196573.0, + "step": 19055 + }, + { + "epoch": 2.09268614100593, + "grad_norm": 1.9573965072631836, + "learning_rate": 5e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7568769454956055, + "num_tokens": 493225007.0, + "step": 19056 + }, + { + "epoch": 2.092795958708544, + "grad_norm": 1.9796690940856934, + "learning_rate": 5e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7477364540100098, + "num_tokens": 493254675.0, + "step": 19057 + }, + { + "epoch": 2.0929057764111576, + "grad_norm": 2.191530704498291, + "learning_rate": 5e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.803137481212616, + "num_tokens": 493275090.0, + "step": 19058 + }, + { + "epoch": 2.093015594113771, + "grad_norm": 2.259723663330078, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7658303380012512, + "num_tokens": 493298731.0, + "step": 19059 + }, + { + "epoch": 2.0931254118163847, + "grad_norm": 2.0523452758789062, + "learning_rate": 5e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.7775585651397705, + "num_tokens": 493324520.0, + "step": 19060 + }, + { + "epoch": 2.0932352295189984, + "grad_norm": 2.16023325920105, + "learning_rate": 5e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7687240242958069, + "num_tokens": 493347491.0, + "step": 19061 + }, + { + "epoch": 2.093345047221612, + "grad_norm": 2.1853854656219482, + "learning_rate": 5e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7549356818199158, + "num_tokens": 493373714.0, + "step": 19062 + }, + { + "epoch": 2.093454864924226, + "grad_norm": 1.8854336738586426, + "learning_rate": 5e-06, + "loss": 0.6822, + "mean_token_accuracy": 0.7804985046386719, + "num_tokens": 493401648.0, + "step": 19063 + }, + { + "epoch": 2.0935646826268393, + "grad_norm": 2.2421226501464844, + "learning_rate": 5e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.7840929627418518, + "num_tokens": 493425346.0, + "step": 19064 + }, + { + "epoch": 2.093674500329453, + "grad_norm": 1.9158990383148193, + "learning_rate": 5e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7684913277626038, + "num_tokens": 493455161.0, + "step": 19065 + }, + { + "epoch": 2.0937843180320668, + "grad_norm": 2.0192058086395264, + "learning_rate": 5e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.790345311164856, + "num_tokens": 493482385.0, + "step": 19066 + }, + { + "epoch": 2.0938941357346805, + "grad_norm": 1.8459446430206299, + "learning_rate": 5e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.7928231358528137, + "num_tokens": 493513924.0, + "step": 19067 + }, + { + "epoch": 2.0940039534372943, + "grad_norm": 2.0487685203552246, + "learning_rate": 5e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7593865394592285, + "num_tokens": 493541745.0, + "step": 19068 + }, + { + "epoch": 2.0941137711399076, + "grad_norm": 2.078336000442505, + "learning_rate": 5e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7772699594497681, + "num_tokens": 493567519.0, + "step": 19069 + }, + { + "epoch": 2.0942235888425214, + "grad_norm": 1.9067028760910034, + "learning_rate": 5e-06, + "loss": 0.6287, + "mean_token_accuracy": 0.7866666913032532, + "num_tokens": 493594449.0, + "step": 19070 + }, + { + "epoch": 2.094333406545135, + "grad_norm": 2.0801448822021484, + "learning_rate": 5e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7724121809005737, + "num_tokens": 493619280.0, + "step": 19071 + }, + { + "epoch": 2.094443224247749, + "grad_norm": 2.1808083057403564, + "learning_rate": 5e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7552586793899536, + "num_tokens": 493644944.0, + "step": 19072 + }, + { + "epoch": 2.0945530419503626, + "grad_norm": 1.943846344947815, + "learning_rate": 5e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.775869607925415, + "num_tokens": 493675539.0, + "step": 19073 + }, + { + "epoch": 2.094662859652976, + "grad_norm": 1.9923211336135864, + "learning_rate": 5e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7690879106521606, + "num_tokens": 493705142.0, + "step": 19074 + }, + { + "epoch": 2.0947726773555897, + "grad_norm": 1.972254991531372, + "learning_rate": 5e-06, + "loss": 0.7319, + "mean_token_accuracy": 0.7628258466720581, + "num_tokens": 493733155.0, + "step": 19075 + }, + { + "epoch": 2.0948824950582035, + "grad_norm": 2.343729257583618, + "learning_rate": 5e-06, + "loss": 0.636, + "mean_token_accuracy": 0.7868320941925049, + "num_tokens": 493752982.0, + "step": 19076 + }, + { + "epoch": 2.094992312760817, + "grad_norm": 1.9311110973358154, + "learning_rate": 5e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7401053309440613, + "num_tokens": 493784568.0, + "step": 19077 + }, + { + "epoch": 2.0951021304634305, + "grad_norm": 2.3915724754333496, + "learning_rate": 5e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7590444087982178, + "num_tokens": 493809367.0, + "step": 19078 + }, + { + "epoch": 2.0952119481660443, + "grad_norm": 1.8453422784805298, + "learning_rate": 5e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.7679096460342407, + "num_tokens": 493839969.0, + "step": 19079 + }, + { + "epoch": 2.095321765868658, + "grad_norm": 2.244748830795288, + "learning_rate": 5e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7698877453804016, + "num_tokens": 493862420.0, + "step": 19080 + }, + { + "epoch": 2.095431583571272, + "grad_norm": 1.8007348775863647, + "learning_rate": 5e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.7531018257141113, + "num_tokens": 493893222.0, + "step": 19081 + }, + { + "epoch": 2.0955414012738856, + "grad_norm": 2.015754461288452, + "learning_rate": 5e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7586251497268677, + "num_tokens": 493922648.0, + "step": 19082 + }, + { + "epoch": 2.095651218976499, + "grad_norm": 2.2305514812469482, + "learning_rate": 5e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.7770542502403259, + "num_tokens": 493945814.0, + "step": 19083 + }, + { + "epoch": 2.0957610366791126, + "grad_norm": 2.3251049518585205, + "learning_rate": 5e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.7805755138397217, + "num_tokens": 493966693.0, + "step": 19084 + }, + { + "epoch": 2.0958708543817264, + "grad_norm": 1.9930832386016846, + "learning_rate": 5e-06, + "loss": 0.6225, + "mean_token_accuracy": 0.786737322807312, + "num_tokens": 493992587.0, + "step": 19085 + }, + { + "epoch": 2.09598067208434, + "grad_norm": 1.9916404485702515, + "learning_rate": 5e-06, + "loss": 0.6198, + "mean_token_accuracy": 0.7977675199508667, + "num_tokens": 494017834.0, + "step": 19086 + }, + { + "epoch": 2.0960904897869534, + "grad_norm": 1.889396071434021, + "learning_rate": 5e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7416889667510986, + "num_tokens": 494047647.0, + "step": 19087 + }, + { + "epoch": 2.096200307489567, + "grad_norm": 2.0057098865509033, + "learning_rate": 5e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.7793399095535278, + "num_tokens": 494074941.0, + "step": 19088 + }, + { + "epoch": 2.096310125192181, + "grad_norm": 2.103851079940796, + "learning_rate": 5e-06, + "loss": 0.6729, + "mean_token_accuracy": 0.7774800062179565, + "num_tokens": 494098657.0, + "step": 19089 + }, + { + "epoch": 2.0964199428947947, + "grad_norm": 2.270442485809326, + "learning_rate": 5e-06, + "loss": 0.6266, + "mean_token_accuracy": 0.7896392345428467, + "num_tokens": 494120057.0, + "step": 19090 + }, + { + "epoch": 2.0965297605974085, + "grad_norm": 2.097405433654785, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7483939528465271, + "num_tokens": 494147731.0, + "step": 19091 + }, + { + "epoch": 2.096639578300022, + "grad_norm": 1.9233790636062622, + "learning_rate": 5e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.750999927520752, + "num_tokens": 494179408.0, + "step": 19092 + }, + { + "epoch": 2.0967493960026355, + "grad_norm": 1.9920963048934937, + "learning_rate": 5e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.7908336520195007, + "num_tokens": 494208199.0, + "step": 19093 + }, + { + "epoch": 2.0968592137052493, + "grad_norm": 2.023442506790161, + "learning_rate": 5e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7455481886863708, + "num_tokens": 494237661.0, + "step": 19094 + }, + { + "epoch": 2.096969031407863, + "grad_norm": 1.9725522994995117, + "learning_rate": 5e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7763167023658752, + "num_tokens": 494263650.0, + "step": 19095 + }, + { + "epoch": 2.097078849110477, + "grad_norm": 2.333658456802368, + "learning_rate": 5e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.782988429069519, + "num_tokens": 494285641.0, + "step": 19096 + }, + { + "epoch": 2.09718866681309, + "grad_norm": 2.2968783378601074, + "learning_rate": 5e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.7762786149978638, + "num_tokens": 494307912.0, + "step": 19097 + }, + { + "epoch": 2.097298484515704, + "grad_norm": 1.8546675443649292, + "learning_rate": 5e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.760326623916626, + "num_tokens": 494338972.0, + "step": 19098 + }, + { + "epoch": 2.0974083022183176, + "grad_norm": 1.9327346086502075, + "learning_rate": 5e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7761246562004089, + "num_tokens": 494367278.0, + "step": 19099 + }, + { + "epoch": 2.0975181199209314, + "grad_norm": 1.8728697299957275, + "learning_rate": 5e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.7708778381347656, + "num_tokens": 494395043.0, + "step": 19100 + }, + { + "epoch": 2.0976279376235447, + "grad_norm": 1.9842737913131714, + "learning_rate": 5e-06, + "loss": 0.6278, + "mean_token_accuracy": 0.7918243408203125, + "num_tokens": 494422686.0, + "step": 19101 + }, + { + "epoch": 2.0977377553261585, + "grad_norm": 2.295170307159424, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.7674477100372314, + "num_tokens": 494447586.0, + "step": 19102 + }, + { + "epoch": 2.0978475730287722, + "grad_norm": 2.5510032176971436, + "learning_rate": 5e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7761392593383789, + "num_tokens": 494467575.0, + "step": 19103 + }, + { + "epoch": 2.097957390731386, + "grad_norm": 2.117213487625122, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.766341507434845, + "num_tokens": 494492477.0, + "step": 19104 + }, + { + "epoch": 2.0980672084339997, + "grad_norm": 2.0600061416625977, + "learning_rate": 5e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7678968906402588, + "num_tokens": 494519343.0, + "step": 19105 + }, + { + "epoch": 2.098177026136613, + "grad_norm": 2.3399715423583984, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7786973118782043, + "num_tokens": 494541263.0, + "step": 19106 + }, + { + "epoch": 2.098286843839227, + "grad_norm": 1.8843914270401, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.7721815705299377, + "num_tokens": 494571554.0, + "step": 19107 + }, + { + "epoch": 2.0983966615418406, + "grad_norm": 2.1052873134613037, + "learning_rate": 5e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7594105005264282, + "num_tokens": 494597977.0, + "step": 19108 + }, + { + "epoch": 2.0985064792444543, + "grad_norm": 2.0722856521606445, + "learning_rate": 5e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7698612213134766, + "num_tokens": 494624147.0, + "step": 19109 + }, + { + "epoch": 2.0986162969470676, + "grad_norm": 2.4575161933898926, + "learning_rate": 5e-06, + "loss": 0.5731, + "mean_token_accuracy": 0.8035604357719421, + "num_tokens": 494642679.0, + "step": 19110 + }, + { + "epoch": 2.0987261146496814, + "grad_norm": 1.9762598276138306, + "learning_rate": 5e-06, + "loss": 0.6641, + "mean_token_accuracy": 0.7831012010574341, + "num_tokens": 494670039.0, + "step": 19111 + }, + { + "epoch": 2.098835932352295, + "grad_norm": 2.2094626426696777, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7863520383834839, + "num_tokens": 494692157.0, + "step": 19112 + }, + { + "epoch": 2.098945750054909, + "grad_norm": 2.09751033782959, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7684518098831177, + "num_tokens": 494717867.0, + "step": 19113 + }, + { + "epoch": 2.0990555677575227, + "grad_norm": 2.3857874870300293, + "learning_rate": 5e-06, + "loss": 0.6693, + "mean_token_accuracy": 0.7754117250442505, + "num_tokens": 494739384.0, + "step": 19114 + }, + { + "epoch": 2.099165385460136, + "grad_norm": 1.9969801902770996, + "learning_rate": 5e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.7815226316452026, + "num_tokens": 494766209.0, + "step": 19115 + }, + { + "epoch": 2.0992752031627497, + "grad_norm": 1.9645634889602661, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7815101742744446, + "num_tokens": 494795048.0, + "step": 19116 + }, + { + "epoch": 2.0993850208653635, + "grad_norm": 2.3590545654296875, + "learning_rate": 5e-06, + "loss": 0.609, + "mean_token_accuracy": 0.7961228489875793, + "num_tokens": 494814631.0, + "step": 19117 + }, + { + "epoch": 2.0994948385679773, + "grad_norm": 1.9743272066116333, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7709340453147888, + "num_tokens": 494843882.0, + "step": 19118 + }, + { + "epoch": 2.099604656270591, + "grad_norm": 2.0390396118164062, + "learning_rate": 5e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7563363909721375, + "num_tokens": 494871604.0, + "step": 19119 + }, + { + "epoch": 2.0997144739732043, + "grad_norm": 2.0493710041046143, + "learning_rate": 5e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7589102983474731, + "num_tokens": 494897968.0, + "step": 19120 + }, + { + "epoch": 2.099824291675818, + "grad_norm": 1.9549729824066162, + "learning_rate": 5e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.7844269275665283, + "num_tokens": 494925654.0, + "step": 19121 + }, + { + "epoch": 2.099934109378432, + "grad_norm": 1.9661650657653809, + "learning_rate": 5e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7656968235969543, + "num_tokens": 494954730.0, + "step": 19122 + }, + { + "epoch": 2.1000439270810456, + "grad_norm": 1.9855660200119019, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.7752411365509033, + "num_tokens": 494980740.0, + "step": 19123 + }, + { + "epoch": 2.1001537447836593, + "grad_norm": 1.9722685813903809, + "learning_rate": 5e-06, + "loss": 0.6665, + "mean_token_accuracy": 0.7782423496246338, + "num_tokens": 495007706.0, + "step": 19124 + }, + { + "epoch": 2.1002635624862727, + "grad_norm": 2.0279183387756348, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7519747018814087, + "num_tokens": 495033305.0, + "step": 19125 + }, + { + "epoch": 2.1003733801888864, + "grad_norm": 2.117924213409424, + "learning_rate": 5e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.7574834823608398, + "num_tokens": 495059655.0, + "step": 19126 + }, + { + "epoch": 2.1004831978915, + "grad_norm": 2.2406601905822754, + "learning_rate": 5e-06, + "loss": 0.536, + "mean_token_accuracy": 0.8141400218009949, + "num_tokens": 495079042.0, + "step": 19127 + }, + { + "epoch": 2.100593015594114, + "grad_norm": 1.966426134109497, + "learning_rate": 5e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7661201357841492, + "num_tokens": 495107918.0, + "step": 19128 + }, + { + "epoch": 2.1007028332967272, + "grad_norm": 2.5392801761627197, + "learning_rate": 5e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.7917307019233704, + "num_tokens": 495127561.0, + "step": 19129 + }, + { + "epoch": 2.100812650999341, + "grad_norm": 1.9801057577133179, + "learning_rate": 5e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7687603235244751, + "num_tokens": 495155902.0, + "step": 19130 + }, + { + "epoch": 2.1009224687019548, + "grad_norm": 1.9465501308441162, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7655630111694336, + "num_tokens": 495184205.0, + "step": 19131 + }, + { + "epoch": 2.1010322864045685, + "grad_norm": 2.117379903793335, + "learning_rate": 5e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7653619050979614, + "num_tokens": 495210751.0, + "step": 19132 + }, + { + "epoch": 2.1011421041071823, + "grad_norm": 2.1654365062713623, + "learning_rate": 5e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7529804110527039, + "num_tokens": 495236494.0, + "step": 19133 + }, + { + "epoch": 2.1012519218097956, + "grad_norm": 1.9427642822265625, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.757165253162384, + "num_tokens": 495267632.0, + "step": 19134 + }, + { + "epoch": 2.1013617395124093, + "grad_norm": 2.182185173034668, + "learning_rate": 5e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7664835453033447, + "num_tokens": 495293565.0, + "step": 19135 + }, + { + "epoch": 2.101471557215023, + "grad_norm": 1.7927905321121216, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7600893974304199, + "num_tokens": 495326973.0, + "step": 19136 + }, + { + "epoch": 2.101581374917637, + "grad_norm": 1.9155365228652954, + "learning_rate": 5e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.7920938730239868, + "num_tokens": 495352819.0, + "step": 19137 + }, + { + "epoch": 2.10169119262025, + "grad_norm": 2.164475440979004, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7570456266403198, + "num_tokens": 495380454.0, + "step": 19138 + }, + { + "epoch": 2.101801010322864, + "grad_norm": 1.9715238809585571, + "learning_rate": 5e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.7814633846282959, + "num_tokens": 495407885.0, + "step": 19139 + }, + { + "epoch": 2.1019108280254777, + "grad_norm": 2.153921604156494, + "learning_rate": 5e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.7774990200996399, + "num_tokens": 495430992.0, + "step": 19140 + }, + { + "epoch": 2.1020206457280914, + "grad_norm": 2.2746429443359375, + "learning_rate": 5e-06, + "loss": 0.5605, + "mean_token_accuracy": 0.8078961372375488, + "num_tokens": 495451437.0, + "step": 19141 + }, + { + "epoch": 2.102130463430705, + "grad_norm": 2.1522440910339355, + "learning_rate": 5e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7646690011024475, + "num_tokens": 495477214.0, + "step": 19142 + }, + { + "epoch": 2.1022402811333185, + "grad_norm": 2.1615500450134277, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7546601891517639, + "num_tokens": 495503099.0, + "step": 19143 + }, + { + "epoch": 2.1023500988359323, + "grad_norm": 2.185636043548584, + "learning_rate": 5e-06, + "loss": 0.6639, + "mean_token_accuracy": 0.7822648882865906, + "num_tokens": 495526544.0, + "step": 19144 + }, + { + "epoch": 2.102459916538546, + "grad_norm": 1.9765220880508423, + "learning_rate": 5e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.773816704750061, + "num_tokens": 495553819.0, + "step": 19145 + }, + { + "epoch": 2.10256973424116, + "grad_norm": 2.236180305480957, + "learning_rate": 5e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.7644525170326233, + "num_tokens": 495577878.0, + "step": 19146 + }, + { + "epoch": 2.1026795519437735, + "grad_norm": 1.9838835000991821, + "learning_rate": 5e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7576412558555603, + "num_tokens": 495604692.0, + "step": 19147 + }, + { + "epoch": 2.102789369646387, + "grad_norm": 2.246479034423828, + "learning_rate": 5e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7627601623535156, + "num_tokens": 495629804.0, + "step": 19148 + }, + { + "epoch": 2.1028991873490006, + "grad_norm": 1.9762768745422363, + "learning_rate": 5e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7741802334785461, + "num_tokens": 495655201.0, + "step": 19149 + }, + { + "epoch": 2.1030090050516144, + "grad_norm": 2.0074315071105957, + "learning_rate": 5e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7763141393661499, + "num_tokens": 495682632.0, + "step": 19150 + }, + { + "epoch": 2.103118822754228, + "grad_norm": 1.9488039016723633, + "learning_rate": 5e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.7846153378486633, + "num_tokens": 495708835.0, + "step": 19151 + }, + { + "epoch": 2.1032286404568414, + "grad_norm": 1.7432392835617065, + "learning_rate": 5e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7646297812461853, + "num_tokens": 495743235.0, + "step": 19152 + }, + { + "epoch": 2.103338458159455, + "grad_norm": 1.8602147102355957, + "learning_rate": 5e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7571254968643188, + "num_tokens": 495772229.0, + "step": 19153 + }, + { + "epoch": 2.103448275862069, + "grad_norm": 2.370342254638672, + "learning_rate": 5e-06, + "loss": 0.5994, + "mean_token_accuracy": 0.7933675050735474, + "num_tokens": 495791056.0, + "step": 19154 + }, + { + "epoch": 2.1035580935646827, + "grad_norm": 1.8311901092529297, + "learning_rate": 5e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7638328671455383, + "num_tokens": 495822632.0, + "step": 19155 + }, + { + "epoch": 2.1036679112672965, + "grad_norm": 2.267951250076294, + "learning_rate": 5e-06, + "loss": 0.6699, + "mean_token_accuracy": 0.7780084609985352, + "num_tokens": 495844389.0, + "step": 19156 + }, + { + "epoch": 2.1037777289699098, + "grad_norm": 2.313384771347046, + "learning_rate": 5e-06, + "loss": 0.6292, + "mean_token_accuracy": 0.7885802984237671, + "num_tokens": 495863438.0, + "step": 19157 + }, + { + "epoch": 2.1038875466725235, + "grad_norm": 2.1768462657928467, + "learning_rate": 5e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7748121023178101, + "num_tokens": 495887078.0, + "step": 19158 + }, + { + "epoch": 2.1039973643751373, + "grad_norm": 2.1080145835876465, + "learning_rate": 5e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7661774158477783, + "num_tokens": 495913291.0, + "step": 19159 + }, + { + "epoch": 2.104107182077751, + "grad_norm": 2.026315212249756, + "learning_rate": 5e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7770893573760986, + "num_tokens": 495938970.0, + "step": 19160 + }, + { + "epoch": 2.104216999780365, + "grad_norm": 2.0450806617736816, + "learning_rate": 5e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.78925621509552, + "num_tokens": 495963400.0, + "step": 19161 + }, + { + "epoch": 2.104326817482978, + "grad_norm": 2.3016674518585205, + "learning_rate": 5e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.7896810173988342, + "num_tokens": 495983709.0, + "step": 19162 + }, + { + "epoch": 2.104436635185592, + "grad_norm": 2.195483684539795, + "learning_rate": 5e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7711458206176758, + "num_tokens": 496006569.0, + "step": 19163 + }, + { + "epoch": 2.1045464528882056, + "grad_norm": 2.1629440784454346, + "learning_rate": 5e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.7499854564666748, + "num_tokens": 496035718.0, + "step": 19164 + }, + { + "epoch": 2.1046562705908194, + "grad_norm": 2.279460906982422, + "learning_rate": 5e-06, + "loss": 0.6184, + "mean_token_accuracy": 0.7957594990730286, + "num_tokens": 496057117.0, + "step": 19165 + }, + { + "epoch": 2.1047660882934327, + "grad_norm": 2.051241636276245, + "learning_rate": 5e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.7888950705528259, + "num_tokens": 496080763.0, + "step": 19166 + }, + { + "epoch": 2.1048759059960465, + "grad_norm": 2.1776530742645264, + "learning_rate": 5e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.763666570186615, + "num_tokens": 496105959.0, + "step": 19167 + }, + { + "epoch": 2.10498572369866, + "grad_norm": 2.1722702980041504, + "learning_rate": 5e-06, + "loss": 0.6455, + "mean_token_accuracy": 0.7893773317337036, + "num_tokens": 496129301.0, + "step": 19168 + }, + { + "epoch": 2.105095541401274, + "grad_norm": 1.962270736694336, + "learning_rate": 5e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7651546001434326, + "num_tokens": 496156038.0, + "step": 19169 + }, + { + "epoch": 2.1052053591038877, + "grad_norm": 1.9971258640289307, + "learning_rate": 5e-06, + "loss": 0.567, + "mean_token_accuracy": 0.8074361085891724, + "num_tokens": 496180316.0, + "step": 19170 + }, + { + "epoch": 2.105315176806501, + "grad_norm": 2.078433036804199, + "learning_rate": 5e-06, + "loss": 0.6761, + "mean_token_accuracy": 0.7821364402770996, + "num_tokens": 496204193.0, + "step": 19171 + }, + { + "epoch": 2.105424994509115, + "grad_norm": 2.102020025253296, + "learning_rate": 5e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7700494527816772, + "num_tokens": 496230586.0, + "step": 19172 + }, + { + "epoch": 2.1055348122117286, + "grad_norm": 2.1704328060150146, + "learning_rate": 5e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.7686312794685364, + "num_tokens": 496256483.0, + "step": 19173 + }, + { + "epoch": 2.1056446299143423, + "grad_norm": 2.1912591457366943, + "learning_rate": 5e-06, + "loss": 0.5848, + "mean_token_accuracy": 0.8020766377449036, + "num_tokens": 496280190.0, + "step": 19174 + }, + { + "epoch": 2.105754447616956, + "grad_norm": 1.9915963411331177, + "learning_rate": 5e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.7514649629592896, + "num_tokens": 496310187.0, + "step": 19175 + }, + { + "epoch": 2.1058642653195694, + "grad_norm": 2.098233222961426, + "learning_rate": 5e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.7767390012741089, + "num_tokens": 496336085.0, + "step": 19176 + }, + { + "epoch": 2.105974083022183, + "grad_norm": 1.8406270742416382, + "learning_rate": 5e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.759139895439148, + "num_tokens": 496367378.0, + "step": 19177 + }, + { + "epoch": 2.106083900724797, + "grad_norm": 2.3505935668945312, + "learning_rate": 5e-06, + "loss": 0.627, + "mean_token_accuracy": 0.7870767712593079, + "num_tokens": 496389392.0, + "step": 19178 + }, + { + "epoch": 2.1061937184274107, + "grad_norm": 2.2448489665985107, + "learning_rate": 5e-06, + "loss": 0.6152, + "mean_token_accuracy": 0.7970431447029114, + "num_tokens": 496409053.0, + "step": 19179 + }, + { + "epoch": 2.106303536130024, + "grad_norm": 1.9801249504089355, + "learning_rate": 5e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.7769325971603394, + "num_tokens": 496435997.0, + "step": 19180 + }, + { + "epoch": 2.1064133538326377, + "grad_norm": 1.9165641069412231, + "learning_rate": 5e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7676198482513428, + "num_tokens": 496465123.0, + "step": 19181 + }, + { + "epoch": 2.1065231715352515, + "grad_norm": 1.8280916213989258, + "learning_rate": 5e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7549780607223511, + "num_tokens": 496498190.0, + "step": 19182 + }, + { + "epoch": 2.1066329892378652, + "grad_norm": 1.9595239162445068, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7367115020751953, + "num_tokens": 496529340.0, + "step": 19183 + }, + { + "epoch": 2.106742806940479, + "grad_norm": 1.929893136024475, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7600945234298706, + "num_tokens": 496558864.0, + "step": 19184 + }, + { + "epoch": 2.1068526246430923, + "grad_norm": 2.1456797122955322, + "learning_rate": 5e-06, + "loss": 0.6728, + "mean_token_accuracy": 0.7751586437225342, + "num_tokens": 496583128.0, + "step": 19185 + }, + { + "epoch": 2.106962442345706, + "grad_norm": 2.1294286251068115, + "learning_rate": 5e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7494497895240784, + "num_tokens": 496610286.0, + "step": 19186 + }, + { + "epoch": 2.10707226004832, + "grad_norm": 1.9640074968338013, + "learning_rate": 5e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.7714876532554626, + "num_tokens": 496640299.0, + "step": 19187 + }, + { + "epoch": 2.1071820777509336, + "grad_norm": 2.233900547027588, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7702263593673706, + "num_tokens": 496662816.0, + "step": 19188 + }, + { + "epoch": 2.107291895453547, + "grad_norm": 1.946324110031128, + "learning_rate": 5e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7551151514053345, + "num_tokens": 496694745.0, + "step": 19189 + }, + { + "epoch": 2.1074017131561606, + "grad_norm": 2.2092134952545166, + "learning_rate": 5e-06, + "loss": 0.6095, + "mean_token_accuracy": 0.7948406338691711, + "num_tokens": 496715298.0, + "step": 19190 + }, + { + "epoch": 2.1075115308587744, + "grad_norm": 2.27237606048584, + "learning_rate": 5e-06, + "loss": 0.6626, + "mean_token_accuracy": 0.7814092636108398, + "num_tokens": 496735762.0, + "step": 19191 + }, + { + "epoch": 2.107621348561388, + "grad_norm": 1.9966200590133667, + "learning_rate": 5e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7679265737533569, + "num_tokens": 496759836.0, + "step": 19192 + }, + { + "epoch": 2.107731166264002, + "grad_norm": 2.1914710998535156, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7701209187507629, + "num_tokens": 496781751.0, + "step": 19193 + }, + { + "epoch": 2.1078409839666152, + "grad_norm": 2.3327529430389404, + "learning_rate": 5e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.7797421216964722, + "num_tokens": 496802735.0, + "step": 19194 + }, + { + "epoch": 2.107950801669229, + "grad_norm": 2.08900785446167, + "learning_rate": 5e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7531952261924744, + "num_tokens": 496829953.0, + "step": 19195 + }, + { + "epoch": 2.1080606193718427, + "grad_norm": 1.804262399673462, + "learning_rate": 5e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.7639638185501099, + "num_tokens": 496859353.0, + "step": 19196 + }, + { + "epoch": 2.1081704370744565, + "grad_norm": 2.0845234394073486, + "learning_rate": 5e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.7897364497184753, + "num_tokens": 496883308.0, + "step": 19197 + }, + { + "epoch": 2.1082802547770703, + "grad_norm": 1.920615553855896, + "learning_rate": 5e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7797248363494873, + "num_tokens": 496912263.0, + "step": 19198 + }, + { + "epoch": 2.1083900724796836, + "grad_norm": 1.9732985496520996, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7547118663787842, + "num_tokens": 496939955.0, + "step": 19199 + }, + { + "epoch": 2.1084998901822973, + "grad_norm": 2.0677990913391113, + "learning_rate": 5e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7706533670425415, + "num_tokens": 496967265.0, + "step": 19200 + }, + { + "epoch": 2.108609707884911, + "grad_norm": 2.069333791732788, + "learning_rate": 5e-06, + "loss": 0.6647, + "mean_token_accuracy": 0.784950852394104, + "num_tokens": 496992216.0, + "step": 19201 + }, + { + "epoch": 2.108719525587525, + "grad_norm": 1.9486260414123535, + "learning_rate": 5e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7564144134521484, + "num_tokens": 497023398.0, + "step": 19202 + }, + { + "epoch": 2.108829343290138, + "grad_norm": 1.9752079248428345, + "learning_rate": 5e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.7829858064651489, + "num_tokens": 497051636.0, + "step": 19203 + }, + { + "epoch": 2.108939160992752, + "grad_norm": 2.138917922973633, + "learning_rate": 5e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7551430463790894, + "num_tokens": 497078023.0, + "step": 19204 + }, + { + "epoch": 2.1090489786953657, + "grad_norm": 2.149700880050659, + "learning_rate": 5e-06, + "loss": 0.6289, + "mean_token_accuracy": 0.785275936126709, + "num_tokens": 497102156.0, + "step": 19205 + }, + { + "epoch": 2.1091587963979794, + "grad_norm": 2.009772300720215, + "learning_rate": 5e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.7807058691978455, + "num_tokens": 497130827.0, + "step": 19206 + }, + { + "epoch": 2.109268614100593, + "grad_norm": 2.1067428588867188, + "learning_rate": 5e-06, + "loss": 0.6565, + "mean_token_accuracy": 0.7882808446884155, + "num_tokens": 497154729.0, + "step": 19207 + }, + { + "epoch": 2.1093784318032065, + "grad_norm": 2.1112139225006104, + "learning_rate": 5e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.7849140167236328, + "num_tokens": 497178394.0, + "step": 19208 + }, + { + "epoch": 2.1094882495058203, + "grad_norm": 2.2816543579101562, + "learning_rate": 5e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.7761790752410889, + "num_tokens": 497199603.0, + "step": 19209 + }, + { + "epoch": 2.109598067208434, + "grad_norm": 2.1266820430755615, + "learning_rate": 5e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7694348692893982, + "num_tokens": 497224423.0, + "step": 19210 + }, + { + "epoch": 2.1097078849110478, + "grad_norm": 1.9808807373046875, + "learning_rate": 5e-06, + "loss": 0.6683, + "mean_token_accuracy": 0.7803904414176941, + "num_tokens": 497254423.0, + "step": 19211 + }, + { + "epoch": 2.1098177026136615, + "grad_norm": 2.1814827919006348, + "learning_rate": 5e-06, + "loss": 0.6663, + "mean_token_accuracy": 0.7803544402122498, + "num_tokens": 497278333.0, + "step": 19212 + }, + { + "epoch": 2.109927520316275, + "grad_norm": 1.959770679473877, + "learning_rate": 5e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.7633740901947021, + "num_tokens": 497309661.0, + "step": 19213 + }, + { + "epoch": 2.1100373380188886, + "grad_norm": 2.1600871086120605, + "learning_rate": 5e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7553171515464783, + "num_tokens": 497333133.0, + "step": 19214 + }, + { + "epoch": 2.1101471557215024, + "grad_norm": 2.232771396636963, + "learning_rate": 5e-06, + "loss": 0.5875, + "mean_token_accuracy": 0.8012059330940247, + "num_tokens": 497354401.0, + "step": 19215 + }, + { + "epoch": 2.110256973424116, + "grad_norm": 1.9586923122406006, + "learning_rate": 5e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.7770578861236572, + "num_tokens": 497381061.0, + "step": 19216 + }, + { + "epoch": 2.1103667911267294, + "grad_norm": 2.4360289573669434, + "learning_rate": 5e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7682799100875854, + "num_tokens": 497400660.0, + "step": 19217 + }, + { + "epoch": 2.110476608829343, + "grad_norm": 1.911887288093567, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7664915919303894, + "num_tokens": 497430160.0, + "step": 19218 + }, + { + "epoch": 2.110586426531957, + "grad_norm": 2.1078367233276367, + "learning_rate": 5e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7564687728881836, + "num_tokens": 497454911.0, + "step": 19219 + }, + { + "epoch": 2.1106962442345707, + "grad_norm": 1.961108922958374, + "learning_rate": 5e-06, + "loss": 0.6606, + "mean_token_accuracy": 0.7817700505256653, + "num_tokens": 497482286.0, + "step": 19220 + }, + { + "epoch": 2.1108060619371845, + "grad_norm": 2.1134777069091797, + "learning_rate": 5e-06, + "loss": 0.6306, + "mean_token_accuracy": 0.7968236207962036, + "num_tokens": 497505881.0, + "step": 19221 + }, + { + "epoch": 2.1109158796397978, + "grad_norm": 1.8489065170288086, + "learning_rate": 5e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.7549898028373718, + "num_tokens": 497535939.0, + "step": 19222 + }, + { + "epoch": 2.1110256973424115, + "grad_norm": 2.261352777481079, + "learning_rate": 5e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7496891021728516, + "num_tokens": 497562925.0, + "step": 19223 + }, + { + "epoch": 2.1111355150450253, + "grad_norm": 1.8764227628707886, + "learning_rate": 5e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7635420560836792, + "num_tokens": 497592402.0, + "step": 19224 + }, + { + "epoch": 2.111245332747639, + "grad_norm": 2.0304160118103027, + "learning_rate": 5e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7659032344818115, + "num_tokens": 497616836.0, + "step": 19225 + }, + { + "epoch": 2.111355150450253, + "grad_norm": 2.064772367477417, + "learning_rate": 5e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7702067494392395, + "num_tokens": 497644057.0, + "step": 19226 + }, + { + "epoch": 2.111464968152866, + "grad_norm": 2.0312838554382324, + "learning_rate": 5e-06, + "loss": 0.6274, + "mean_token_accuracy": 0.7915367484092712, + "num_tokens": 497668179.0, + "step": 19227 + }, + { + "epoch": 2.11157478585548, + "grad_norm": 2.0854218006134033, + "learning_rate": 5e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.7833576202392578, + "num_tokens": 497692265.0, + "step": 19228 + }, + { + "epoch": 2.1116846035580936, + "grad_norm": 1.804116129875183, + "learning_rate": 5e-06, + "loss": 0.6624, + "mean_token_accuracy": 0.7848479747772217, + "num_tokens": 497724256.0, + "step": 19229 + }, + { + "epoch": 2.1117944212607074, + "grad_norm": 2.2893710136413574, + "learning_rate": 5e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7540860772132874, + "num_tokens": 497747191.0, + "step": 19230 + }, + { + "epoch": 2.1119042389633207, + "grad_norm": 2.0148942470550537, + "learning_rate": 5e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.771479606628418, + "num_tokens": 497775323.0, + "step": 19231 + }, + { + "epoch": 2.1120140566659344, + "grad_norm": 2.0336132049560547, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7581131458282471, + "num_tokens": 497803810.0, + "step": 19232 + }, + { + "epoch": 2.112123874368548, + "grad_norm": 2.349979877471924, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7734171748161316, + "num_tokens": 497826243.0, + "step": 19233 + }, + { + "epoch": 2.112233692071162, + "grad_norm": 1.9052058458328247, + "learning_rate": 5e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.7889778017997742, + "num_tokens": 497853995.0, + "step": 19234 + }, + { + "epoch": 2.1123435097737757, + "grad_norm": 1.9676188230514526, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7742635011672974, + "num_tokens": 497880373.0, + "step": 19235 + }, + { + "epoch": 2.112453327476389, + "grad_norm": 1.8552114963531494, + "learning_rate": 5e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7609120607376099, + "num_tokens": 497911716.0, + "step": 19236 + }, + { + "epoch": 2.112563145179003, + "grad_norm": 2.1313416957855225, + "learning_rate": 5e-06, + "loss": 0.6548, + "mean_token_accuracy": 0.7884255647659302, + "num_tokens": 497934843.0, + "step": 19237 + }, + { + "epoch": 2.1126729628816165, + "grad_norm": 2.0398807525634766, + "learning_rate": 5e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.765993058681488, + "num_tokens": 497960970.0, + "step": 19238 + }, + { + "epoch": 2.1127827805842303, + "grad_norm": 2.030489683151245, + "learning_rate": 5e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.7928909659385681, + "num_tokens": 497989360.0, + "step": 19239 + }, + { + "epoch": 2.1128925982868436, + "grad_norm": 2.098592758178711, + "learning_rate": 5e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7767655849456787, + "num_tokens": 498013556.0, + "step": 19240 + }, + { + "epoch": 2.1130024159894574, + "grad_norm": 2.2229182720184326, + "learning_rate": 5e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.7732094526290894, + "num_tokens": 498035578.0, + "step": 19241 + }, + { + "epoch": 2.113112233692071, + "grad_norm": 1.9267746210098267, + "learning_rate": 5e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7487558126449585, + "num_tokens": 498065211.0, + "step": 19242 + }, + { + "epoch": 2.113222051394685, + "grad_norm": 2.1366608142852783, + "learning_rate": 5e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7535833120346069, + "num_tokens": 498095006.0, + "step": 19243 + }, + { + "epoch": 2.1133318690972986, + "grad_norm": 2.2005703449249268, + "learning_rate": 5e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7573065757751465, + "num_tokens": 498122970.0, + "step": 19244 + }, + { + "epoch": 2.113441686799912, + "grad_norm": 2.1318717002868652, + "learning_rate": 5e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7520049810409546, + "num_tokens": 498148244.0, + "step": 19245 + }, + { + "epoch": 2.1135515045025257, + "grad_norm": 2.054281711578369, + "learning_rate": 5e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.7856445908546448, + "num_tokens": 498175540.0, + "step": 19246 + }, + { + "epoch": 2.1136613222051395, + "grad_norm": 2.1138522624969482, + "learning_rate": 5e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.7675895690917969, + "num_tokens": 498199262.0, + "step": 19247 + }, + { + "epoch": 2.1137711399077532, + "grad_norm": 2.2129077911376953, + "learning_rate": 5e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7375246286392212, + "num_tokens": 498226322.0, + "step": 19248 + }, + { + "epoch": 2.113880957610367, + "grad_norm": 1.9990321397781372, + "learning_rate": 5e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.7890911102294922, + "num_tokens": 498253407.0, + "step": 19249 + }, + { + "epoch": 2.1139907753129803, + "grad_norm": 2.2233481407165527, + "learning_rate": 5e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.7884984016418457, + "num_tokens": 498274533.0, + "step": 19250 + }, + { + "epoch": 2.114100593015594, + "grad_norm": 2.0129411220550537, + "learning_rate": 5e-06, + "loss": 0.7645, + "mean_token_accuracy": 0.7499006986618042, + "num_tokens": 498304180.0, + "step": 19251 + }, + { + "epoch": 2.114210410718208, + "grad_norm": 2.5073912143707275, + "learning_rate": 5e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.7851431369781494, + "num_tokens": 498322850.0, + "step": 19252 + }, + { + "epoch": 2.1143202284208216, + "grad_norm": 2.1225149631500244, + "learning_rate": 5e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.7643287777900696, + "num_tokens": 498346446.0, + "step": 19253 + }, + { + "epoch": 2.1144300461234353, + "grad_norm": 2.4279532432556152, + "learning_rate": 5e-06, + "loss": 0.602, + "mean_token_accuracy": 0.794345498085022, + "num_tokens": 498365351.0, + "step": 19254 + }, + { + "epoch": 2.1145398638260486, + "grad_norm": 2.344015121459961, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7782134413719177, + "num_tokens": 498388217.0, + "step": 19255 + }, + { + "epoch": 2.1146496815286624, + "grad_norm": 2.4211409091949463, + "learning_rate": 5e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.7841704487800598, + "num_tokens": 498407465.0, + "step": 19256 + }, + { + "epoch": 2.114759499231276, + "grad_norm": 2.0584871768951416, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7612422704696655, + "num_tokens": 498437722.0, + "step": 19257 + }, + { + "epoch": 2.11486931693389, + "grad_norm": 1.9878343343734741, + "learning_rate": 5e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7557567954063416, + "num_tokens": 498468317.0, + "step": 19258 + }, + { + "epoch": 2.114979134636503, + "grad_norm": 2.0040054321289062, + "learning_rate": 5e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7578042149543762, + "num_tokens": 498497697.0, + "step": 19259 + }, + { + "epoch": 2.115088952339117, + "grad_norm": 2.049090623855591, + "learning_rate": 5e-06, + "loss": 0.6086, + "mean_token_accuracy": 0.7984460592269897, + "num_tokens": 498522859.0, + "step": 19260 + }, + { + "epoch": 2.1151987700417307, + "grad_norm": 1.8527652025222778, + "learning_rate": 5e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7647101879119873, + "num_tokens": 498551767.0, + "step": 19261 + }, + { + "epoch": 2.1153085877443445, + "grad_norm": 2.2169992923736572, + "learning_rate": 5e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.771049976348877, + "num_tokens": 498576033.0, + "step": 19262 + }, + { + "epoch": 2.1154184054469582, + "grad_norm": 2.039250373840332, + "learning_rate": 5e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7695183753967285, + "num_tokens": 498602912.0, + "step": 19263 + }, + { + "epoch": 2.1155282231495716, + "grad_norm": 1.791357159614563, + "learning_rate": 5e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7534325122833252, + "num_tokens": 498639261.0, + "step": 19264 + }, + { + "epoch": 2.1156380408521853, + "grad_norm": 2.1626250743865967, + "learning_rate": 5e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.7849586009979248, + "num_tokens": 498664242.0, + "step": 19265 + }, + { + "epoch": 2.115747858554799, + "grad_norm": 1.9596312046051025, + "learning_rate": 5e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.7373461723327637, + "num_tokens": 498694953.0, + "step": 19266 + }, + { + "epoch": 2.115857676257413, + "grad_norm": 2.177692174911499, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7859316468238831, + "num_tokens": 498716922.0, + "step": 19267 + }, + { + "epoch": 2.115967493960026, + "grad_norm": 1.752661943435669, + "learning_rate": 5e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.7480925917625427, + "num_tokens": 498751021.0, + "step": 19268 + }, + { + "epoch": 2.11607731166264, + "grad_norm": 1.89691960811615, + "learning_rate": 5e-06, + "loss": 0.6548, + "mean_token_accuracy": 0.7861682176589966, + "num_tokens": 498778425.0, + "step": 19269 + }, + { + "epoch": 2.1161871293652537, + "grad_norm": 1.8016184568405151, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7679687738418579, + "num_tokens": 498811056.0, + "step": 19270 + }, + { + "epoch": 2.1162969470678674, + "grad_norm": 2.1956515312194824, + "learning_rate": 5e-06, + "loss": 0.6395, + "mean_token_accuracy": 0.7856481075286865, + "num_tokens": 498835837.0, + "step": 19271 + }, + { + "epoch": 2.116406764770481, + "grad_norm": 1.8684595823287964, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7731361985206604, + "num_tokens": 498869260.0, + "step": 19272 + }, + { + "epoch": 2.1165165824730945, + "grad_norm": 1.935577154159546, + "learning_rate": 5e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7756600379943848, + "num_tokens": 498898673.0, + "step": 19273 + }, + { + "epoch": 2.1166264001757082, + "grad_norm": 2.1177539825439453, + "learning_rate": 5e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7643609046936035, + "num_tokens": 498925137.0, + "step": 19274 + }, + { + "epoch": 2.116736217878322, + "grad_norm": 1.9945567846298218, + "learning_rate": 5e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7491698265075684, + "num_tokens": 498954182.0, + "step": 19275 + }, + { + "epoch": 2.1168460355809358, + "grad_norm": 2.1191768646240234, + "learning_rate": 5e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7610583305358887, + "num_tokens": 498978783.0, + "step": 19276 + }, + { + "epoch": 2.1169558532835495, + "grad_norm": 2.095501661300659, + "learning_rate": 5e-06, + "loss": 0.5494, + "mean_token_accuracy": 0.8087615370750427, + "num_tokens": 499000865.0, + "step": 19277 + }, + { + "epoch": 2.117065670986163, + "grad_norm": 1.9723402261734009, + "learning_rate": 5e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7715644836425781, + "num_tokens": 499028281.0, + "step": 19278 + }, + { + "epoch": 2.1171754886887766, + "grad_norm": 1.8407150506973267, + "learning_rate": 5e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7650377750396729, + "num_tokens": 499059057.0, + "step": 19279 + }, + { + "epoch": 2.1172853063913903, + "grad_norm": 2.193692445755005, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7747634053230286, + "num_tokens": 499081902.0, + "step": 19280 + }, + { + "epoch": 2.117395124094004, + "grad_norm": 1.9001754522323608, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7710038423538208, + "num_tokens": 499109526.0, + "step": 19281 + }, + { + "epoch": 2.1175049417966174, + "grad_norm": 2.065966844558716, + "learning_rate": 5e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7508129477500916, + "num_tokens": 499136115.0, + "step": 19282 + }, + { + "epoch": 2.117614759499231, + "grad_norm": 2.19175124168396, + "learning_rate": 5e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7579866051673889, + "num_tokens": 499160050.0, + "step": 19283 + }, + { + "epoch": 2.117724577201845, + "grad_norm": 2.2891619205474854, + "learning_rate": 5e-06, + "loss": 0.6319, + "mean_token_accuracy": 0.783146858215332, + "num_tokens": 499183470.0, + "step": 19284 + }, + { + "epoch": 2.1178343949044587, + "grad_norm": 1.8848624229431152, + "learning_rate": 5e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7503097057342529, + "num_tokens": 499212980.0, + "step": 19285 + }, + { + "epoch": 2.1179442126070724, + "grad_norm": 2.3710808753967285, + "learning_rate": 5e-06, + "loss": 0.6196, + "mean_token_accuracy": 0.7939059734344482, + "num_tokens": 499231950.0, + "step": 19286 + }, + { + "epoch": 2.1180540303096858, + "grad_norm": 2.2239935398101807, + "learning_rate": 5e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7659223079681396, + "num_tokens": 499255809.0, + "step": 19287 + }, + { + "epoch": 2.1181638480122995, + "grad_norm": 2.155270576477051, + "learning_rate": 5e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.771138072013855, + "num_tokens": 499280465.0, + "step": 19288 + }, + { + "epoch": 2.1182736657149133, + "grad_norm": 1.9142472743988037, + "learning_rate": 5e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.8016685247421265, + "num_tokens": 499306985.0, + "step": 19289 + }, + { + "epoch": 2.118383483417527, + "grad_norm": 1.8290938138961792, + "learning_rate": 5e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7517627477645874, + "num_tokens": 499338426.0, + "step": 19290 + }, + { + "epoch": 2.118493301120141, + "grad_norm": 1.987946629524231, + "learning_rate": 5e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7686550617218018, + "num_tokens": 499368061.0, + "step": 19291 + }, + { + "epoch": 2.118603118822754, + "grad_norm": 2.0696070194244385, + "learning_rate": 5e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7650413513183594, + "num_tokens": 499395090.0, + "step": 19292 + }, + { + "epoch": 2.118712936525368, + "grad_norm": 2.0628645420074463, + "learning_rate": 5e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7601912021636963, + "num_tokens": 499421823.0, + "step": 19293 + }, + { + "epoch": 2.1188227542279816, + "grad_norm": 2.0498650074005127, + "learning_rate": 5e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7714848518371582, + "num_tokens": 499449753.0, + "step": 19294 + }, + { + "epoch": 2.1189325719305954, + "grad_norm": 2.181319236755371, + "learning_rate": 5e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7697809934616089, + "num_tokens": 499472872.0, + "step": 19295 + }, + { + "epoch": 2.1190423896332087, + "grad_norm": 2.1165125370025635, + "learning_rate": 5e-06, + "loss": 0.6699, + "mean_token_accuracy": 0.7816257476806641, + "num_tokens": 499497850.0, + "step": 19296 + }, + { + "epoch": 2.1191522073358224, + "grad_norm": 2.126523494720459, + "learning_rate": 5e-06, + "loss": 0.6508, + "mean_token_accuracy": 0.7790637016296387, + "num_tokens": 499520580.0, + "step": 19297 + }, + { + "epoch": 2.119262025038436, + "grad_norm": 2.1037681102752686, + "learning_rate": 5e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7625486254692078, + "num_tokens": 499547348.0, + "step": 19298 + }, + { + "epoch": 2.11937184274105, + "grad_norm": 1.9571995735168457, + "learning_rate": 5e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7761361598968506, + "num_tokens": 499573601.0, + "step": 19299 + }, + { + "epoch": 2.1194816604436637, + "grad_norm": 1.986415147781372, + "learning_rate": 5e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7713836431503296, + "num_tokens": 499601117.0, + "step": 19300 + }, + { + "epoch": 2.119591478146277, + "grad_norm": 2.08473539352417, + "learning_rate": 5e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7610336542129517, + "num_tokens": 499626751.0, + "step": 19301 + }, + { + "epoch": 2.1197012958488908, + "grad_norm": 2.067725419998169, + "learning_rate": 5e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7474367618560791, + "num_tokens": 499654976.0, + "step": 19302 + }, + { + "epoch": 2.1198111135515045, + "grad_norm": 2.0586626529693604, + "learning_rate": 5e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.7721619009971619, + "num_tokens": 499682656.0, + "step": 19303 + }, + { + "epoch": 2.1199209312541183, + "grad_norm": 1.851956844329834, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7629560232162476, + "num_tokens": 499713134.0, + "step": 19304 + }, + { + "epoch": 2.120030748956732, + "grad_norm": 1.9674510955810547, + "learning_rate": 5e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.7852705717086792, + "num_tokens": 499741664.0, + "step": 19305 + }, + { + "epoch": 2.1201405666593454, + "grad_norm": 2.072828531265259, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7790821194648743, + "num_tokens": 499768948.0, + "step": 19306 + }, + { + "epoch": 2.120250384361959, + "grad_norm": 2.011582136154175, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7474023103713989, + "num_tokens": 499798510.0, + "step": 19307 + }, + { + "epoch": 2.120360202064573, + "grad_norm": 2.1181437969207764, + "learning_rate": 5e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7600654363632202, + "num_tokens": 499824525.0, + "step": 19308 + }, + { + "epoch": 2.1204700197671866, + "grad_norm": 2.053920269012451, + "learning_rate": 5e-06, + "loss": 0.5947, + "mean_token_accuracy": 0.7951451539993286, + "num_tokens": 499847208.0, + "step": 19309 + }, + { + "epoch": 2.1205798374698, + "grad_norm": 2.17989444732666, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7670099139213562, + "num_tokens": 499871993.0, + "step": 19310 + }, + { + "epoch": 2.1206896551724137, + "grad_norm": 2.0221903324127197, + "learning_rate": 5e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.7844915390014648, + "num_tokens": 499898574.0, + "step": 19311 + }, + { + "epoch": 2.1207994728750275, + "grad_norm": 1.9568818807601929, + "learning_rate": 5e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7521916031837463, + "num_tokens": 499928556.0, + "step": 19312 + }, + { + "epoch": 2.120909290577641, + "grad_norm": 2.108607530593872, + "learning_rate": 5e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7764065861701965, + "num_tokens": 499954633.0, + "step": 19313 + }, + { + "epoch": 2.121019108280255, + "grad_norm": 2.499239921569824, + "learning_rate": 5e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.7892107963562012, + "num_tokens": 499973372.0, + "step": 19314 + }, + { + "epoch": 2.1211289259828683, + "grad_norm": 2.254343271255493, + "learning_rate": 5e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7411097884178162, + "num_tokens": 499997325.0, + "step": 19315 + }, + { + "epoch": 2.121238743685482, + "grad_norm": 2.001816987991333, + "learning_rate": 5e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7635598182678223, + "num_tokens": 500024069.0, + "step": 19316 + }, + { + "epoch": 2.121348561388096, + "grad_norm": 2.206756353378296, + "learning_rate": 5e-06, + "loss": 0.6955, + "mean_token_accuracy": 0.7659091353416443, + "num_tokens": 500048511.0, + "step": 19317 + }, + { + "epoch": 2.1214583790907096, + "grad_norm": 2.117969512939453, + "learning_rate": 5e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.7858615517616272, + "num_tokens": 500073887.0, + "step": 19318 + }, + { + "epoch": 2.121568196793323, + "grad_norm": 2.188724994659424, + "learning_rate": 5e-06, + "loss": 0.6625, + "mean_token_accuracy": 0.7809650897979736, + "num_tokens": 500099488.0, + "step": 19319 + }, + { + "epoch": 2.1216780144959366, + "grad_norm": 2.0002801418304443, + "learning_rate": 5e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7738093137741089, + "num_tokens": 500126521.0, + "step": 19320 + }, + { + "epoch": 2.1217878321985504, + "grad_norm": 2.0316128730773926, + "learning_rate": 5e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.7752567529678345, + "num_tokens": 500153456.0, + "step": 19321 + }, + { + "epoch": 2.121897649901164, + "grad_norm": 2.00693941116333, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7857905626296997, + "num_tokens": 500177820.0, + "step": 19322 + }, + { + "epoch": 2.122007467603778, + "grad_norm": 2.033811330795288, + "learning_rate": 5e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7571458220481873, + "num_tokens": 500205000.0, + "step": 19323 + }, + { + "epoch": 2.122117285306391, + "grad_norm": 2.243522882461548, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7769997119903564, + "num_tokens": 500226648.0, + "step": 19324 + }, + { + "epoch": 2.122227103009005, + "grad_norm": 2.4277522563934326, + "learning_rate": 5e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.7795076370239258, + "num_tokens": 500247732.0, + "step": 19325 + }, + { + "epoch": 2.1223369207116187, + "grad_norm": 1.9996602535247803, + "learning_rate": 5e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.7843842506408691, + "num_tokens": 500276028.0, + "step": 19326 + }, + { + "epoch": 2.1224467384142325, + "grad_norm": 2.4332871437072754, + "learning_rate": 5e-06, + "loss": 0.637, + "mean_token_accuracy": 0.7871683835983276, + "num_tokens": 500295927.0, + "step": 19327 + }, + { + "epoch": 2.1225565561168462, + "grad_norm": 2.1283457279205322, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7674052119255066, + "num_tokens": 500322805.0, + "step": 19328 + }, + { + "epoch": 2.1226663738194596, + "grad_norm": 1.9207286834716797, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7652778625488281, + "num_tokens": 500351200.0, + "step": 19329 + }, + { + "epoch": 2.1227761915220733, + "grad_norm": 1.9374897480010986, + "learning_rate": 5e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7719724774360657, + "num_tokens": 500380414.0, + "step": 19330 + }, + { + "epoch": 2.122886009224687, + "grad_norm": 2.186779499053955, + "learning_rate": 5e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.7838349342346191, + "num_tokens": 500404422.0, + "step": 19331 + }, + { + "epoch": 2.122995826927301, + "grad_norm": 2.110772132873535, + "learning_rate": 5e-06, + "loss": 0.7547, + "mean_token_accuracy": 0.7567532062530518, + "num_tokens": 500430884.0, + "step": 19332 + }, + { + "epoch": 2.123105644629914, + "grad_norm": 2.169736623764038, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7447010278701782, + "num_tokens": 500459697.0, + "step": 19333 + }, + { + "epoch": 2.123215462332528, + "grad_norm": 2.273266553878784, + "learning_rate": 5e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.783643364906311, + "num_tokens": 500481673.0, + "step": 19334 + }, + { + "epoch": 2.1233252800351416, + "grad_norm": 1.9571497440338135, + "learning_rate": 5e-06, + "loss": 0.641, + "mean_token_accuracy": 0.7836373448371887, + "num_tokens": 500510023.0, + "step": 19335 + }, + { + "epoch": 2.1234350977377554, + "grad_norm": 2.280707359313965, + "learning_rate": 5e-06, + "loss": 0.6491, + "mean_token_accuracy": 0.7799155712127686, + "num_tokens": 500532226.0, + "step": 19336 + }, + { + "epoch": 2.123544915440369, + "grad_norm": 2.1014645099639893, + "learning_rate": 5e-06, + "loss": 0.6523, + "mean_token_accuracy": 0.7866573333740234, + "num_tokens": 500556603.0, + "step": 19337 + }, + { + "epoch": 2.1236547331429825, + "grad_norm": 2.0131916999816895, + "learning_rate": 5e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.787845253944397, + "num_tokens": 500584722.0, + "step": 19338 + }, + { + "epoch": 2.1237645508455962, + "grad_norm": 1.8669517040252686, + "learning_rate": 5e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.7637385129928589, + "num_tokens": 500614193.0, + "step": 19339 + }, + { + "epoch": 2.12387436854821, + "grad_norm": 1.9273219108581543, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.767023503780365, + "num_tokens": 500641607.0, + "step": 19340 + }, + { + "epoch": 2.1239841862508237, + "grad_norm": 2.2280030250549316, + "learning_rate": 5e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7649656534194946, + "num_tokens": 500664903.0, + "step": 19341 + }, + { + "epoch": 2.1240940039534375, + "grad_norm": 2.1255197525024414, + "learning_rate": 5e-06, + "loss": 0.6779, + "mean_token_accuracy": 0.7769362330436707, + "num_tokens": 500689457.0, + "step": 19342 + }, + { + "epoch": 2.124203821656051, + "grad_norm": 2.1981074810028076, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7805193066596985, + "num_tokens": 500715050.0, + "step": 19343 + }, + { + "epoch": 2.1243136393586646, + "grad_norm": 1.9092347621917725, + "learning_rate": 5e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7645750045776367, + "num_tokens": 500745467.0, + "step": 19344 + }, + { + "epoch": 2.1244234570612783, + "grad_norm": 2.1576170921325684, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.762506902217865, + "num_tokens": 500769771.0, + "step": 19345 + }, + { + "epoch": 2.124533274763892, + "grad_norm": 1.8719604015350342, + "learning_rate": 5e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.753287136554718, + "num_tokens": 500799855.0, + "step": 19346 + }, + { + "epoch": 2.1246430924665054, + "grad_norm": 1.9942724704742432, + "learning_rate": 5e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.7862767577171326, + "num_tokens": 500829880.0, + "step": 19347 + }, + { + "epoch": 2.124752910169119, + "grad_norm": 1.8748161792755127, + "learning_rate": 5e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.746598482131958, + "num_tokens": 500865470.0, + "step": 19348 + }, + { + "epoch": 2.124862727871733, + "grad_norm": 2.099174737930298, + "learning_rate": 5e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.7811897993087769, + "num_tokens": 500892530.0, + "step": 19349 + }, + { + "epoch": 2.1249725455743467, + "grad_norm": 2.1100618839263916, + "learning_rate": 5e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.773174524307251, + "num_tokens": 500918750.0, + "step": 19350 + }, + { + "epoch": 2.1250823632769604, + "grad_norm": 2.226090669631958, + "learning_rate": 5e-06, + "loss": 0.7368, + "mean_token_accuracy": 0.7641275525093079, + "num_tokens": 500942917.0, + "step": 19351 + }, + { + "epoch": 2.1251921809795737, + "grad_norm": 2.2226369380950928, + "learning_rate": 5e-06, + "loss": 0.6296, + "mean_token_accuracy": 0.7865421772003174, + "num_tokens": 500964473.0, + "step": 19352 + }, + { + "epoch": 2.1253019986821875, + "grad_norm": 1.8245328664779663, + "learning_rate": 5e-06, + "loss": 0.699, + "mean_token_accuracy": 0.7666786909103394, + "num_tokens": 500996406.0, + "step": 19353 + }, + { + "epoch": 2.1254118163848013, + "grad_norm": 2.013533592224121, + "learning_rate": 5e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7728433012962341, + "num_tokens": 501023612.0, + "step": 19354 + }, + { + "epoch": 2.125521634087415, + "grad_norm": 2.0942189693450928, + "learning_rate": 5e-06, + "loss": 0.6511, + "mean_token_accuracy": 0.782446026802063, + "num_tokens": 501045337.0, + "step": 19355 + }, + { + "epoch": 2.1256314517900288, + "grad_norm": 2.273573398590088, + "learning_rate": 5e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7662873864173889, + "num_tokens": 501068700.0, + "step": 19356 + }, + { + "epoch": 2.125741269492642, + "grad_norm": 2.0802433490753174, + "learning_rate": 5e-06, + "loss": 0.6633, + "mean_token_accuracy": 0.7839049100875854, + "num_tokens": 501093287.0, + "step": 19357 + }, + { + "epoch": 2.125851087195256, + "grad_norm": 1.8576264381408691, + "learning_rate": 5e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7573217749595642, + "num_tokens": 501125986.0, + "step": 19358 + }, + { + "epoch": 2.1259609048978696, + "grad_norm": 2.0659921169281006, + "learning_rate": 5e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7628493905067444, + "num_tokens": 501153491.0, + "step": 19359 + }, + { + "epoch": 2.1260707226004834, + "grad_norm": 1.8860465288162231, + "learning_rate": 5e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7581386566162109, + "num_tokens": 501181590.0, + "step": 19360 + }, + { + "epoch": 2.1261805403030967, + "grad_norm": 2.190511465072632, + "learning_rate": 5e-06, + "loss": 0.6336, + "mean_token_accuracy": 0.7844787836074829, + "num_tokens": 501203872.0, + "step": 19361 + }, + { + "epoch": 2.1262903580057104, + "grad_norm": 2.084973096847534, + "learning_rate": 5e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.7778773307800293, + "num_tokens": 501230537.0, + "step": 19362 + }, + { + "epoch": 2.126400175708324, + "grad_norm": 2.069195508956909, + "learning_rate": 5e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.756976842880249, + "num_tokens": 501257038.0, + "step": 19363 + }, + { + "epoch": 2.126509993410938, + "grad_norm": 2.1416523456573486, + "learning_rate": 5e-06, + "loss": 0.6442, + "mean_token_accuracy": 0.7862930297851562, + "num_tokens": 501283621.0, + "step": 19364 + }, + { + "epoch": 2.1266198111135517, + "grad_norm": 2.36065411567688, + "learning_rate": 5e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.7833681106567383, + "num_tokens": 501304573.0, + "step": 19365 + }, + { + "epoch": 2.126729628816165, + "grad_norm": 2.1823995113372803, + "learning_rate": 5e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7687571048736572, + "num_tokens": 501328400.0, + "step": 19366 + }, + { + "epoch": 2.1268394465187788, + "grad_norm": 2.3434741497039795, + "learning_rate": 5e-06, + "loss": 0.6488, + "mean_token_accuracy": 0.783338189125061, + "num_tokens": 501349392.0, + "step": 19367 + }, + { + "epoch": 2.1269492642213925, + "grad_norm": 2.2288148403167725, + "learning_rate": 5e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.775609016418457, + "num_tokens": 501370831.0, + "step": 19368 + }, + { + "epoch": 2.1270590819240063, + "grad_norm": 1.9875805377960205, + "learning_rate": 5e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.779165506362915, + "num_tokens": 501398782.0, + "step": 19369 + }, + { + "epoch": 2.1271688996266196, + "grad_norm": 2.167182445526123, + "learning_rate": 5e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7596032619476318, + "num_tokens": 501422259.0, + "step": 19370 + }, + { + "epoch": 2.1272787173292333, + "grad_norm": 2.08955979347229, + "learning_rate": 5e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.7806498408317566, + "num_tokens": 501451207.0, + "step": 19371 + }, + { + "epoch": 2.127388535031847, + "grad_norm": 1.9029455184936523, + "learning_rate": 5e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7521683573722839, + "num_tokens": 501481201.0, + "step": 19372 + }, + { + "epoch": 2.127498352734461, + "grad_norm": 2.115663766860962, + "learning_rate": 5e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.768791913986206, + "num_tokens": 501506005.0, + "step": 19373 + }, + { + "epoch": 2.1276081704370746, + "grad_norm": 2.244354009628296, + "learning_rate": 5e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7647225856781006, + "num_tokens": 501530211.0, + "step": 19374 + }, + { + "epoch": 2.127717988139688, + "grad_norm": 2.105788469314575, + "learning_rate": 5e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7725428342819214, + "num_tokens": 501560085.0, + "step": 19375 + }, + { + "epoch": 2.1278278058423017, + "grad_norm": 2.5112414360046387, + "learning_rate": 5e-06, + "loss": 0.6469, + "mean_token_accuracy": 0.7866789102554321, + "num_tokens": 501578784.0, + "step": 19376 + }, + { + "epoch": 2.1279376235449154, + "grad_norm": 1.9205782413482666, + "learning_rate": 5e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.7683967351913452, + "num_tokens": 501607683.0, + "step": 19377 + }, + { + "epoch": 2.128047441247529, + "grad_norm": 2.360941171646118, + "learning_rate": 5e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7689381837844849, + "num_tokens": 501629576.0, + "step": 19378 + }, + { + "epoch": 2.128157258950143, + "grad_norm": 2.2663302421569824, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7537670135498047, + "num_tokens": 501654569.0, + "step": 19379 + }, + { + "epoch": 2.1282670766527563, + "grad_norm": 2.070375919342041, + "learning_rate": 5e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7634056806564331, + "num_tokens": 501680278.0, + "step": 19380 + }, + { + "epoch": 2.12837689435537, + "grad_norm": 2.5144128799438477, + "learning_rate": 5e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7669965028762817, + "num_tokens": 501700920.0, + "step": 19381 + }, + { + "epoch": 2.128486712057984, + "grad_norm": 2.1282758712768555, + "learning_rate": 5e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.75795978307724, + "num_tokens": 501725645.0, + "step": 19382 + }, + { + "epoch": 2.1285965297605975, + "grad_norm": 1.9188562631607056, + "learning_rate": 5e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7626173496246338, + "num_tokens": 501754707.0, + "step": 19383 + }, + { + "epoch": 2.1287063474632113, + "grad_norm": 2.033109664916992, + "learning_rate": 5e-06, + "loss": 0.6477, + "mean_token_accuracy": 0.7817701101303101, + "num_tokens": 501781737.0, + "step": 19384 + }, + { + "epoch": 2.1288161651658246, + "grad_norm": 2.0303139686584473, + "learning_rate": 5e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.768247127532959, + "num_tokens": 501810739.0, + "step": 19385 + }, + { + "epoch": 2.1289259828684384, + "grad_norm": 2.116560697555542, + "learning_rate": 5e-06, + "loss": 0.6922, + "mean_token_accuracy": 0.7791581749916077, + "num_tokens": 501835623.0, + "step": 19386 + }, + { + "epoch": 2.129035800571052, + "grad_norm": 2.132634162902832, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7837560176849365, + "num_tokens": 501859092.0, + "step": 19387 + }, + { + "epoch": 2.129145618273666, + "grad_norm": 1.9744460582733154, + "learning_rate": 5e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7799003720283508, + "num_tokens": 501886989.0, + "step": 19388 + }, + { + "epoch": 2.129255435976279, + "grad_norm": 2.0414373874664307, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7705501914024353, + "num_tokens": 501913740.0, + "step": 19389 + }, + { + "epoch": 2.129365253678893, + "grad_norm": 1.9922655820846558, + "learning_rate": 5e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.761978268623352, + "num_tokens": 501943738.0, + "step": 19390 + }, + { + "epoch": 2.1294750713815067, + "grad_norm": 2.1400671005249023, + "learning_rate": 5e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7597508430480957, + "num_tokens": 501968819.0, + "step": 19391 + }, + { + "epoch": 2.1295848890841205, + "grad_norm": 2.051180124282837, + "learning_rate": 5e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.7863773107528687, + "num_tokens": 501994103.0, + "step": 19392 + }, + { + "epoch": 2.1296947067867342, + "grad_norm": 1.9824440479278564, + "learning_rate": 5e-06, + "loss": 0.6478, + "mean_token_accuracy": 0.7885237336158752, + "num_tokens": 502020462.0, + "step": 19393 + }, + { + "epoch": 2.1298045244893475, + "grad_norm": 2.2138521671295166, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7532743811607361, + "num_tokens": 502046894.0, + "step": 19394 + }, + { + "epoch": 2.1299143421919613, + "grad_norm": 2.0933849811553955, + "learning_rate": 5e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7663685083389282, + "num_tokens": 502072027.0, + "step": 19395 + }, + { + "epoch": 2.130024159894575, + "grad_norm": 2.284095048904419, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.754698634147644, + "num_tokens": 502095748.0, + "step": 19396 + }, + { + "epoch": 2.130133977597189, + "grad_norm": 2.3911521434783936, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7760595083236694, + "num_tokens": 502114504.0, + "step": 19397 + }, + { + "epoch": 2.130243795299802, + "grad_norm": 2.267573595046997, + "learning_rate": 5e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7620968222618103, + "num_tokens": 502136155.0, + "step": 19398 + }, + { + "epoch": 2.130353613002416, + "grad_norm": 2.174532413482666, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.776275634765625, + "num_tokens": 502158986.0, + "step": 19399 + }, + { + "epoch": 2.1304634307050296, + "grad_norm": 2.3451881408691406, + "learning_rate": 5e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7592868804931641, + "num_tokens": 502181600.0, + "step": 19400 + }, + { + "epoch": 2.1305732484076434, + "grad_norm": 2.517658233642578, + "learning_rate": 5e-06, + "loss": 0.677, + "mean_token_accuracy": 0.7772473096847534, + "num_tokens": 502200349.0, + "step": 19401 + }, + { + "epoch": 2.130683066110257, + "grad_norm": 2.1039273738861084, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7756639719009399, + "num_tokens": 502227181.0, + "step": 19402 + }, + { + "epoch": 2.1307928838128705, + "grad_norm": 2.267874240875244, + "learning_rate": 5e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.7752929329872131, + "num_tokens": 502251340.0, + "step": 19403 + }, + { + "epoch": 2.130902701515484, + "grad_norm": 2.2177722454071045, + "learning_rate": 5e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7725360989570618, + "num_tokens": 502277421.0, + "step": 19404 + }, + { + "epoch": 2.131012519218098, + "grad_norm": 2.585624933242798, + "learning_rate": 5e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.7987116575241089, + "num_tokens": 502296247.0, + "step": 19405 + }, + { + "epoch": 2.1311223369207117, + "grad_norm": 2.0208609104156494, + "learning_rate": 5e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7785589694976807, + "num_tokens": 502322552.0, + "step": 19406 + }, + { + "epoch": 2.1312321546233255, + "grad_norm": 2.197226047515869, + "learning_rate": 5e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7715466022491455, + "num_tokens": 502349916.0, + "step": 19407 + }, + { + "epoch": 2.131341972325939, + "grad_norm": 2.1264073848724365, + "learning_rate": 5e-06, + "loss": 0.645, + "mean_token_accuracy": 0.7860351800918579, + "num_tokens": 502374567.0, + "step": 19408 + }, + { + "epoch": 2.1314517900285526, + "grad_norm": 2.251753807067871, + "learning_rate": 5e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.781862735748291, + "num_tokens": 502397482.0, + "step": 19409 + }, + { + "epoch": 2.1315616077311663, + "grad_norm": 2.0748379230499268, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7688747048377991, + "num_tokens": 502423833.0, + "step": 19410 + }, + { + "epoch": 2.13167142543378, + "grad_norm": 1.9865708351135254, + "learning_rate": 5e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7675454616546631, + "num_tokens": 502452080.0, + "step": 19411 + }, + { + "epoch": 2.1317812431363934, + "grad_norm": 2.2410168647766113, + "learning_rate": 5e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.7837487459182739, + "num_tokens": 502473862.0, + "step": 19412 + }, + { + "epoch": 2.131891060839007, + "grad_norm": 2.296469211578369, + "learning_rate": 5e-06, + "loss": 0.6064, + "mean_token_accuracy": 0.7911112308502197, + "num_tokens": 502496124.0, + "step": 19413 + }, + { + "epoch": 2.132000878541621, + "grad_norm": 2.235896110534668, + "learning_rate": 5e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7567700147628784, + "num_tokens": 502519151.0, + "step": 19414 + }, + { + "epoch": 2.1321106962442347, + "grad_norm": 2.1898229122161865, + "learning_rate": 5e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7560937404632568, + "num_tokens": 502542510.0, + "step": 19415 + }, + { + "epoch": 2.1322205139468484, + "grad_norm": 2.444164276123047, + "learning_rate": 5e-06, + "loss": 0.6395, + "mean_token_accuracy": 0.7841598987579346, + "num_tokens": 502561651.0, + "step": 19416 + }, + { + "epoch": 2.1323303316494617, + "grad_norm": 1.9920077323913574, + "learning_rate": 5e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.767008364200592, + "num_tokens": 502589570.0, + "step": 19417 + }, + { + "epoch": 2.1324401493520755, + "grad_norm": 2.0557661056518555, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.764844536781311, + "num_tokens": 502618649.0, + "step": 19418 + }, + { + "epoch": 2.1325499670546892, + "grad_norm": 1.9786540269851685, + "learning_rate": 5e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7693830728530884, + "num_tokens": 502645532.0, + "step": 19419 + }, + { + "epoch": 2.132659784757303, + "grad_norm": 2.0091683864593506, + "learning_rate": 5e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7593450546264648, + "num_tokens": 502673060.0, + "step": 19420 + }, + { + "epoch": 2.1327696024599163, + "grad_norm": 2.474600315093994, + "learning_rate": 5e-06, + "loss": 0.6139, + "mean_token_accuracy": 0.7904229760169983, + "num_tokens": 502691221.0, + "step": 19421 + }, + { + "epoch": 2.13287942016253, + "grad_norm": 2.0277998447418213, + "learning_rate": 5e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7587842345237732, + "num_tokens": 502723004.0, + "step": 19422 + }, + { + "epoch": 2.132989237865144, + "grad_norm": 2.2606356143951416, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7715232372283936, + "num_tokens": 502746701.0, + "step": 19423 + }, + { + "epoch": 2.1330990555677576, + "grad_norm": 1.8956621885299683, + "learning_rate": 5e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7529704570770264, + "num_tokens": 502776550.0, + "step": 19424 + }, + { + "epoch": 2.1332088732703713, + "grad_norm": 2.147094964981079, + "learning_rate": 5e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.7735722064971924, + "num_tokens": 502800843.0, + "step": 19425 + }, + { + "epoch": 2.1333186909729847, + "grad_norm": 2.135836601257324, + "learning_rate": 5e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7530676126480103, + "num_tokens": 502829483.0, + "step": 19426 + }, + { + "epoch": 2.1334285086755984, + "grad_norm": 2.131976842880249, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7556354999542236, + "num_tokens": 502856478.0, + "step": 19427 + }, + { + "epoch": 2.133538326378212, + "grad_norm": 2.0988590717315674, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7589011192321777, + "num_tokens": 502881800.0, + "step": 19428 + }, + { + "epoch": 2.133648144080826, + "grad_norm": 2.190484046936035, + "learning_rate": 5e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7693433165550232, + "num_tokens": 502906361.0, + "step": 19429 + }, + { + "epoch": 2.1337579617834397, + "grad_norm": 2.1180591583251953, + "learning_rate": 5e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.7557297348976135, + "num_tokens": 502932578.0, + "step": 19430 + }, + { + "epoch": 2.133867779486053, + "grad_norm": 1.7963709831237793, + "learning_rate": 5e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7571736574172974, + "num_tokens": 502963959.0, + "step": 19431 + }, + { + "epoch": 2.1339775971886668, + "grad_norm": 2.530815362930298, + "learning_rate": 5e-06, + "loss": 0.5965, + "mean_token_accuracy": 0.7974167466163635, + "num_tokens": 502983970.0, + "step": 19432 + }, + { + "epoch": 2.1340874148912805, + "grad_norm": 2.1713972091674805, + "learning_rate": 5e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.7463289499282837, + "num_tokens": 503010293.0, + "step": 19433 + }, + { + "epoch": 2.1341972325938943, + "grad_norm": 2.0821778774261475, + "learning_rate": 5e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7567965984344482, + "num_tokens": 503038919.0, + "step": 19434 + }, + { + "epoch": 2.134307050296508, + "grad_norm": 1.8078454732894897, + "learning_rate": 5e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7622269988059998, + "num_tokens": 503072245.0, + "step": 19435 + }, + { + "epoch": 2.1344168679991213, + "grad_norm": 2.128042459487915, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.770048975944519, + "num_tokens": 503095860.0, + "step": 19436 + }, + { + "epoch": 2.134526685701735, + "grad_norm": 2.124342918395996, + "learning_rate": 5e-06, + "loss": 0.5973, + "mean_token_accuracy": 0.8000860214233398, + "num_tokens": 503119035.0, + "step": 19437 + }, + { + "epoch": 2.134636503404349, + "grad_norm": 2.4438135623931885, + "learning_rate": 5e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.7788362503051758, + "num_tokens": 503137383.0, + "step": 19438 + }, + { + "epoch": 2.1347463211069626, + "grad_norm": 2.308912754058838, + "learning_rate": 5e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.7702322006225586, + "num_tokens": 503159148.0, + "step": 19439 + }, + { + "epoch": 2.134856138809576, + "grad_norm": 2.0977516174316406, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7587862610816956, + "num_tokens": 503185777.0, + "step": 19440 + }, + { + "epoch": 2.1349659565121897, + "grad_norm": 2.189990282058716, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7638503313064575, + "num_tokens": 503210075.0, + "step": 19441 + }, + { + "epoch": 2.1350757742148034, + "grad_norm": 2.2965495586395264, + "learning_rate": 5e-06, + "loss": 0.6373, + "mean_token_accuracy": 0.7865546941757202, + "num_tokens": 503232057.0, + "step": 19442 + }, + { + "epoch": 2.135185591917417, + "grad_norm": 1.9584108591079712, + "learning_rate": 5e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7694706916809082, + "num_tokens": 503259137.0, + "step": 19443 + }, + { + "epoch": 2.135295409620031, + "grad_norm": 2.089278221130371, + "learning_rate": 5e-06, + "loss": 0.6477, + "mean_token_accuracy": 0.8048012256622314, + "num_tokens": 503286489.0, + "step": 19444 + }, + { + "epoch": 2.1354052273226443, + "grad_norm": 1.9996050596237183, + "learning_rate": 5e-06, + "loss": 0.6865, + "mean_token_accuracy": 0.7766596078872681, + "num_tokens": 503313184.0, + "step": 19445 + }, + { + "epoch": 2.135515045025258, + "grad_norm": 2.174811840057373, + "learning_rate": 5e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.7784653902053833, + "num_tokens": 503336789.0, + "step": 19446 + }, + { + "epoch": 2.1356248627278718, + "grad_norm": 1.9231650829315186, + "learning_rate": 5e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7591050863265991, + "num_tokens": 503370045.0, + "step": 19447 + }, + { + "epoch": 2.1357346804304855, + "grad_norm": 2.34822940826416, + "learning_rate": 5e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7506446242332458, + "num_tokens": 503393708.0, + "step": 19448 + }, + { + "epoch": 2.135844498133099, + "grad_norm": 2.055511474609375, + "learning_rate": 5e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7770543098449707, + "num_tokens": 503419696.0, + "step": 19449 + }, + { + "epoch": 2.1359543158357126, + "grad_norm": 2.364553689956665, + "learning_rate": 5e-06, + "loss": 0.6249, + "mean_token_accuracy": 0.7914401888847351, + "num_tokens": 503439026.0, + "step": 19450 + }, + { + "epoch": 2.1360641335383264, + "grad_norm": 2.061969518661499, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7482144832611084, + "num_tokens": 503468414.0, + "step": 19451 + }, + { + "epoch": 2.13617395124094, + "grad_norm": 1.986210584640503, + "learning_rate": 5e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.7793251276016235, + "num_tokens": 503495217.0, + "step": 19452 + }, + { + "epoch": 2.136283768943554, + "grad_norm": 1.72670316696167, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.767315149307251, + "num_tokens": 503531605.0, + "step": 19453 + }, + { + "epoch": 2.136393586646167, + "grad_norm": 1.9095745086669922, + "learning_rate": 5e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.7753574252128601, + "num_tokens": 503560560.0, + "step": 19454 + }, + { + "epoch": 2.136503404348781, + "grad_norm": 2.1949427127838135, + "learning_rate": 5e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.7776165008544922, + "num_tokens": 503586179.0, + "step": 19455 + }, + { + "epoch": 2.1366132220513947, + "grad_norm": 2.060976266860962, + "learning_rate": 5e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.7769515514373779, + "num_tokens": 503610830.0, + "step": 19456 + }, + { + "epoch": 2.1367230397540085, + "grad_norm": 1.8048194646835327, + "learning_rate": 5e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7489705085754395, + "num_tokens": 503647148.0, + "step": 19457 + }, + { + "epoch": 2.136832857456622, + "grad_norm": 2.2320010662078857, + "learning_rate": 5e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7660443782806396, + "num_tokens": 503670090.0, + "step": 19458 + }, + { + "epoch": 2.1369426751592355, + "grad_norm": 1.9353467226028442, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7584981918334961, + "num_tokens": 503698821.0, + "step": 19459 + }, + { + "epoch": 2.1370524928618493, + "grad_norm": 2.1881351470947266, + "learning_rate": 5e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7754852771759033, + "num_tokens": 503722219.0, + "step": 19460 + }, + { + "epoch": 2.137162310564463, + "grad_norm": 2.2033703327178955, + "learning_rate": 5e-06, + "loss": 0.5909, + "mean_token_accuracy": 0.7995752096176147, + "num_tokens": 503743442.0, + "step": 19461 + }, + { + "epoch": 2.137272128267077, + "grad_norm": 2.184323310852051, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7579345107078552, + "num_tokens": 503768079.0, + "step": 19462 + }, + { + "epoch": 2.13738194596969, + "grad_norm": 1.8625798225402832, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7591438889503479, + "num_tokens": 503799906.0, + "step": 19463 + }, + { + "epoch": 2.137491763672304, + "grad_norm": 2.1802961826324463, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7576499581336975, + "num_tokens": 503825609.0, + "step": 19464 + }, + { + "epoch": 2.1376015813749176, + "grad_norm": 2.214869499206543, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7747388482093811, + "num_tokens": 503850060.0, + "step": 19465 + }, + { + "epoch": 2.1377113990775314, + "grad_norm": 2.160329818725586, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7698647975921631, + "num_tokens": 503874478.0, + "step": 19466 + }, + { + "epoch": 2.137821216780145, + "grad_norm": 2.1957685947418213, + "learning_rate": 5e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.7799087762832642, + "num_tokens": 503897537.0, + "step": 19467 + }, + { + "epoch": 2.1379310344827585, + "grad_norm": 2.0493812561035156, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7479467391967773, + "num_tokens": 503928411.0, + "step": 19468 + }, + { + "epoch": 2.138040852185372, + "grad_norm": 2.1656317710876465, + "learning_rate": 5e-06, + "loss": 0.6511, + "mean_token_accuracy": 0.7822554111480713, + "num_tokens": 503952188.0, + "step": 19469 + }, + { + "epoch": 2.138150669887986, + "grad_norm": 2.160991668701172, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7693705558776855, + "num_tokens": 503977889.0, + "step": 19470 + }, + { + "epoch": 2.1382604875905997, + "grad_norm": 2.105684518814087, + "learning_rate": 5e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.7735542058944702, + "num_tokens": 504003850.0, + "step": 19471 + }, + { + "epoch": 2.138370305293213, + "grad_norm": 2.0619313716888428, + "learning_rate": 5e-06, + "loss": 0.762, + "mean_token_accuracy": 0.7609503269195557, + "num_tokens": 504030608.0, + "step": 19472 + }, + { + "epoch": 2.138480122995827, + "grad_norm": 1.9419037103652954, + "learning_rate": 5e-06, + "loss": 0.6616, + "mean_token_accuracy": 0.7817869186401367, + "num_tokens": 504058508.0, + "step": 19473 + }, + { + "epoch": 2.1385899406984406, + "grad_norm": 1.8287770748138428, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7549475431442261, + "num_tokens": 504089847.0, + "step": 19474 + }, + { + "epoch": 2.1386997584010543, + "grad_norm": 2.1017868518829346, + "learning_rate": 5e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.7653100490570068, + "num_tokens": 504116384.0, + "step": 19475 + }, + { + "epoch": 2.138809576103668, + "grad_norm": 2.1399965286254883, + "learning_rate": 5e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.7956748604774475, + "num_tokens": 504142746.0, + "step": 19476 + }, + { + "epoch": 2.1389193938062814, + "grad_norm": 2.1939544677734375, + "learning_rate": 5e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.7753756046295166, + "num_tokens": 504164532.0, + "step": 19477 + }, + { + "epoch": 2.139029211508895, + "grad_norm": 2.1285698413848877, + "learning_rate": 5e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7629472613334656, + "num_tokens": 504188961.0, + "step": 19478 + }, + { + "epoch": 2.139139029211509, + "grad_norm": 2.144713878631592, + "learning_rate": 5e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7686296701431274, + "num_tokens": 504213195.0, + "step": 19479 + }, + { + "epoch": 2.1392488469141226, + "grad_norm": 2.0726284980773926, + "learning_rate": 5e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.783679187297821, + "num_tokens": 504236841.0, + "step": 19480 + }, + { + "epoch": 2.1393586646167364, + "grad_norm": 2.2441914081573486, + "learning_rate": 5e-06, + "loss": 0.594, + "mean_token_accuracy": 0.8003406524658203, + "num_tokens": 504258790.0, + "step": 19481 + }, + { + "epoch": 2.1394684823193497, + "grad_norm": 2.262255907058716, + "learning_rate": 5e-06, + "loss": 0.6331, + "mean_token_accuracy": 0.7855522036552429, + "num_tokens": 504280309.0, + "step": 19482 + }, + { + "epoch": 2.1395783000219635, + "grad_norm": 2.146378517150879, + "learning_rate": 5e-06, + "loss": 0.6218, + "mean_token_accuracy": 0.7926836013793945, + "num_tokens": 504303201.0, + "step": 19483 + }, + { + "epoch": 2.1396881177245772, + "grad_norm": 2.0503153800964355, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7649520635604858, + "num_tokens": 504329711.0, + "step": 19484 + }, + { + "epoch": 2.139797935427191, + "grad_norm": 2.3258450031280518, + "learning_rate": 5e-06, + "loss": 0.679, + "mean_token_accuracy": 0.779128909111023, + "num_tokens": 504350266.0, + "step": 19485 + }, + { + "epoch": 2.1399077531298047, + "grad_norm": 2.52074933052063, + "learning_rate": 5e-06, + "loss": 0.5603, + "mean_token_accuracy": 0.8067080974578857, + "num_tokens": 504366652.0, + "step": 19486 + }, + { + "epoch": 2.140017570832418, + "grad_norm": 2.011176824569702, + "learning_rate": 5e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7733331918716431, + "num_tokens": 504393793.0, + "step": 19487 + }, + { + "epoch": 2.140127388535032, + "grad_norm": 2.3184807300567627, + "learning_rate": 5e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7738323211669922, + "num_tokens": 504414623.0, + "step": 19488 + }, + { + "epoch": 2.1402372062376456, + "grad_norm": 2.105619192123413, + "learning_rate": 5e-06, + "loss": 0.5672, + "mean_token_accuracy": 0.806070864200592, + "num_tokens": 504436644.0, + "step": 19489 + }, + { + "epoch": 2.1403470239402593, + "grad_norm": 2.1451361179351807, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7790249586105347, + "num_tokens": 504460225.0, + "step": 19490 + }, + { + "epoch": 2.1404568416428726, + "grad_norm": 2.060941457748413, + "learning_rate": 5e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7577661275863647, + "num_tokens": 504488094.0, + "step": 19491 + }, + { + "epoch": 2.1405666593454864, + "grad_norm": 2.2587411403656006, + "learning_rate": 5e-06, + "loss": 0.6959, + "mean_token_accuracy": 0.767283022403717, + "num_tokens": 504511486.0, + "step": 19492 + }, + { + "epoch": 2.1406764770481, + "grad_norm": 1.9509516954421997, + "learning_rate": 5e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7656930685043335, + "num_tokens": 504539219.0, + "step": 19493 + }, + { + "epoch": 2.140786294750714, + "grad_norm": 2.3132739067077637, + "learning_rate": 5e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.7657037973403931, + "num_tokens": 504561746.0, + "step": 19494 + }, + { + "epoch": 2.1408961124533277, + "grad_norm": 2.3916420936584473, + "learning_rate": 5e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7733423709869385, + "num_tokens": 504583347.0, + "step": 19495 + }, + { + "epoch": 2.141005930155941, + "grad_norm": 1.9797848463058472, + "learning_rate": 5e-06, + "loss": 0.6523, + "mean_token_accuracy": 0.7843272089958191, + "num_tokens": 504610934.0, + "step": 19496 + }, + { + "epoch": 2.1411157478585547, + "grad_norm": 2.3359837532043457, + "learning_rate": 5e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.7872614860534668, + "num_tokens": 504634557.0, + "step": 19497 + }, + { + "epoch": 2.1412255655611685, + "grad_norm": 2.0473897457122803, + "learning_rate": 5e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.781021237373352, + "num_tokens": 504660316.0, + "step": 19498 + }, + { + "epoch": 2.1413353832637823, + "grad_norm": 1.9997199773788452, + "learning_rate": 5e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7507258653640747, + "num_tokens": 504689728.0, + "step": 19499 + }, + { + "epoch": 2.1414452009663956, + "grad_norm": 1.8892738819122314, + "learning_rate": 5e-06, + "loss": 0.6023, + "mean_token_accuracy": 0.7993044853210449, + "num_tokens": 504716767.0, + "step": 19500 + }, + { + "epoch": 2.1415550186690093, + "grad_norm": 2.067018985748291, + "learning_rate": 5e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7585283517837524, + "num_tokens": 504742677.0, + "step": 19501 + }, + { + "epoch": 2.141664836371623, + "grad_norm": 2.4402687549591064, + "learning_rate": 5e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7673616409301758, + "num_tokens": 504764234.0, + "step": 19502 + }, + { + "epoch": 2.141774654074237, + "grad_norm": 1.9370497465133667, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7572426795959473, + "num_tokens": 504793682.0, + "step": 19503 + }, + { + "epoch": 2.1418844717768506, + "grad_norm": 2.017970323562622, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7429955005645752, + "num_tokens": 504823478.0, + "step": 19504 + }, + { + "epoch": 2.141994289479464, + "grad_norm": 2.293067455291748, + "learning_rate": 5e-06, + "loss": 0.6207, + "mean_token_accuracy": 0.7883332371711731, + "num_tokens": 504844654.0, + "step": 19505 + }, + { + "epoch": 2.1421041071820777, + "grad_norm": 2.691960573196411, + "learning_rate": 5e-06, + "loss": 0.5721, + "mean_token_accuracy": 0.8057656288146973, + "num_tokens": 504861780.0, + "step": 19506 + }, + { + "epoch": 2.1422139248846914, + "grad_norm": 2.047091484069824, + "learning_rate": 5e-06, + "loss": 0.6245, + "mean_token_accuracy": 0.7961304783821106, + "num_tokens": 504887999.0, + "step": 19507 + }, + { + "epoch": 2.142323742587305, + "grad_norm": 2.269868850708008, + "learning_rate": 5e-06, + "loss": 0.6212, + "mean_token_accuracy": 0.7944047451019287, + "num_tokens": 504909955.0, + "step": 19508 + }, + { + "epoch": 2.142433560289919, + "grad_norm": 2.5195014476776123, + "learning_rate": 5e-06, + "loss": 0.6075, + "mean_token_accuracy": 0.8000833988189697, + "num_tokens": 504928587.0, + "step": 19509 + }, + { + "epoch": 2.1425433779925322, + "grad_norm": 2.2309038639068604, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7790924310684204, + "num_tokens": 504953617.0, + "step": 19510 + }, + { + "epoch": 2.142653195695146, + "grad_norm": 2.2910892963409424, + "learning_rate": 5e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.769000232219696, + "num_tokens": 504975935.0, + "step": 19511 + }, + { + "epoch": 2.1427630133977598, + "grad_norm": 2.09123158454895, + "learning_rate": 5e-06, + "loss": 0.6488, + "mean_token_accuracy": 0.7775533199310303, + "num_tokens": 505000632.0, + "step": 19512 + }, + { + "epoch": 2.1428728311003735, + "grad_norm": 2.161325216293335, + "learning_rate": 5e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7677661180496216, + "num_tokens": 505025618.0, + "step": 19513 + }, + { + "epoch": 2.142982648802987, + "grad_norm": 2.101480722427368, + "learning_rate": 5e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.7851147651672363, + "num_tokens": 505050575.0, + "step": 19514 + }, + { + "epoch": 2.1430924665056006, + "grad_norm": 1.8417538404464722, + "learning_rate": 5e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7536348700523376, + "num_tokens": 505081794.0, + "step": 19515 + }, + { + "epoch": 2.1432022842082143, + "grad_norm": 2.173445224761963, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.755251407623291, + "num_tokens": 505108014.0, + "step": 19516 + }, + { + "epoch": 2.143312101910828, + "grad_norm": 2.1214425563812256, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7630586624145508, + "num_tokens": 505135993.0, + "step": 19517 + }, + { + "epoch": 2.143421919613442, + "grad_norm": 2.135099411010742, + "learning_rate": 5e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7711745500564575, + "num_tokens": 505160406.0, + "step": 19518 + }, + { + "epoch": 2.143531737316055, + "grad_norm": 2.4724314212799072, + "learning_rate": 5e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.7757958173751831, + "num_tokens": 505180174.0, + "step": 19519 + }, + { + "epoch": 2.143641555018669, + "grad_norm": 1.9442734718322754, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7506879568099976, + "num_tokens": 505209530.0, + "step": 19520 + }, + { + "epoch": 2.1437513727212827, + "grad_norm": 2.109175443649292, + "learning_rate": 5e-06, + "loss": 0.6649, + "mean_token_accuracy": 0.7864542007446289, + "num_tokens": 505233608.0, + "step": 19521 + }, + { + "epoch": 2.1438611904238964, + "grad_norm": 2.0256948471069336, + "learning_rate": 5e-06, + "loss": 0.6821, + "mean_token_accuracy": 0.7789928913116455, + "num_tokens": 505259728.0, + "step": 19522 + }, + { + "epoch": 2.1439710081265098, + "grad_norm": 2.0380489826202393, + "learning_rate": 5e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7560594081878662, + "num_tokens": 505288078.0, + "step": 19523 + }, + { + "epoch": 2.1440808258291235, + "grad_norm": 2.113421678543091, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7677778005599976, + "num_tokens": 505312019.0, + "step": 19524 + }, + { + "epoch": 2.1441906435317373, + "grad_norm": 2.066871166229248, + "learning_rate": 5e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7716061472892761, + "num_tokens": 505338807.0, + "step": 19525 + }, + { + "epoch": 2.144300461234351, + "grad_norm": 1.9846534729003906, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.7681256532669067, + "num_tokens": 505365037.0, + "step": 19526 + }, + { + "epoch": 2.144410278936965, + "grad_norm": 2.0931522846221924, + "learning_rate": 5e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7696446180343628, + "num_tokens": 505390544.0, + "step": 19527 + }, + { + "epoch": 2.144520096639578, + "grad_norm": 2.264660120010376, + "learning_rate": 5e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7733768224716187, + "num_tokens": 505414064.0, + "step": 19528 + }, + { + "epoch": 2.144629914342192, + "grad_norm": 2.145364999771118, + "learning_rate": 5e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.7823621034622192, + "num_tokens": 505439833.0, + "step": 19529 + }, + { + "epoch": 2.1447397320448056, + "grad_norm": 2.371736764907837, + "learning_rate": 5e-06, + "loss": 0.6503, + "mean_token_accuracy": 0.7815741896629333, + "num_tokens": 505461626.0, + "step": 19530 + }, + { + "epoch": 2.1448495497474194, + "grad_norm": 1.9127496480941772, + "learning_rate": 5e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7635465860366821, + "num_tokens": 505490135.0, + "step": 19531 + }, + { + "epoch": 2.144959367450033, + "grad_norm": 1.9871991872787476, + "learning_rate": 5e-06, + "loss": 0.7007, + "mean_token_accuracy": 0.7674087285995483, + "num_tokens": 505520331.0, + "step": 19532 + }, + { + "epoch": 2.1450691851526464, + "grad_norm": 1.8141287565231323, + "learning_rate": 5e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.760770320892334, + "num_tokens": 505552173.0, + "step": 19533 + }, + { + "epoch": 2.14517900285526, + "grad_norm": 1.913543462753296, + "learning_rate": 5e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7584508061408997, + "num_tokens": 505580969.0, + "step": 19534 + }, + { + "epoch": 2.145288820557874, + "grad_norm": 1.8547565937042236, + "learning_rate": 5e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.758611798286438, + "num_tokens": 505610291.0, + "step": 19535 + }, + { + "epoch": 2.1453986382604877, + "grad_norm": 2.287095785140991, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7675899267196655, + "num_tokens": 505633108.0, + "step": 19536 + }, + { + "epoch": 2.1455084559631015, + "grad_norm": 2.0985634326934814, + "learning_rate": 5e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7697066068649292, + "num_tokens": 505659532.0, + "step": 19537 + }, + { + "epoch": 2.145618273665715, + "grad_norm": 2.1909193992614746, + "learning_rate": 5e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7677755355834961, + "num_tokens": 505682552.0, + "step": 19538 + }, + { + "epoch": 2.1457280913683285, + "grad_norm": 1.9281482696533203, + "learning_rate": 5e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7536603212356567, + "num_tokens": 505712246.0, + "step": 19539 + }, + { + "epoch": 2.1458379090709423, + "grad_norm": 2.129110813140869, + "learning_rate": 5e-06, + "loss": 0.6312, + "mean_token_accuracy": 0.786648154258728, + "num_tokens": 505737262.0, + "step": 19540 + }, + { + "epoch": 2.145947726773556, + "grad_norm": 2.109377384185791, + "learning_rate": 5e-06, + "loss": 0.6628, + "mean_token_accuracy": 0.7792681455612183, + "num_tokens": 505761802.0, + "step": 19541 + }, + { + "epoch": 2.1460575444761694, + "grad_norm": 2.190678834915161, + "learning_rate": 5e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.7805554270744324, + "num_tokens": 505786119.0, + "step": 19542 + }, + { + "epoch": 2.146167362178783, + "grad_norm": 2.148740291595459, + "learning_rate": 5e-06, + "loss": 0.6199, + "mean_token_accuracy": 0.7887617349624634, + "num_tokens": 505808822.0, + "step": 19543 + }, + { + "epoch": 2.146277179881397, + "grad_norm": 2.127864122390747, + "learning_rate": 5e-06, + "loss": 0.6771, + "mean_token_accuracy": 0.7819916605949402, + "num_tokens": 505834392.0, + "step": 19544 + }, + { + "epoch": 2.1463869975840106, + "grad_norm": 1.951345682144165, + "learning_rate": 5e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.7744290828704834, + "num_tokens": 505862764.0, + "step": 19545 + }, + { + "epoch": 2.1464968152866244, + "grad_norm": 2.295776128768921, + "learning_rate": 5e-06, + "loss": 0.6215, + "mean_token_accuracy": 0.7921538352966309, + "num_tokens": 505885512.0, + "step": 19546 + }, + { + "epoch": 2.1466066329892377, + "grad_norm": 2.142364740371704, + "learning_rate": 5e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.7838037610054016, + "num_tokens": 505908938.0, + "step": 19547 + }, + { + "epoch": 2.1467164506918515, + "grad_norm": 1.8420108556747437, + "learning_rate": 5e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7748293876647949, + "num_tokens": 505939561.0, + "step": 19548 + }, + { + "epoch": 2.146826268394465, + "grad_norm": 2.1915955543518066, + "learning_rate": 5e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.7864410877227783, + "num_tokens": 505960713.0, + "step": 19549 + }, + { + "epoch": 2.146936086097079, + "grad_norm": 2.2165467739105225, + "learning_rate": 5e-06, + "loss": 0.6616, + "mean_token_accuracy": 0.7772353887557983, + "num_tokens": 505984721.0, + "step": 19550 + }, + { + "epoch": 2.1470459037996923, + "grad_norm": 2.1542809009552, + "learning_rate": 5e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.7825502157211304, + "num_tokens": 506007400.0, + "step": 19551 + }, + { + "epoch": 2.147155721502306, + "grad_norm": 2.1062233448028564, + "learning_rate": 5e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7538236379623413, + "num_tokens": 506033560.0, + "step": 19552 + }, + { + "epoch": 2.14726553920492, + "grad_norm": 2.402643918991089, + "learning_rate": 5e-06, + "loss": 0.6456, + "mean_token_accuracy": 0.7885016202926636, + "num_tokens": 506054745.0, + "step": 19553 + }, + { + "epoch": 2.1473753569075336, + "grad_norm": 1.9493160247802734, + "learning_rate": 5e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7517980337142944, + "num_tokens": 506083296.0, + "step": 19554 + }, + { + "epoch": 2.1474851746101473, + "grad_norm": 2.0009665489196777, + "learning_rate": 5e-06, + "loss": 0.7105, + "mean_token_accuracy": 0.7706594467163086, + "num_tokens": 506110758.0, + "step": 19555 + }, + { + "epoch": 2.1475949923127606, + "grad_norm": 2.1678333282470703, + "learning_rate": 5e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7503215670585632, + "num_tokens": 506137317.0, + "step": 19556 + }, + { + "epoch": 2.1477048100153744, + "grad_norm": 2.072092056274414, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.7664492130279541, + "num_tokens": 506165183.0, + "step": 19557 + }, + { + "epoch": 2.147814627717988, + "grad_norm": 2.072993278503418, + "learning_rate": 5e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.7866959571838379, + "num_tokens": 506188476.0, + "step": 19558 + }, + { + "epoch": 2.147924445420602, + "grad_norm": 2.0617480278015137, + "learning_rate": 5e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.7721058130264282, + "num_tokens": 506215379.0, + "step": 19559 + }, + { + "epoch": 2.1480342631232157, + "grad_norm": 2.0720205307006836, + "learning_rate": 5e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.758526086807251, + "num_tokens": 506241336.0, + "step": 19560 + }, + { + "epoch": 2.148144080825829, + "grad_norm": 2.5481088161468506, + "learning_rate": 5e-06, + "loss": 0.5975, + "mean_token_accuracy": 0.8002550005912781, + "num_tokens": 506260992.0, + "step": 19561 + }, + { + "epoch": 2.1482538985284427, + "grad_norm": 2.027390480041504, + "learning_rate": 5e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.7645630836486816, + "num_tokens": 506287561.0, + "step": 19562 + }, + { + "epoch": 2.1483637162310565, + "grad_norm": 2.078078269958496, + "learning_rate": 5e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7603895664215088, + "num_tokens": 506313577.0, + "step": 19563 + }, + { + "epoch": 2.1484735339336702, + "grad_norm": 2.299323797225952, + "learning_rate": 5e-06, + "loss": 0.645, + "mean_token_accuracy": 0.7907798290252686, + "num_tokens": 506335932.0, + "step": 19564 + }, + { + "epoch": 2.148583351636284, + "grad_norm": 2.135578155517578, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7591554522514343, + "num_tokens": 506361060.0, + "step": 19565 + }, + { + "epoch": 2.1486931693388973, + "grad_norm": 2.150507926940918, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7683003544807434, + "num_tokens": 506385608.0, + "step": 19566 + }, + { + "epoch": 2.148802987041511, + "grad_norm": 2.1265056133270264, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7584577798843384, + "num_tokens": 506411173.0, + "step": 19567 + }, + { + "epoch": 2.148912804744125, + "grad_norm": 2.029487133026123, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7721402645111084, + "num_tokens": 506435538.0, + "step": 19568 + }, + { + "epoch": 2.1490226224467386, + "grad_norm": 2.336278200149536, + "learning_rate": 5e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7676854133605957, + "num_tokens": 506457531.0, + "step": 19569 + }, + { + "epoch": 2.149132440149352, + "grad_norm": 2.490537643432617, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7753165364265442, + "num_tokens": 506477895.0, + "step": 19570 + }, + { + "epoch": 2.1492422578519657, + "grad_norm": 2.1996278762817383, + "learning_rate": 5e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7689946889877319, + "num_tokens": 506501867.0, + "step": 19571 + }, + { + "epoch": 2.1493520755545794, + "grad_norm": 2.1338624954223633, + "learning_rate": 5e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7754226326942444, + "num_tokens": 506528481.0, + "step": 19572 + }, + { + "epoch": 2.149461893257193, + "grad_norm": 2.2002480030059814, + "learning_rate": 5e-06, + "loss": 0.6701, + "mean_token_accuracy": 0.7749069333076477, + "num_tokens": 506550084.0, + "step": 19573 + }, + { + "epoch": 2.149571710959807, + "grad_norm": 2.365318536758423, + "learning_rate": 5e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.7870336771011353, + "num_tokens": 506569501.0, + "step": 19574 + }, + { + "epoch": 2.1496815286624202, + "grad_norm": 1.9473443031311035, + "learning_rate": 5e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.7792859077453613, + "num_tokens": 506594582.0, + "step": 19575 + }, + { + "epoch": 2.149791346365034, + "grad_norm": 1.9603126049041748, + "learning_rate": 5e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7582765817642212, + "num_tokens": 506624793.0, + "step": 19576 + }, + { + "epoch": 2.1499011640676478, + "grad_norm": 2.2463455200195312, + "learning_rate": 5e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7671161890029907, + "num_tokens": 506649618.0, + "step": 19577 + }, + { + "epoch": 2.1500109817702615, + "grad_norm": 2.259528398513794, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7770038843154907, + "num_tokens": 506675108.0, + "step": 19578 + }, + { + "epoch": 2.150120799472875, + "grad_norm": 2.207178831100464, + "learning_rate": 5e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7711013555526733, + "num_tokens": 506699876.0, + "step": 19579 + }, + { + "epoch": 2.1502306171754886, + "grad_norm": 2.558102607727051, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7816901206970215, + "num_tokens": 506719431.0, + "step": 19580 + }, + { + "epoch": 2.1503404348781023, + "grad_norm": 2.158578634262085, + "learning_rate": 5e-06, + "loss": 0.6413, + "mean_token_accuracy": 0.7897384762763977, + "num_tokens": 506744301.0, + "step": 19581 + }, + { + "epoch": 2.150450252580716, + "grad_norm": 2.4415946006774902, + "learning_rate": 5e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.773587167263031, + "num_tokens": 506764193.0, + "step": 19582 + }, + { + "epoch": 2.15056007028333, + "grad_norm": 2.165191411972046, + "learning_rate": 5e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7629945278167725, + "num_tokens": 506789217.0, + "step": 19583 + }, + { + "epoch": 2.150669887985943, + "grad_norm": 1.9573246240615845, + "learning_rate": 5e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7676552534103394, + "num_tokens": 506817819.0, + "step": 19584 + }, + { + "epoch": 2.150779705688557, + "grad_norm": 2.200395345687866, + "learning_rate": 5e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.7857114672660828, + "num_tokens": 506837868.0, + "step": 19585 + }, + { + "epoch": 2.1508895233911707, + "grad_norm": 1.9516255855560303, + "learning_rate": 5e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7460981607437134, + "num_tokens": 506868443.0, + "step": 19586 + }, + { + "epoch": 2.1509993410937844, + "grad_norm": 1.7294131517410278, + "learning_rate": 5e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7608770132064819, + "num_tokens": 506904671.0, + "step": 19587 + }, + { + "epoch": 2.151109158796398, + "grad_norm": 2.2516136169433594, + "learning_rate": 5e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8035106658935547, + "num_tokens": 506924496.0, + "step": 19588 + }, + { + "epoch": 2.1512189764990115, + "grad_norm": 1.991061806678772, + "learning_rate": 5e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7540744543075562, + "num_tokens": 506955988.0, + "step": 19589 + }, + { + "epoch": 2.1513287942016253, + "grad_norm": 2.6403210163116455, + "learning_rate": 5e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.7835525274276733, + "num_tokens": 506972832.0, + "step": 19590 + }, + { + "epoch": 2.151438611904239, + "grad_norm": 2.203077793121338, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7709400057792664, + "num_tokens": 506998088.0, + "step": 19591 + }, + { + "epoch": 2.1515484296068528, + "grad_norm": 2.2597217559814453, + "learning_rate": 5e-06, + "loss": 0.6302, + "mean_token_accuracy": 0.7876665592193604, + "num_tokens": 507020230.0, + "step": 19592 + }, + { + "epoch": 2.151658247309466, + "grad_norm": 2.2142863273620605, + "learning_rate": 5e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.8002200126647949, + "num_tokens": 507043960.0, + "step": 19593 + }, + { + "epoch": 2.15176806501208, + "grad_norm": 2.1694512367248535, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7676377892494202, + "num_tokens": 507067604.0, + "step": 19594 + }, + { + "epoch": 2.1518778827146936, + "grad_norm": 2.224050283432007, + "learning_rate": 5e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7693195939064026, + "num_tokens": 507091195.0, + "step": 19595 + }, + { + "epoch": 2.1519877004173074, + "grad_norm": 1.9728004932403564, + "learning_rate": 5e-06, + "loss": 0.6733, + "mean_token_accuracy": 0.7901461124420166, + "num_tokens": 507119748.0, + "step": 19596 + }, + { + "epoch": 2.152097518119921, + "grad_norm": 2.3334908485412598, + "learning_rate": 5e-06, + "loss": 0.5724, + "mean_token_accuracy": 0.8072182536125183, + "num_tokens": 507138900.0, + "step": 19597 + }, + { + "epoch": 2.1522073358225344, + "grad_norm": 2.0179760456085205, + "learning_rate": 5e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7571318745613098, + "num_tokens": 507168494.0, + "step": 19598 + }, + { + "epoch": 2.152317153525148, + "grad_norm": 2.0916900634765625, + "learning_rate": 5e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7614080905914307, + "num_tokens": 507195874.0, + "step": 19599 + }, + { + "epoch": 2.152426971227762, + "grad_norm": 2.2191340923309326, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7693254351615906, + "num_tokens": 507220795.0, + "step": 19600 + }, + { + "epoch": 2.1525367889303757, + "grad_norm": 2.0241363048553467, + "learning_rate": 5e-06, + "loss": 0.6378, + "mean_token_accuracy": 0.7868713736534119, + "num_tokens": 507245820.0, + "step": 19601 + }, + { + "epoch": 2.152646606632989, + "grad_norm": 2.2860934734344482, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7580902576446533, + "num_tokens": 507267088.0, + "step": 19602 + }, + { + "epoch": 2.1527564243356028, + "grad_norm": 1.9204872846603394, + "learning_rate": 5e-06, + "loss": 0.6393, + "mean_token_accuracy": 0.7838337421417236, + "num_tokens": 507297497.0, + "step": 19603 + }, + { + "epoch": 2.1528662420382165, + "grad_norm": 2.0993266105651855, + "learning_rate": 5e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7733667492866516, + "num_tokens": 507322773.0, + "step": 19604 + }, + { + "epoch": 2.1529760597408303, + "grad_norm": 2.2735953330993652, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7740318775177002, + "num_tokens": 507343422.0, + "step": 19605 + }, + { + "epoch": 2.153085877443444, + "grad_norm": 2.451310396194458, + "learning_rate": 5e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.7901477217674255, + "num_tokens": 507361091.0, + "step": 19606 + }, + { + "epoch": 2.1531956951460574, + "grad_norm": 1.9741572141647339, + "learning_rate": 5e-06, + "loss": 0.6258, + "mean_token_accuracy": 0.7942258715629578, + "num_tokens": 507388839.0, + "step": 19607 + }, + { + "epoch": 2.153305512848671, + "grad_norm": 2.156916856765747, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7401474714279175, + "num_tokens": 507413501.0, + "step": 19608 + }, + { + "epoch": 2.153415330551285, + "grad_norm": 1.9173760414123535, + "learning_rate": 5e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7400841116905212, + "num_tokens": 507444667.0, + "step": 19609 + }, + { + "epoch": 2.1535251482538986, + "grad_norm": 2.2609989643096924, + "learning_rate": 5e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.7634249925613403, + "num_tokens": 507470536.0, + "step": 19610 + }, + { + "epoch": 2.1536349659565124, + "grad_norm": 1.939040184020996, + "learning_rate": 5e-06, + "loss": 0.6564, + "mean_token_accuracy": 0.7888494729995728, + "num_tokens": 507499685.0, + "step": 19611 + }, + { + "epoch": 2.1537447836591257, + "grad_norm": 2.012479782104492, + "learning_rate": 5e-06, + "loss": 0.6621, + "mean_token_accuracy": 0.7823591232299805, + "num_tokens": 507525451.0, + "step": 19612 + }, + { + "epoch": 2.1538546013617395, + "grad_norm": 1.962627649307251, + "learning_rate": 5e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7820415496826172, + "num_tokens": 507554540.0, + "step": 19613 + }, + { + "epoch": 2.153964419064353, + "grad_norm": 2.300572395324707, + "learning_rate": 5e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.7729030847549438, + "num_tokens": 507576531.0, + "step": 19614 + }, + { + "epoch": 2.154074236766967, + "grad_norm": 2.1348514556884766, + "learning_rate": 5e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7787312269210815, + "num_tokens": 507601254.0, + "step": 19615 + }, + { + "epoch": 2.1541840544695807, + "grad_norm": 2.1491987705230713, + "learning_rate": 5e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.759102463722229, + "num_tokens": 507627087.0, + "step": 19616 + }, + { + "epoch": 2.154293872172194, + "grad_norm": 2.156855821609497, + "learning_rate": 5e-06, + "loss": 0.6625, + "mean_token_accuracy": 0.777849555015564, + "num_tokens": 507652241.0, + "step": 19617 + }, + { + "epoch": 2.154403689874808, + "grad_norm": 1.8398538827896118, + "learning_rate": 5e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7594711780548096, + "num_tokens": 507687014.0, + "step": 19618 + }, + { + "epoch": 2.1545135075774215, + "grad_norm": 1.9173017740249634, + "learning_rate": 5e-06, + "loss": 0.6706, + "mean_token_accuracy": 0.7749466896057129, + "num_tokens": 507715994.0, + "step": 19619 + }, + { + "epoch": 2.1546233252800353, + "grad_norm": 2.3077895641326904, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7754013538360596, + "num_tokens": 507736410.0, + "step": 19620 + }, + { + "epoch": 2.1547331429826486, + "grad_norm": 2.0643908977508545, + "learning_rate": 5e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7665665745735168, + "num_tokens": 507763497.0, + "step": 19621 + }, + { + "epoch": 2.1548429606852624, + "grad_norm": 2.123321771621704, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7653372287750244, + "num_tokens": 507788525.0, + "step": 19622 + }, + { + "epoch": 2.154952778387876, + "grad_norm": 2.179824113845825, + "learning_rate": 5e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7539005279541016, + "num_tokens": 507814227.0, + "step": 19623 + }, + { + "epoch": 2.15506259609049, + "grad_norm": 2.30574369430542, + "learning_rate": 5e-06, + "loss": 0.6699, + "mean_token_accuracy": 0.7820205092430115, + "num_tokens": 507835648.0, + "step": 19624 + }, + { + "epoch": 2.1551724137931036, + "grad_norm": 2.058211088180542, + "learning_rate": 5e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.7931196689605713, + "num_tokens": 507859692.0, + "step": 19625 + }, + { + "epoch": 2.155282231495717, + "grad_norm": 1.864906668663025, + "learning_rate": 5e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.7725315093994141, + "num_tokens": 507889898.0, + "step": 19626 + }, + { + "epoch": 2.1553920491983307, + "grad_norm": 2.0942888259887695, + "learning_rate": 5e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7610790729522705, + "num_tokens": 507914358.0, + "step": 19627 + }, + { + "epoch": 2.1555018669009445, + "grad_norm": 2.0514442920684814, + "learning_rate": 5e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7710622549057007, + "num_tokens": 507942016.0, + "step": 19628 + }, + { + "epoch": 2.1556116846035582, + "grad_norm": 2.151745080947876, + "learning_rate": 5e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7620317339897156, + "num_tokens": 507967361.0, + "step": 19629 + }, + { + "epoch": 2.1557215023061715, + "grad_norm": 2.0669541358947754, + "learning_rate": 5e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7539447546005249, + "num_tokens": 507996393.0, + "step": 19630 + }, + { + "epoch": 2.1558313200087853, + "grad_norm": 2.211967945098877, + "learning_rate": 5e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7748250961303711, + "num_tokens": 508021917.0, + "step": 19631 + }, + { + "epoch": 2.155941137711399, + "grad_norm": 2.4062492847442627, + "learning_rate": 5e-06, + "loss": 0.674, + "mean_token_accuracy": 0.7744503617286682, + "num_tokens": 508042787.0, + "step": 19632 + }, + { + "epoch": 2.156050955414013, + "grad_norm": 2.2580604553222656, + "learning_rate": 5e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.7790842056274414, + "num_tokens": 508065091.0, + "step": 19633 + }, + { + "epoch": 2.1561607731166266, + "grad_norm": 2.196058988571167, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7775129079818726, + "num_tokens": 508089805.0, + "step": 19634 + }, + { + "epoch": 2.15627059081924, + "grad_norm": 1.6549038887023926, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7619197964668274, + "num_tokens": 508126388.0, + "step": 19635 + }, + { + "epoch": 2.1563804085218536, + "grad_norm": 2.2010440826416016, + "learning_rate": 5e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7769325971603394, + "num_tokens": 508151729.0, + "step": 19636 + }, + { + "epoch": 2.1564902262244674, + "grad_norm": 2.144955635070801, + "learning_rate": 5e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7337162494659424, + "num_tokens": 508180895.0, + "step": 19637 + }, + { + "epoch": 2.156600043927081, + "grad_norm": 1.9822025299072266, + "learning_rate": 5e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.7854399085044861, + "num_tokens": 508207648.0, + "step": 19638 + }, + { + "epoch": 2.156709861629695, + "grad_norm": 2.1593174934387207, + "learning_rate": 5e-06, + "loss": 0.665, + "mean_token_accuracy": 0.7811139225959778, + "num_tokens": 508230225.0, + "step": 19639 + }, + { + "epoch": 2.1568196793323082, + "grad_norm": 2.0026791095733643, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.760068416595459, + "num_tokens": 508256573.0, + "step": 19640 + }, + { + "epoch": 2.156929497034922, + "grad_norm": 2.259587049484253, + "learning_rate": 5e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.7815264463424683, + "num_tokens": 508279311.0, + "step": 19641 + }, + { + "epoch": 2.1570393147375357, + "grad_norm": 1.8896821737289429, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7669930458068848, + "num_tokens": 508307467.0, + "step": 19642 + }, + { + "epoch": 2.1571491324401495, + "grad_norm": 2.0218584537506104, + "learning_rate": 5e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.7835150957107544, + "num_tokens": 508331643.0, + "step": 19643 + }, + { + "epoch": 2.157258950142763, + "grad_norm": 2.007683753967285, + "learning_rate": 5e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.76278156042099, + "num_tokens": 508360043.0, + "step": 19644 + }, + { + "epoch": 2.1573687678453766, + "grad_norm": 2.554049253463745, + "learning_rate": 5e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7565884590148926, + "num_tokens": 508385859.0, + "step": 19645 + }, + { + "epoch": 2.1574785855479903, + "grad_norm": 2.1604220867156982, + "learning_rate": 5e-06, + "loss": 0.6332, + "mean_token_accuracy": 0.7904309034347534, + "num_tokens": 508408478.0, + "step": 19646 + }, + { + "epoch": 2.157588403250604, + "grad_norm": 2.052839756011963, + "learning_rate": 5e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7749083638191223, + "num_tokens": 508435017.0, + "step": 19647 + }, + { + "epoch": 2.157698220953218, + "grad_norm": 2.434080123901367, + "learning_rate": 5e-06, + "loss": 0.5883, + "mean_token_accuracy": 0.8029133081436157, + "num_tokens": 508454265.0, + "step": 19648 + }, + { + "epoch": 2.157808038655831, + "grad_norm": 2.0023224353790283, + "learning_rate": 5e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7743836641311646, + "num_tokens": 508483530.0, + "step": 19649 + }, + { + "epoch": 2.157917856358445, + "grad_norm": 2.0228471755981445, + "learning_rate": 5e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7777825593948364, + "num_tokens": 508508641.0, + "step": 19650 + }, + { + "epoch": 2.1580276740610587, + "grad_norm": 1.9536477327346802, + "learning_rate": 5e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.7823023796081543, + "num_tokens": 508535880.0, + "step": 19651 + }, + { + "epoch": 2.1581374917636724, + "grad_norm": 2.084831714630127, + "learning_rate": 5e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7618062496185303, + "num_tokens": 508561772.0, + "step": 19652 + }, + { + "epoch": 2.1582473094662857, + "grad_norm": 2.2312116622924805, + "learning_rate": 5e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7601038813591003, + "num_tokens": 508585115.0, + "step": 19653 + }, + { + "epoch": 2.1583571271688995, + "grad_norm": 1.939021348953247, + "learning_rate": 5e-06, + "loss": 0.607, + "mean_token_accuracy": 0.7926880121231079, + "num_tokens": 508612078.0, + "step": 19654 + }, + { + "epoch": 2.1584669448715132, + "grad_norm": 1.998965859413147, + "learning_rate": 5e-06, + "loss": 0.6188, + "mean_token_accuracy": 0.7912154197692871, + "num_tokens": 508638924.0, + "step": 19655 + }, + { + "epoch": 2.158576762574127, + "grad_norm": 2.259139060974121, + "learning_rate": 5e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.7783033847808838, + "num_tokens": 508664574.0, + "step": 19656 + }, + { + "epoch": 2.1586865802767408, + "grad_norm": 1.805649757385254, + "learning_rate": 5e-06, + "loss": 0.705, + "mean_token_accuracy": 0.766213059425354, + "num_tokens": 508698955.0, + "step": 19657 + }, + { + "epoch": 2.158796397979354, + "grad_norm": 2.0291590690612793, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7421360015869141, + "num_tokens": 508732562.0, + "step": 19658 + }, + { + "epoch": 2.158906215681968, + "grad_norm": 2.2391843795776367, + "learning_rate": 5e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.7832663059234619, + "num_tokens": 508756502.0, + "step": 19659 + }, + { + "epoch": 2.1590160333845816, + "grad_norm": 2.3134567737579346, + "learning_rate": 5e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.7846371531486511, + "num_tokens": 508778565.0, + "step": 19660 + }, + { + "epoch": 2.1591258510871953, + "grad_norm": 1.9993373155593872, + "learning_rate": 5e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7696322798728943, + "num_tokens": 508808199.0, + "step": 19661 + }, + { + "epoch": 2.159235668789809, + "grad_norm": 2.228808879852295, + "learning_rate": 5e-06, + "loss": 0.613, + "mean_token_accuracy": 0.7933200001716614, + "num_tokens": 508830697.0, + "step": 19662 + }, + { + "epoch": 2.1593454864924224, + "grad_norm": 2.2601568698883057, + "learning_rate": 5e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7634056806564331, + "num_tokens": 508852849.0, + "step": 19663 + }, + { + "epoch": 2.159455304195036, + "grad_norm": 2.014965057373047, + "learning_rate": 5e-06, + "loss": 0.662, + "mean_token_accuracy": 0.78001868724823, + "num_tokens": 508878698.0, + "step": 19664 + }, + { + "epoch": 2.15956512189765, + "grad_norm": 2.316955804824829, + "learning_rate": 5e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.7965772747993469, + "num_tokens": 508899158.0, + "step": 19665 + }, + { + "epoch": 2.1596749396002637, + "grad_norm": 2.271132469177246, + "learning_rate": 5e-06, + "loss": 0.6866, + "mean_token_accuracy": 0.7763097286224365, + "num_tokens": 508922213.0, + "step": 19666 + }, + { + "epoch": 2.1597847573028774, + "grad_norm": 1.922572374343872, + "learning_rate": 5e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7716121673583984, + "num_tokens": 508948583.0, + "step": 19667 + }, + { + "epoch": 2.1598945750054908, + "grad_norm": 2.1876144409179688, + "learning_rate": 5e-06, + "loss": 0.578, + "mean_token_accuracy": 0.7985803484916687, + "num_tokens": 508970598.0, + "step": 19668 + }, + { + "epoch": 2.1600043927081045, + "grad_norm": 1.9373482465744019, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7643577456474304, + "num_tokens": 509000808.0, + "step": 19669 + }, + { + "epoch": 2.1601142104107183, + "grad_norm": 1.9173274040222168, + "learning_rate": 5e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7655946016311646, + "num_tokens": 509031416.0, + "step": 19670 + }, + { + "epoch": 2.160224028113332, + "grad_norm": 1.9535728693008423, + "learning_rate": 5e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.7760134339332581, + "num_tokens": 509060042.0, + "step": 19671 + }, + { + "epoch": 2.1603338458159453, + "grad_norm": 1.9205188751220703, + "learning_rate": 5e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7700332403182983, + "num_tokens": 509088003.0, + "step": 19672 + }, + { + "epoch": 2.160443663518559, + "grad_norm": 2.1366629600524902, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7732324600219727, + "num_tokens": 509116238.0, + "step": 19673 + }, + { + "epoch": 2.160553481221173, + "grad_norm": 1.9417612552642822, + "learning_rate": 5e-06, + "loss": 0.6431, + "mean_token_accuracy": 0.7866169214248657, + "num_tokens": 509143320.0, + "step": 19674 + }, + { + "epoch": 2.1606632989237866, + "grad_norm": 2.1038308143615723, + "learning_rate": 5e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7656290531158447, + "num_tokens": 509168493.0, + "step": 19675 + }, + { + "epoch": 2.1607731166264004, + "grad_norm": 2.070838451385498, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.749683141708374, + "num_tokens": 509199400.0, + "step": 19676 + }, + { + "epoch": 2.1608829343290137, + "grad_norm": 2.3462977409362793, + "learning_rate": 5e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.771262526512146, + "num_tokens": 509219624.0, + "step": 19677 + }, + { + "epoch": 2.1609927520316274, + "grad_norm": 2.377284526824951, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7803213000297546, + "num_tokens": 509239902.0, + "step": 19678 + }, + { + "epoch": 2.161102569734241, + "grad_norm": 2.0076019763946533, + "learning_rate": 5e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.7590028047561646, + "num_tokens": 509269873.0, + "step": 19679 + }, + { + "epoch": 2.161212387436855, + "grad_norm": 1.9279333353042603, + "learning_rate": 5e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.737738847732544, + "num_tokens": 509303161.0, + "step": 19680 + }, + { + "epoch": 2.1613222051394683, + "grad_norm": 2.245065927505493, + "learning_rate": 5e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7659870386123657, + "num_tokens": 509326512.0, + "step": 19681 + }, + { + "epoch": 2.161432022842082, + "grad_norm": 1.9596126079559326, + "learning_rate": 5e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7561801075935364, + "num_tokens": 509355680.0, + "step": 19682 + }, + { + "epoch": 2.161541840544696, + "grad_norm": 1.987082839012146, + "learning_rate": 5e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7591382265090942, + "num_tokens": 509384579.0, + "step": 19683 + }, + { + "epoch": 2.1616516582473095, + "grad_norm": 2.138065814971924, + "learning_rate": 5e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7623578310012817, + "num_tokens": 509408667.0, + "step": 19684 + }, + { + "epoch": 2.1617614759499233, + "grad_norm": 1.9206699132919312, + "learning_rate": 5e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7579132318496704, + "num_tokens": 509437902.0, + "step": 19685 + }, + { + "epoch": 2.1618712936525366, + "grad_norm": 1.97933828830719, + "learning_rate": 5e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7455046772956848, + "num_tokens": 509466885.0, + "step": 19686 + }, + { + "epoch": 2.1619811113551504, + "grad_norm": 1.9708495140075684, + "learning_rate": 5e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7486670613288879, + "num_tokens": 509495990.0, + "step": 19687 + }, + { + "epoch": 2.162090929057764, + "grad_norm": 2.056894302368164, + "learning_rate": 5e-06, + "loss": 0.6667, + "mean_token_accuracy": 0.7804238796234131, + "num_tokens": 509522800.0, + "step": 19688 + }, + { + "epoch": 2.162200746760378, + "grad_norm": 2.282975196838379, + "learning_rate": 5e-06, + "loss": 0.612, + "mean_token_accuracy": 0.7919216156005859, + "num_tokens": 509545888.0, + "step": 19689 + }, + { + "epoch": 2.1623105644629916, + "grad_norm": 2.197075366973877, + "learning_rate": 5e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7654953002929688, + "num_tokens": 509570978.0, + "step": 19690 + }, + { + "epoch": 2.162420382165605, + "grad_norm": 1.7624614238739014, + "learning_rate": 5e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.763130784034729, + "num_tokens": 509605071.0, + "step": 19691 + }, + { + "epoch": 2.1625301998682187, + "grad_norm": 2.075510263442993, + "learning_rate": 5e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7578272223472595, + "num_tokens": 509630067.0, + "step": 19692 + }, + { + "epoch": 2.1626400175708325, + "grad_norm": 2.5055811405181885, + "learning_rate": 5e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7644233703613281, + "num_tokens": 509651604.0, + "step": 19693 + }, + { + "epoch": 2.162749835273446, + "grad_norm": 2.014612913131714, + "learning_rate": 5e-06, + "loss": 0.5847, + "mean_token_accuracy": 0.8054807186126709, + "num_tokens": 509676858.0, + "step": 19694 + }, + { + "epoch": 2.16285965297606, + "grad_norm": 1.886989951133728, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7560590505599976, + "num_tokens": 509707198.0, + "step": 19695 + }, + { + "epoch": 2.1629694706786733, + "grad_norm": 2.1088507175445557, + "learning_rate": 5e-06, + "loss": 0.6741, + "mean_token_accuracy": 0.7780559062957764, + "num_tokens": 509731889.0, + "step": 19696 + }, + { + "epoch": 2.163079288381287, + "grad_norm": 2.4057657718658447, + "learning_rate": 5e-06, + "loss": 0.6147, + "mean_token_accuracy": 0.7891751527786255, + "num_tokens": 509753728.0, + "step": 19697 + }, + { + "epoch": 2.163189106083901, + "grad_norm": 2.3038949966430664, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7833423614501953, + "num_tokens": 509774909.0, + "step": 19698 + }, + { + "epoch": 2.1632989237865146, + "grad_norm": 2.359577178955078, + "learning_rate": 5e-06, + "loss": 0.6625, + "mean_token_accuracy": 0.7791823148727417, + "num_tokens": 509796199.0, + "step": 19699 + }, + { + "epoch": 2.163408741489128, + "grad_norm": 2.1913437843322754, + "learning_rate": 5e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7735217809677124, + "num_tokens": 509820552.0, + "step": 19700 + }, + { + "epoch": 2.1635185591917416, + "grad_norm": 2.0413520336151123, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7525468468666077, + "num_tokens": 509849127.0, + "step": 19701 + }, + { + "epoch": 2.1636283768943554, + "grad_norm": 1.9641867876052856, + "learning_rate": 5e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7752453684806824, + "num_tokens": 509876546.0, + "step": 19702 + }, + { + "epoch": 2.163738194596969, + "grad_norm": 2.1282198429107666, + "learning_rate": 5e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7553799152374268, + "num_tokens": 509902858.0, + "step": 19703 + }, + { + "epoch": 2.1638480122995825, + "grad_norm": 2.1747822761535645, + "learning_rate": 5e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8057813048362732, + "num_tokens": 509925274.0, + "step": 19704 + }, + { + "epoch": 2.163957830002196, + "grad_norm": 1.9799726009368896, + "learning_rate": 5e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7543870806694031, + "num_tokens": 509954833.0, + "step": 19705 + }, + { + "epoch": 2.16406764770481, + "grad_norm": 1.9748674631118774, + "learning_rate": 5e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7681146860122681, + "num_tokens": 509981005.0, + "step": 19706 + }, + { + "epoch": 2.1641774654074237, + "grad_norm": 1.9213097095489502, + "learning_rate": 5e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7546769380569458, + "num_tokens": 510012742.0, + "step": 19707 + }, + { + "epoch": 2.1642872831100375, + "grad_norm": 2.2330846786499023, + "learning_rate": 5e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.7800068855285645, + "num_tokens": 510036441.0, + "step": 19708 + }, + { + "epoch": 2.164397100812651, + "grad_norm": 2.1267411708831787, + "learning_rate": 5e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7596070766448975, + "num_tokens": 510064104.0, + "step": 19709 + }, + { + "epoch": 2.1645069185152646, + "grad_norm": 2.0563695430755615, + "learning_rate": 5e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.7763991951942444, + "num_tokens": 510091014.0, + "step": 19710 + }, + { + "epoch": 2.1646167362178783, + "grad_norm": 2.2550435066223145, + "learning_rate": 5e-06, + "loss": 0.6654, + "mean_token_accuracy": 0.7741290330886841, + "num_tokens": 510111252.0, + "step": 19711 + }, + { + "epoch": 2.164726553920492, + "grad_norm": 1.8298437595367432, + "learning_rate": 5e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7580864429473877, + "num_tokens": 510142454.0, + "step": 19712 + }, + { + "epoch": 2.164836371623106, + "grad_norm": 2.0938098430633545, + "learning_rate": 5e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7736328840255737, + "num_tokens": 510168275.0, + "step": 19713 + }, + { + "epoch": 2.164946189325719, + "grad_norm": 2.2128818035125732, + "learning_rate": 5e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.7967376708984375, + "num_tokens": 510191016.0, + "step": 19714 + }, + { + "epoch": 2.165056007028333, + "grad_norm": 2.2733795642852783, + "learning_rate": 5e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.7633605003356934, + "num_tokens": 510213385.0, + "step": 19715 + }, + { + "epoch": 2.1651658247309467, + "grad_norm": 2.0067365169525146, + "learning_rate": 5e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7478955984115601, + "num_tokens": 510240490.0, + "step": 19716 + }, + { + "epoch": 2.1652756424335604, + "grad_norm": 2.1216816902160645, + "learning_rate": 5e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7662990689277649, + "num_tokens": 510266290.0, + "step": 19717 + }, + { + "epoch": 2.165385460136174, + "grad_norm": 1.946961522102356, + "learning_rate": 5e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.783963680267334, + "num_tokens": 510293009.0, + "step": 19718 + }, + { + "epoch": 2.1654952778387875, + "grad_norm": 1.9136338233947754, + "learning_rate": 5e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7465318441390991, + "num_tokens": 510325050.0, + "step": 19719 + }, + { + "epoch": 2.1656050955414012, + "grad_norm": 1.7083427906036377, + "learning_rate": 5e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.737877607345581, + "num_tokens": 510360344.0, + "step": 19720 + }, + { + "epoch": 2.165714913244015, + "grad_norm": 2.1427035331726074, + "learning_rate": 5e-06, + "loss": 0.6633, + "mean_token_accuracy": 0.7785706520080566, + "num_tokens": 510386624.0, + "step": 19721 + }, + { + "epoch": 2.1658247309466288, + "grad_norm": 2.0744264125823975, + "learning_rate": 5e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.7780434489250183, + "num_tokens": 510413816.0, + "step": 19722 + }, + { + "epoch": 2.165934548649242, + "grad_norm": 2.094064712524414, + "learning_rate": 5e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7777154445648193, + "num_tokens": 510439469.0, + "step": 19723 + }, + { + "epoch": 2.166044366351856, + "grad_norm": 1.9622974395751953, + "learning_rate": 5e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7759770154953003, + "num_tokens": 510468180.0, + "step": 19724 + }, + { + "epoch": 2.1661541840544696, + "grad_norm": 1.811224341392517, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.764845609664917, + "num_tokens": 510500307.0, + "step": 19725 + }, + { + "epoch": 2.1662640017570833, + "grad_norm": 1.9794055223464966, + "learning_rate": 5e-06, + "loss": 0.6612, + "mean_token_accuracy": 0.7801820039749146, + "num_tokens": 510529766.0, + "step": 19726 + }, + { + "epoch": 2.166373819459697, + "grad_norm": 2.1961002349853516, + "learning_rate": 5e-06, + "loss": 0.6618, + "mean_token_accuracy": 0.7808928489685059, + "num_tokens": 510553484.0, + "step": 19727 + }, + { + "epoch": 2.1664836371623104, + "grad_norm": 2.127063512802124, + "learning_rate": 5e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7496175765991211, + "num_tokens": 510579814.0, + "step": 19728 + }, + { + "epoch": 2.166593454864924, + "grad_norm": 2.141538619995117, + "learning_rate": 5e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7607699036598206, + "num_tokens": 510607659.0, + "step": 19729 + }, + { + "epoch": 2.166703272567538, + "grad_norm": 2.030174493789673, + "learning_rate": 5e-06, + "loss": 0.672, + "mean_token_accuracy": 0.7772679328918457, + "num_tokens": 510635661.0, + "step": 19730 + }, + { + "epoch": 2.1668130902701517, + "grad_norm": 2.316333770751953, + "learning_rate": 5e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7720199227333069, + "num_tokens": 510656142.0, + "step": 19731 + }, + { + "epoch": 2.166922907972765, + "grad_norm": 1.8823236227035522, + "learning_rate": 5e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7614800930023193, + "num_tokens": 510684574.0, + "step": 19732 + }, + { + "epoch": 2.1670327256753787, + "grad_norm": 1.998673677444458, + "learning_rate": 5e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7446461319923401, + "num_tokens": 510715360.0, + "step": 19733 + }, + { + "epoch": 2.1671425433779925, + "grad_norm": 2.1201670169830322, + "learning_rate": 5e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7575869560241699, + "num_tokens": 510741872.0, + "step": 19734 + }, + { + "epoch": 2.1672523610806063, + "grad_norm": 1.9124958515167236, + "learning_rate": 5e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.759437620639801, + "num_tokens": 510773325.0, + "step": 19735 + }, + { + "epoch": 2.16736217878322, + "grad_norm": 1.8856940269470215, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7738820314407349, + "num_tokens": 510802809.0, + "step": 19736 + }, + { + "epoch": 2.1674719964858333, + "grad_norm": 1.914507508277893, + "learning_rate": 5e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7755905389785767, + "num_tokens": 510832776.0, + "step": 19737 + }, + { + "epoch": 2.167581814188447, + "grad_norm": 2.261592149734497, + "learning_rate": 5e-06, + "loss": 0.6776, + "mean_token_accuracy": 0.7789671421051025, + "num_tokens": 510853846.0, + "step": 19738 + }, + { + "epoch": 2.167691631891061, + "grad_norm": 2.125544786453247, + "learning_rate": 5e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7837886810302734, + "num_tokens": 510879098.0, + "step": 19739 + }, + { + "epoch": 2.1678014495936746, + "grad_norm": 2.075538396835327, + "learning_rate": 5e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7611367702484131, + "num_tokens": 510908068.0, + "step": 19740 + }, + { + "epoch": 2.1679112672962884, + "grad_norm": 2.212800979614258, + "learning_rate": 5e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7653541564941406, + "num_tokens": 510934142.0, + "step": 19741 + }, + { + "epoch": 2.1680210849989017, + "grad_norm": 2.3511412143707275, + "learning_rate": 5e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7712817788124084, + "num_tokens": 510955470.0, + "step": 19742 + }, + { + "epoch": 2.1681309027015154, + "grad_norm": 2.0616507530212402, + "learning_rate": 5e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7563619613647461, + "num_tokens": 510984254.0, + "step": 19743 + }, + { + "epoch": 2.168240720404129, + "grad_norm": 1.9395068883895874, + "learning_rate": 5e-06, + "loss": 0.6641, + "mean_token_accuracy": 0.7769179940223694, + "num_tokens": 511010261.0, + "step": 19744 + }, + { + "epoch": 2.168350538106743, + "grad_norm": 2.3753788471221924, + "learning_rate": 5e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.7811639308929443, + "num_tokens": 511030521.0, + "step": 19745 + }, + { + "epoch": 2.1684603558093567, + "grad_norm": 2.193537712097168, + "learning_rate": 5e-06, + "loss": 0.6125, + "mean_token_accuracy": 0.7906508445739746, + "num_tokens": 511055875.0, + "step": 19746 + }, + { + "epoch": 2.16857017351197, + "grad_norm": 2.1667823791503906, + "learning_rate": 5e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7519071102142334, + "num_tokens": 511083248.0, + "step": 19747 + }, + { + "epoch": 2.1686799912145838, + "grad_norm": 1.9927581548690796, + "learning_rate": 5e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7710467576980591, + "num_tokens": 511111442.0, + "step": 19748 + }, + { + "epoch": 2.1687898089171975, + "grad_norm": 1.9086931943893433, + "learning_rate": 5e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.7703886032104492, + "num_tokens": 511142109.0, + "step": 19749 + }, + { + "epoch": 2.1688996266198113, + "grad_norm": 1.89146888256073, + "learning_rate": 5e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.7808293104171753, + "num_tokens": 511170324.0, + "step": 19750 + }, + { + "epoch": 2.1690094443224246, + "grad_norm": 1.9707435369491577, + "learning_rate": 5e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7600769996643066, + "num_tokens": 511200919.0, + "step": 19751 + }, + { + "epoch": 2.1691192620250384, + "grad_norm": 2.05033540725708, + "learning_rate": 5e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7561067342758179, + "num_tokens": 511227222.0, + "step": 19752 + }, + { + "epoch": 2.169229079727652, + "grad_norm": 2.2005650997161865, + "learning_rate": 5e-06, + "loss": 0.6647, + "mean_token_accuracy": 0.7751577496528625, + "num_tokens": 511251052.0, + "step": 19753 + }, + { + "epoch": 2.169338897430266, + "grad_norm": 1.943603277206421, + "learning_rate": 5e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.753013014793396, + "num_tokens": 511281974.0, + "step": 19754 + }, + { + "epoch": 2.1694487151328796, + "grad_norm": 2.044468879699707, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7704639434814453, + "num_tokens": 511311151.0, + "step": 19755 + }, + { + "epoch": 2.169558532835493, + "grad_norm": 2.1278738975524902, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7483662366867065, + "num_tokens": 511337736.0, + "step": 19756 + }, + { + "epoch": 2.1696683505381067, + "grad_norm": 2.3265268802642822, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7632226943969727, + "num_tokens": 511359553.0, + "step": 19757 + }, + { + "epoch": 2.1697781682407205, + "grad_norm": 1.870079755783081, + "learning_rate": 5e-06, + "loss": 0.711, + "mean_token_accuracy": 0.7620658278465271, + "num_tokens": 511391624.0, + "step": 19758 + }, + { + "epoch": 2.169887985943334, + "grad_norm": 2.2109463214874268, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7691749334335327, + "num_tokens": 511416019.0, + "step": 19759 + }, + { + "epoch": 2.1699978036459475, + "grad_norm": 1.9160455465316772, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7630966901779175, + "num_tokens": 511445175.0, + "step": 19760 + }, + { + "epoch": 2.1701076213485613, + "grad_norm": 2.2249226570129395, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.764293909072876, + "num_tokens": 511470747.0, + "step": 19761 + }, + { + "epoch": 2.170217439051175, + "grad_norm": 1.7970515489578247, + "learning_rate": 5e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.7440153360366821, + "num_tokens": 511504878.0, + "step": 19762 + }, + { + "epoch": 2.170327256753789, + "grad_norm": 2.1193981170654297, + "learning_rate": 5e-06, + "loss": 0.759, + "mean_token_accuracy": 0.7512243390083313, + "num_tokens": 511531227.0, + "step": 19763 + }, + { + "epoch": 2.1704370744564025, + "grad_norm": 2.0004637241363525, + "learning_rate": 5e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.769882321357727, + "num_tokens": 511557508.0, + "step": 19764 + }, + { + "epoch": 2.170546892159016, + "grad_norm": 2.1172327995300293, + "learning_rate": 5e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.768104076385498, + "num_tokens": 511583392.0, + "step": 19765 + }, + { + "epoch": 2.1706567098616296, + "grad_norm": 2.374204158782959, + "learning_rate": 5e-06, + "loss": 0.6091, + "mean_token_accuracy": 0.7916718125343323, + "num_tokens": 511601953.0, + "step": 19766 + }, + { + "epoch": 2.1707665275642434, + "grad_norm": 2.0255959033966064, + "learning_rate": 5e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.7977846264839172, + "num_tokens": 511626340.0, + "step": 19767 + }, + { + "epoch": 2.170876345266857, + "grad_norm": 2.073293685913086, + "learning_rate": 5e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7605389356613159, + "num_tokens": 511652545.0, + "step": 19768 + }, + { + "epoch": 2.170986162969471, + "grad_norm": 2.17834210395813, + "learning_rate": 5e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7557527422904968, + "num_tokens": 511677895.0, + "step": 19769 + }, + { + "epoch": 2.171095980672084, + "grad_norm": 2.2861547470092773, + "learning_rate": 5e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7697926759719849, + "num_tokens": 511700183.0, + "step": 19770 + }, + { + "epoch": 2.171205798374698, + "grad_norm": 2.0501351356506348, + "learning_rate": 5e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7445182800292969, + "num_tokens": 511729846.0, + "step": 19771 + }, + { + "epoch": 2.1713156160773117, + "grad_norm": 2.1405417919158936, + "learning_rate": 5e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7570335865020752, + "num_tokens": 511754899.0, + "step": 19772 + }, + { + "epoch": 2.1714254337799255, + "grad_norm": 2.132936954498291, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7794749736785889, + "num_tokens": 511777617.0, + "step": 19773 + }, + { + "epoch": 2.171535251482539, + "grad_norm": 2.452145576477051, + "learning_rate": 5e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.7837991118431091, + "num_tokens": 511797714.0, + "step": 19774 + }, + { + "epoch": 2.1716450691851525, + "grad_norm": 1.996037244796753, + "learning_rate": 5e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.7848786115646362, + "num_tokens": 511826078.0, + "step": 19775 + }, + { + "epoch": 2.1717548868877663, + "grad_norm": 2.134457588195801, + "learning_rate": 5e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7784693241119385, + "num_tokens": 511849600.0, + "step": 19776 + }, + { + "epoch": 2.17186470459038, + "grad_norm": 2.279247760772705, + "learning_rate": 5e-06, + "loss": 0.6772, + "mean_token_accuracy": 0.7784112095832825, + "num_tokens": 511871259.0, + "step": 19777 + }, + { + "epoch": 2.171974522292994, + "grad_norm": 2.051100254058838, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7706392407417297, + "num_tokens": 511897819.0, + "step": 19778 + }, + { + "epoch": 2.172084339995607, + "grad_norm": 2.25673246383667, + "learning_rate": 5e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.7840887904167175, + "num_tokens": 511920203.0, + "step": 19779 + }, + { + "epoch": 2.172194157698221, + "grad_norm": 1.953306794166565, + "learning_rate": 5e-06, + "loss": 0.664, + "mean_token_accuracy": 0.7805404663085938, + "num_tokens": 511948649.0, + "step": 19780 + }, + { + "epoch": 2.1723039754008346, + "grad_norm": 1.8844218254089355, + "learning_rate": 5e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7699527740478516, + "num_tokens": 511977063.0, + "step": 19781 + }, + { + "epoch": 2.1724137931034484, + "grad_norm": 2.0545692443847656, + "learning_rate": 5e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7845309972763062, + "num_tokens": 512003705.0, + "step": 19782 + }, + { + "epoch": 2.1725236108060617, + "grad_norm": 1.9659368991851807, + "learning_rate": 5e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.747216522693634, + "num_tokens": 512032647.0, + "step": 19783 + }, + { + "epoch": 2.1726334285086755, + "grad_norm": 2.004223585128784, + "learning_rate": 5e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7748225927352905, + "num_tokens": 512059610.0, + "step": 19784 + }, + { + "epoch": 2.1727432462112892, + "grad_norm": 2.1861252784729004, + "learning_rate": 5e-06, + "loss": 0.6504, + "mean_token_accuracy": 0.7835164666175842, + "num_tokens": 512081896.0, + "step": 19785 + }, + { + "epoch": 2.172853063913903, + "grad_norm": 2.069443464279175, + "learning_rate": 5e-06, + "loss": 0.6777, + "mean_token_accuracy": 0.7777940034866333, + "num_tokens": 512106955.0, + "step": 19786 + }, + { + "epoch": 2.1729628816165167, + "grad_norm": 1.921459436416626, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7722663283348083, + "num_tokens": 512136991.0, + "step": 19787 + }, + { + "epoch": 2.17307269931913, + "grad_norm": 2.133202314376831, + "learning_rate": 5e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7862410545349121, + "num_tokens": 512161395.0, + "step": 19788 + }, + { + "epoch": 2.173182517021744, + "grad_norm": 2.0790836811065674, + "learning_rate": 5e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7689617276191711, + "num_tokens": 512188362.0, + "step": 19789 + }, + { + "epoch": 2.1732923347243576, + "grad_norm": 1.98366379737854, + "learning_rate": 5e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7672725319862366, + "num_tokens": 512214380.0, + "step": 19790 + }, + { + "epoch": 2.1734021524269713, + "grad_norm": 2.039824962615967, + "learning_rate": 5e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.7782541513442993, + "num_tokens": 512245052.0, + "step": 19791 + }, + { + "epoch": 2.173511970129585, + "grad_norm": 2.3886802196502686, + "learning_rate": 5e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.7798514366149902, + "num_tokens": 512266051.0, + "step": 19792 + }, + { + "epoch": 2.1736217878321984, + "grad_norm": 2.3843588829040527, + "learning_rate": 5e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.7809758186340332, + "num_tokens": 512289000.0, + "step": 19793 + }, + { + "epoch": 2.173731605534812, + "grad_norm": 1.8736416101455688, + "learning_rate": 5e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7341457605361938, + "num_tokens": 512322087.0, + "step": 19794 + }, + { + "epoch": 2.173841423237426, + "grad_norm": 2.335491418838501, + "learning_rate": 5e-06, + "loss": 0.701, + "mean_token_accuracy": 0.770439863204956, + "num_tokens": 512344252.0, + "step": 19795 + }, + { + "epoch": 2.1739512409400397, + "grad_norm": 2.0752501487731934, + "learning_rate": 5e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7663354873657227, + "num_tokens": 512371800.0, + "step": 19796 + }, + { + "epoch": 2.1740610586426534, + "grad_norm": 2.1674764156341553, + "learning_rate": 5e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7684379816055298, + "num_tokens": 512397987.0, + "step": 19797 + }, + { + "epoch": 2.1741708763452667, + "grad_norm": 2.298049211502075, + "learning_rate": 5e-06, + "loss": 0.659, + "mean_token_accuracy": 0.7804426550865173, + "num_tokens": 512420296.0, + "step": 19798 + }, + { + "epoch": 2.1742806940478805, + "grad_norm": 2.29799485206604, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7836452722549438, + "num_tokens": 512440280.0, + "step": 19799 + }, + { + "epoch": 2.1743905117504942, + "grad_norm": 2.1083483695983887, + "learning_rate": 5e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7715528011322021, + "num_tokens": 512467648.0, + "step": 19800 + }, + { + "epoch": 2.174500329453108, + "grad_norm": 1.873421311378479, + "learning_rate": 5e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7598452568054199, + "num_tokens": 512498428.0, + "step": 19801 + }, + { + "epoch": 2.1746101471557213, + "grad_norm": 2.3474504947662354, + "learning_rate": 5e-06, + "loss": 0.6531, + "mean_token_accuracy": 0.7802795171737671, + "num_tokens": 512518501.0, + "step": 19802 + }, + { + "epoch": 2.174719964858335, + "grad_norm": 2.0128302574157715, + "learning_rate": 5e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.7688121795654297, + "num_tokens": 512545482.0, + "step": 19803 + }, + { + "epoch": 2.174829782560949, + "grad_norm": 2.0135338306427, + "learning_rate": 5e-06, + "loss": 0.6454, + "mean_token_accuracy": 0.7841521501541138, + "num_tokens": 512572910.0, + "step": 19804 + }, + { + "epoch": 2.1749396002635626, + "grad_norm": 1.9756138324737549, + "learning_rate": 5e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.779751718044281, + "num_tokens": 512600376.0, + "step": 19805 + }, + { + "epoch": 2.1750494179661763, + "grad_norm": 2.039414644241333, + "learning_rate": 5e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.7758397459983826, + "num_tokens": 512627160.0, + "step": 19806 + }, + { + "epoch": 2.1751592356687897, + "grad_norm": 1.9659901857376099, + "learning_rate": 5e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.7827290892601013, + "num_tokens": 512653585.0, + "step": 19807 + }, + { + "epoch": 2.1752690533714034, + "grad_norm": 2.1242880821228027, + "learning_rate": 5e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7517650127410889, + "num_tokens": 512681039.0, + "step": 19808 + }, + { + "epoch": 2.175378871074017, + "grad_norm": 1.8761720657348633, + "learning_rate": 5e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7847663164138794, + "num_tokens": 512710448.0, + "step": 19809 + }, + { + "epoch": 2.175488688776631, + "grad_norm": 2.127537727355957, + "learning_rate": 5e-06, + "loss": 0.653, + "mean_token_accuracy": 0.7860338687896729, + "num_tokens": 512736085.0, + "step": 19810 + }, + { + "epoch": 2.1755985064792442, + "grad_norm": 2.2212159633636475, + "learning_rate": 5e-06, + "loss": 0.6671, + "mean_token_accuracy": 0.7828112244606018, + "num_tokens": 512756897.0, + "step": 19811 + }, + { + "epoch": 2.175708324181858, + "grad_norm": 1.9100879430770874, + "learning_rate": 5e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.7832728028297424, + "num_tokens": 512782722.0, + "step": 19812 + }, + { + "epoch": 2.1758181418844718, + "grad_norm": 2.223011016845703, + "learning_rate": 5e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7761964797973633, + "num_tokens": 512808218.0, + "step": 19813 + }, + { + "epoch": 2.1759279595870855, + "grad_norm": 2.1547205448150635, + "learning_rate": 5e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8236177563667297, + "num_tokens": 512828066.0, + "step": 19814 + }, + { + "epoch": 2.1760377772896993, + "grad_norm": 1.9658936262130737, + "learning_rate": 5e-06, + "loss": 0.6734, + "mean_token_accuracy": 0.7754793167114258, + "num_tokens": 512854617.0, + "step": 19815 + }, + { + "epoch": 2.1761475949923126, + "grad_norm": 1.9015809297561646, + "learning_rate": 5e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7588196992874146, + "num_tokens": 512885993.0, + "step": 19816 + }, + { + "epoch": 2.1762574126949263, + "grad_norm": 2.2123939990997314, + "learning_rate": 5e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7727528810501099, + "num_tokens": 512911506.0, + "step": 19817 + }, + { + "epoch": 2.17636723039754, + "grad_norm": 2.06423282623291, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7517844438552856, + "num_tokens": 512940546.0, + "step": 19818 + }, + { + "epoch": 2.176477048100154, + "grad_norm": 1.980882167816162, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7649378180503845, + "num_tokens": 512969425.0, + "step": 19819 + }, + { + "epoch": 2.1765868658027676, + "grad_norm": 2.035526752471924, + "learning_rate": 5e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.7660741806030273, + "num_tokens": 513001782.0, + "step": 19820 + }, + { + "epoch": 2.176696683505381, + "grad_norm": 1.9879639148712158, + "learning_rate": 5e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7488821744918823, + "num_tokens": 513032131.0, + "step": 19821 + }, + { + "epoch": 2.1768065012079947, + "grad_norm": 2.311455726623535, + "learning_rate": 5e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.7957587242126465, + "num_tokens": 513052412.0, + "step": 19822 + }, + { + "epoch": 2.1769163189106084, + "grad_norm": 1.944685697555542, + "learning_rate": 5e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.7757395505905151, + "num_tokens": 513080481.0, + "step": 19823 + }, + { + "epoch": 2.177026136613222, + "grad_norm": 2.164146661758423, + "learning_rate": 5e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7653651237487793, + "num_tokens": 513106577.0, + "step": 19824 + }, + { + "epoch": 2.1771359543158355, + "grad_norm": 2.258030414581299, + "learning_rate": 5e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7635681629180908, + "num_tokens": 513128419.0, + "step": 19825 + }, + { + "epoch": 2.1772457720184493, + "grad_norm": 1.997718334197998, + "learning_rate": 5e-06, + "loss": 0.674, + "mean_token_accuracy": 0.7791461944580078, + "num_tokens": 513154446.0, + "step": 19826 + }, + { + "epoch": 2.177355589721063, + "grad_norm": 2.050133466720581, + "learning_rate": 5e-06, + "loss": 0.653, + "mean_token_accuracy": 0.7857875823974609, + "num_tokens": 513179313.0, + "step": 19827 + }, + { + "epoch": 2.177465407423677, + "grad_norm": 2.0012714862823486, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7745510339736938, + "num_tokens": 513205990.0, + "step": 19828 + }, + { + "epoch": 2.1775752251262905, + "grad_norm": 2.179718494415283, + "learning_rate": 5e-06, + "loss": 0.68, + "mean_token_accuracy": 0.7756770849227905, + "num_tokens": 513230047.0, + "step": 19829 + }, + { + "epoch": 2.177685042828904, + "grad_norm": 2.1362640857696533, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7770922183990479, + "num_tokens": 513255147.0, + "step": 19830 + }, + { + "epoch": 2.1777948605315176, + "grad_norm": 2.0531368255615234, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7660045623779297, + "num_tokens": 513283409.0, + "step": 19831 + }, + { + "epoch": 2.1779046782341314, + "grad_norm": 2.378856658935547, + "learning_rate": 5e-06, + "loss": 0.6523, + "mean_token_accuracy": 0.7805938720703125, + "num_tokens": 513303810.0, + "step": 19832 + }, + { + "epoch": 2.178014495936745, + "grad_norm": 2.3500120639801025, + "learning_rate": 5e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.7707425355911255, + "num_tokens": 513325952.0, + "step": 19833 + }, + { + "epoch": 2.1781243136393584, + "grad_norm": 2.2173144817352295, + "learning_rate": 5e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7621173858642578, + "num_tokens": 513349206.0, + "step": 19834 + }, + { + "epoch": 2.178234131341972, + "grad_norm": 2.0155415534973145, + "learning_rate": 5e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7687529921531677, + "num_tokens": 513375887.0, + "step": 19835 + }, + { + "epoch": 2.178343949044586, + "grad_norm": 2.1456339359283447, + "learning_rate": 5e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7602548599243164, + "num_tokens": 513402820.0, + "step": 19836 + }, + { + "epoch": 2.1784537667471997, + "grad_norm": 2.0206966400146484, + "learning_rate": 5e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7584803700447083, + "num_tokens": 513430663.0, + "step": 19837 + }, + { + "epoch": 2.1785635844498135, + "grad_norm": 2.111884832382202, + "learning_rate": 5e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.764030396938324, + "num_tokens": 513455574.0, + "step": 19838 + }, + { + "epoch": 2.1786734021524268, + "grad_norm": 1.981854796409607, + "learning_rate": 5e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.7723497748374939, + "num_tokens": 513482885.0, + "step": 19839 + }, + { + "epoch": 2.1787832198550405, + "grad_norm": 2.3911337852478027, + "learning_rate": 5e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.7756044864654541, + "num_tokens": 513502825.0, + "step": 19840 + }, + { + "epoch": 2.1788930375576543, + "grad_norm": 2.0610878467559814, + "learning_rate": 5e-06, + "loss": 0.6411, + "mean_token_accuracy": 0.7888873219490051, + "num_tokens": 513528454.0, + "step": 19841 + }, + { + "epoch": 2.179002855260268, + "grad_norm": 2.2659103870391846, + "learning_rate": 5e-06, + "loss": 0.6477, + "mean_token_accuracy": 0.7782613635063171, + "num_tokens": 513549654.0, + "step": 19842 + }, + { + "epoch": 2.179112672962882, + "grad_norm": 1.833731770515442, + "learning_rate": 5e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.7542309761047363, + "num_tokens": 513584701.0, + "step": 19843 + }, + { + "epoch": 2.179222490665495, + "grad_norm": 2.165600538253784, + "learning_rate": 5e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7751116156578064, + "num_tokens": 513610082.0, + "step": 19844 + }, + { + "epoch": 2.179332308368109, + "grad_norm": 1.9931080341339111, + "learning_rate": 5e-06, + "loss": 0.6483, + "mean_token_accuracy": 0.7856940031051636, + "num_tokens": 513638147.0, + "step": 19845 + }, + { + "epoch": 2.1794421260707226, + "grad_norm": 1.9972277879714966, + "learning_rate": 5e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7721964120864868, + "num_tokens": 513668926.0, + "step": 19846 + }, + { + "epoch": 2.1795519437733364, + "grad_norm": 2.2132599353790283, + "learning_rate": 5e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.780375599861145, + "num_tokens": 513691577.0, + "step": 19847 + }, + { + "epoch": 2.17966176147595, + "grad_norm": 2.024409770965576, + "learning_rate": 5e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7662152647972107, + "num_tokens": 513717855.0, + "step": 19848 + }, + { + "epoch": 2.1797715791785635, + "grad_norm": 2.185137987136841, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7609947323799133, + "num_tokens": 513744788.0, + "step": 19849 + }, + { + "epoch": 2.179881396881177, + "grad_norm": 2.2270278930664062, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7608753442764282, + "num_tokens": 513769114.0, + "step": 19850 + }, + { + "epoch": 2.179991214583791, + "grad_norm": 2.0306994915008545, + "learning_rate": 5e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.781548261642456, + "num_tokens": 513794285.0, + "step": 19851 + }, + { + "epoch": 2.1801010322864047, + "grad_norm": 2.007160186767578, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.751630961894989, + "num_tokens": 513824491.0, + "step": 19852 + }, + { + "epoch": 2.180210849989018, + "grad_norm": 2.2184760570526123, + "learning_rate": 5e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7719144225120544, + "num_tokens": 513848498.0, + "step": 19853 + }, + { + "epoch": 2.180320667691632, + "grad_norm": 2.2750661373138428, + "learning_rate": 5e-06, + "loss": 0.5836, + "mean_token_accuracy": 0.802716076374054, + "num_tokens": 513870700.0, + "step": 19854 + }, + { + "epoch": 2.1804304853942456, + "grad_norm": 1.9091907739639282, + "learning_rate": 5e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7611427903175354, + "num_tokens": 513899924.0, + "step": 19855 + }, + { + "epoch": 2.1805403030968593, + "grad_norm": 2.0010454654693604, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.777621865272522, + "num_tokens": 513925859.0, + "step": 19856 + }, + { + "epoch": 2.180650120799473, + "grad_norm": 2.096011161804199, + "learning_rate": 5e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7520749568939209, + "num_tokens": 513953717.0, + "step": 19857 + }, + { + "epoch": 2.1807599385020864, + "grad_norm": 1.950093150138855, + "learning_rate": 5e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7544058561325073, + "num_tokens": 513980857.0, + "step": 19858 + }, + { + "epoch": 2.1808697562047, + "grad_norm": 2.005762815475464, + "learning_rate": 5e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7670035362243652, + "num_tokens": 514009225.0, + "step": 19859 + }, + { + "epoch": 2.180979573907314, + "grad_norm": 2.280229091644287, + "learning_rate": 5e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.7990856170654297, + "num_tokens": 514029857.0, + "step": 19860 + }, + { + "epoch": 2.1810893916099277, + "grad_norm": 2.3191611766815186, + "learning_rate": 5e-06, + "loss": 0.6485, + "mean_token_accuracy": 0.782983124256134, + "num_tokens": 514051622.0, + "step": 19861 + }, + { + "epoch": 2.181199209312541, + "grad_norm": 1.9981738328933716, + "learning_rate": 5e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.7785144448280334, + "num_tokens": 514077827.0, + "step": 19862 + }, + { + "epoch": 2.1813090270151547, + "grad_norm": 2.1623687744140625, + "learning_rate": 5e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7691882252693176, + "num_tokens": 514102148.0, + "step": 19863 + }, + { + "epoch": 2.1814188447177685, + "grad_norm": 2.2140088081359863, + "learning_rate": 5e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7582078576087952, + "num_tokens": 514128631.0, + "step": 19864 + }, + { + "epoch": 2.1815286624203822, + "grad_norm": 2.065844774246216, + "learning_rate": 5e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7664200663566589, + "num_tokens": 514153475.0, + "step": 19865 + }, + { + "epoch": 2.181638480122996, + "grad_norm": 1.8174545764923096, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7689632177352905, + "num_tokens": 514187024.0, + "step": 19866 + }, + { + "epoch": 2.1817482978256093, + "grad_norm": 1.8982473611831665, + "learning_rate": 5e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7640154361724854, + "num_tokens": 514213848.0, + "step": 19867 + }, + { + "epoch": 2.181858115528223, + "grad_norm": 2.3606035709381104, + "learning_rate": 5e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.7852479219436646, + "num_tokens": 514233870.0, + "step": 19868 + }, + { + "epoch": 2.181967933230837, + "grad_norm": 2.0129332542419434, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7445803880691528, + "num_tokens": 514262653.0, + "step": 19869 + }, + { + "epoch": 2.1820777509334506, + "grad_norm": 1.909542441368103, + "learning_rate": 5e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7664132118225098, + "num_tokens": 514293589.0, + "step": 19870 + }, + { + "epoch": 2.1821875686360643, + "grad_norm": 2.0732357501983643, + "learning_rate": 5e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7605119943618774, + "num_tokens": 514321464.0, + "step": 19871 + }, + { + "epoch": 2.1822973863386776, + "grad_norm": 2.1723926067352295, + "learning_rate": 5e-06, + "loss": 0.5748, + "mean_token_accuracy": 0.808983564376831, + "num_tokens": 514343539.0, + "step": 19872 + }, + { + "epoch": 2.1824072040412914, + "grad_norm": 2.077866315841675, + "learning_rate": 5e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7563774585723877, + "num_tokens": 514368859.0, + "step": 19873 + }, + { + "epoch": 2.182517021743905, + "grad_norm": 1.970293641090393, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7626081705093384, + "num_tokens": 514397039.0, + "step": 19874 + }, + { + "epoch": 2.182626839446519, + "grad_norm": 2.117032527923584, + "learning_rate": 5e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7595565915107727, + "num_tokens": 514421666.0, + "step": 19875 + }, + { + "epoch": 2.1827366571491327, + "grad_norm": 1.9456974267959595, + "learning_rate": 5e-06, + "loss": 0.6263, + "mean_token_accuracy": 0.7920060157775879, + "num_tokens": 514449059.0, + "step": 19876 + }, + { + "epoch": 2.182846474851746, + "grad_norm": 2.171480417251587, + "learning_rate": 5e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7537662982940674, + "num_tokens": 514472800.0, + "step": 19877 + }, + { + "epoch": 2.1829562925543597, + "grad_norm": 2.015702486038208, + "learning_rate": 5e-06, + "loss": 0.6279, + "mean_token_accuracy": 0.7937557101249695, + "num_tokens": 514498447.0, + "step": 19878 + }, + { + "epoch": 2.1830661102569735, + "grad_norm": 2.1233508586883545, + "learning_rate": 5e-06, + "loss": 0.6683, + "mean_token_accuracy": 0.778490424156189, + "num_tokens": 514526467.0, + "step": 19879 + }, + { + "epoch": 2.1831759279595873, + "grad_norm": 2.2673141956329346, + "learning_rate": 5e-06, + "loss": 0.6523, + "mean_token_accuracy": 0.7799326181411743, + "num_tokens": 514549050.0, + "step": 19880 + }, + { + "epoch": 2.1832857456622006, + "grad_norm": 1.8806712627410889, + "learning_rate": 5e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.7534228563308716, + "num_tokens": 514580407.0, + "step": 19881 + }, + { + "epoch": 2.1833955633648143, + "grad_norm": 2.01236629486084, + "learning_rate": 5e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7285544276237488, + "num_tokens": 514610133.0, + "step": 19882 + }, + { + "epoch": 2.183505381067428, + "grad_norm": 2.217242956161499, + "learning_rate": 5e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.7793588638305664, + "num_tokens": 514634054.0, + "step": 19883 + }, + { + "epoch": 2.183615198770042, + "grad_norm": 1.9711542129516602, + "learning_rate": 5e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7614589333534241, + "num_tokens": 514662198.0, + "step": 19884 + }, + { + "epoch": 2.1837250164726556, + "grad_norm": 2.522117853164673, + "learning_rate": 5e-06, + "loss": 0.6592, + "mean_token_accuracy": 0.7885832190513611, + "num_tokens": 514680942.0, + "step": 19885 + }, + { + "epoch": 2.183834834175269, + "grad_norm": 2.3347930908203125, + "learning_rate": 5e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.7783646583557129, + "num_tokens": 514702208.0, + "step": 19886 + }, + { + "epoch": 2.1839446518778827, + "grad_norm": 2.3174655437469482, + "learning_rate": 5e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7707335948944092, + "num_tokens": 514725695.0, + "step": 19887 + }, + { + "epoch": 2.1840544695804964, + "grad_norm": 2.1989622116088867, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7685757875442505, + "num_tokens": 514749589.0, + "step": 19888 + }, + { + "epoch": 2.18416428728311, + "grad_norm": 2.141117572784424, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7695156335830688, + "num_tokens": 514774175.0, + "step": 19889 + }, + { + "epoch": 2.1842741049857235, + "grad_norm": 1.8965225219726562, + "learning_rate": 5e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7664063572883606, + "num_tokens": 514804378.0, + "step": 19890 + }, + { + "epoch": 2.1843839226883373, + "grad_norm": 1.9749339818954468, + "learning_rate": 5e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7530222535133362, + "num_tokens": 514832807.0, + "step": 19891 + }, + { + "epoch": 2.184493740390951, + "grad_norm": 2.253848075866699, + "learning_rate": 5e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.7760090827941895, + "num_tokens": 514857452.0, + "step": 19892 + }, + { + "epoch": 2.1846035580935648, + "grad_norm": 2.241924524307251, + "learning_rate": 5e-06, + "loss": 0.6101, + "mean_token_accuracy": 0.7940654754638672, + "num_tokens": 514879549.0, + "step": 19893 + }, + { + "epoch": 2.1847133757961785, + "grad_norm": 2.0256593227386475, + "learning_rate": 5e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7643725872039795, + "num_tokens": 514909392.0, + "step": 19894 + }, + { + "epoch": 2.184823193498792, + "grad_norm": 1.9479812383651733, + "learning_rate": 5e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7699796557426453, + "num_tokens": 514939109.0, + "step": 19895 + }, + { + "epoch": 2.1849330112014056, + "grad_norm": 1.9416424036026, + "learning_rate": 5e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7636494040489197, + "num_tokens": 514968120.0, + "step": 19896 + }, + { + "epoch": 2.1850428289040194, + "grad_norm": 2.2560901641845703, + "learning_rate": 5e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.7790609002113342, + "num_tokens": 514991984.0, + "step": 19897 + }, + { + "epoch": 2.185152646606633, + "grad_norm": 1.9764759540557861, + "learning_rate": 5e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7488457560539246, + "num_tokens": 515024608.0, + "step": 19898 + }, + { + "epoch": 2.185262464309247, + "grad_norm": 2.3498220443725586, + "learning_rate": 5e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.782823920249939, + "num_tokens": 515046985.0, + "step": 19899 + }, + { + "epoch": 2.18537228201186, + "grad_norm": 1.8923323154449463, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7649160623550415, + "num_tokens": 515080739.0, + "step": 19900 + }, + { + "epoch": 2.185482099714474, + "grad_norm": 2.2202141284942627, + "learning_rate": 5e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7620274424552917, + "num_tokens": 515107519.0, + "step": 19901 + }, + { + "epoch": 2.1855919174170877, + "grad_norm": 2.0851972103118896, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7632147073745728, + "num_tokens": 515132883.0, + "step": 19902 + }, + { + "epoch": 2.1857017351197014, + "grad_norm": 2.2139265537261963, + "learning_rate": 5e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.782279372215271, + "num_tokens": 515155370.0, + "step": 19903 + }, + { + "epoch": 2.1858115528223148, + "grad_norm": 2.0367982387542725, + "learning_rate": 5e-06, + "loss": 0.6189, + "mean_token_accuracy": 0.7906533479690552, + "num_tokens": 515182379.0, + "step": 19904 + }, + { + "epoch": 2.1859213705249285, + "grad_norm": 1.9719831943511963, + "learning_rate": 5e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7586899399757385, + "num_tokens": 515212285.0, + "step": 19905 + }, + { + "epoch": 2.1860311882275423, + "grad_norm": 2.1195435523986816, + "learning_rate": 5e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7676697373390198, + "num_tokens": 515238266.0, + "step": 19906 + }, + { + "epoch": 2.186141005930156, + "grad_norm": 2.1080105304718018, + "learning_rate": 5e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8132030963897705, + "num_tokens": 515261345.0, + "step": 19907 + }, + { + "epoch": 2.18625082363277, + "grad_norm": 2.028294324874878, + "learning_rate": 5e-06, + "loss": 0.7319, + "mean_token_accuracy": 0.761864185333252, + "num_tokens": 515287848.0, + "step": 19908 + }, + { + "epoch": 2.186360641335383, + "grad_norm": 1.955438256263733, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.776289701461792, + "num_tokens": 515317087.0, + "step": 19909 + }, + { + "epoch": 2.186470459037997, + "grad_norm": 2.1645052433013916, + "learning_rate": 5e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7407201528549194, + "num_tokens": 515343075.0, + "step": 19910 + }, + { + "epoch": 2.1865802767406106, + "grad_norm": 2.078274965286255, + "learning_rate": 5e-06, + "loss": 0.65, + "mean_token_accuracy": 0.7920675277709961, + "num_tokens": 515369238.0, + "step": 19911 + }, + { + "epoch": 2.1866900944432244, + "grad_norm": 2.2407784461975098, + "learning_rate": 5e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.7779787182807922, + "num_tokens": 515396549.0, + "step": 19912 + }, + { + "epoch": 2.1867999121458377, + "grad_norm": 1.7108619213104248, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.768819272518158, + "num_tokens": 515432400.0, + "step": 19913 + }, + { + "epoch": 2.1869097298484514, + "grad_norm": 2.1682755947113037, + "learning_rate": 5e-06, + "loss": 0.6621, + "mean_token_accuracy": 0.7851489782333374, + "num_tokens": 515455479.0, + "step": 19914 + }, + { + "epoch": 2.187019547551065, + "grad_norm": 1.9837175607681274, + "learning_rate": 5e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7410478591918945, + "num_tokens": 515486055.0, + "step": 19915 + }, + { + "epoch": 2.187129365253679, + "grad_norm": 1.922670602798462, + "learning_rate": 5e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7610599398612976, + "num_tokens": 515515332.0, + "step": 19916 + }, + { + "epoch": 2.1872391829562927, + "grad_norm": 2.222158908843994, + "learning_rate": 5e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.7826046347618103, + "num_tokens": 515537271.0, + "step": 19917 + }, + { + "epoch": 2.187349000658906, + "grad_norm": 2.0724306106567383, + "learning_rate": 5e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.7780367136001587, + "num_tokens": 515562062.0, + "step": 19918 + }, + { + "epoch": 2.18745881836152, + "grad_norm": 1.9235361814498901, + "learning_rate": 5e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7684556245803833, + "num_tokens": 515588704.0, + "step": 19919 + }, + { + "epoch": 2.1875686360641335, + "grad_norm": 2.1148548126220703, + "learning_rate": 5e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7752244472503662, + "num_tokens": 515612952.0, + "step": 19920 + }, + { + "epoch": 2.1876784537667473, + "grad_norm": 1.9610822200775146, + "learning_rate": 5e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7456403374671936, + "num_tokens": 515641173.0, + "step": 19921 + }, + { + "epoch": 2.187788271469361, + "grad_norm": 2.204317569732666, + "learning_rate": 5e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.7721502184867859, + "num_tokens": 515665056.0, + "step": 19922 + }, + { + "epoch": 2.1878980891719744, + "grad_norm": 1.9341638088226318, + "learning_rate": 5e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7555965185165405, + "num_tokens": 515694946.0, + "step": 19923 + }, + { + "epoch": 2.188007906874588, + "grad_norm": 1.9515470266342163, + "learning_rate": 5e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7460816502571106, + "num_tokens": 515725904.0, + "step": 19924 + }, + { + "epoch": 2.188117724577202, + "grad_norm": 2.021346092224121, + "learning_rate": 5e-06, + "loss": 0.6825, + "mean_token_accuracy": 0.7715201377868652, + "num_tokens": 515750031.0, + "step": 19925 + }, + { + "epoch": 2.1882275422798156, + "grad_norm": 1.8979980945587158, + "learning_rate": 5e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7574730515480042, + "num_tokens": 515777975.0, + "step": 19926 + }, + { + "epoch": 2.1883373599824294, + "grad_norm": 2.2489681243896484, + "learning_rate": 5e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.770528256893158, + "num_tokens": 515801497.0, + "step": 19927 + }, + { + "epoch": 2.1884471776850427, + "grad_norm": 2.0781517028808594, + "learning_rate": 5e-06, + "loss": 0.6343, + "mean_token_accuracy": 0.7908805012702942, + "num_tokens": 515827548.0, + "step": 19928 + }, + { + "epoch": 2.1885569953876565, + "grad_norm": 1.9244108200073242, + "learning_rate": 5e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7529612183570862, + "num_tokens": 515858714.0, + "step": 19929 + }, + { + "epoch": 2.1886668130902702, + "grad_norm": 1.8713730573654175, + "learning_rate": 5e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.7801226377487183, + "num_tokens": 515888548.0, + "step": 19930 + }, + { + "epoch": 2.188776630792884, + "grad_norm": 2.0096969604492188, + "learning_rate": 5e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7439302206039429, + "num_tokens": 515915579.0, + "step": 19931 + }, + { + "epoch": 2.1888864484954973, + "grad_norm": 2.2022178173065186, + "learning_rate": 5e-06, + "loss": 0.668, + "mean_token_accuracy": 0.782113790512085, + "num_tokens": 515938757.0, + "step": 19932 + }, + { + "epoch": 2.188996266198111, + "grad_norm": 2.261213541030884, + "learning_rate": 5e-06, + "loss": 0.5759, + "mean_token_accuracy": 0.8093147277832031, + "num_tokens": 515958379.0, + "step": 19933 + }, + { + "epoch": 2.189106083900725, + "grad_norm": 2.196397542953491, + "learning_rate": 5e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.7838096618652344, + "num_tokens": 515980985.0, + "step": 19934 + }, + { + "epoch": 2.1892159016033386, + "grad_norm": 2.040919780731201, + "learning_rate": 5e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7506245970726013, + "num_tokens": 516010357.0, + "step": 19935 + }, + { + "epoch": 2.1893257193059523, + "grad_norm": 2.2478621006011963, + "learning_rate": 5e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7752904891967773, + "num_tokens": 516033516.0, + "step": 19936 + }, + { + "epoch": 2.1894355370085656, + "grad_norm": 2.3024938106536865, + "learning_rate": 5e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.785180926322937, + "num_tokens": 516055407.0, + "step": 19937 + }, + { + "epoch": 2.1895453547111794, + "grad_norm": 2.409980058670044, + "learning_rate": 5e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7688446044921875, + "num_tokens": 516077887.0, + "step": 19938 + }, + { + "epoch": 2.189655172413793, + "grad_norm": 2.3105576038360596, + "learning_rate": 5e-06, + "loss": 0.705, + "mean_token_accuracy": 0.769320011138916, + "num_tokens": 516097751.0, + "step": 19939 + }, + { + "epoch": 2.189764990116407, + "grad_norm": 1.8820466995239258, + "learning_rate": 5e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7725932598114014, + "num_tokens": 516128518.0, + "step": 19940 + }, + { + "epoch": 2.18987480781902, + "grad_norm": 2.009376287460327, + "learning_rate": 5e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7634767293930054, + "num_tokens": 516156693.0, + "step": 19941 + }, + { + "epoch": 2.189984625521634, + "grad_norm": 1.924047827720642, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7763649225234985, + "num_tokens": 516188097.0, + "step": 19942 + }, + { + "epoch": 2.1900944432242477, + "grad_norm": 2.0016205310821533, + "learning_rate": 5e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.7704240679740906, + "num_tokens": 516215757.0, + "step": 19943 + }, + { + "epoch": 2.1902042609268615, + "grad_norm": 1.9983890056610107, + "learning_rate": 5e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7522635459899902, + "num_tokens": 516245184.0, + "step": 19944 + }, + { + "epoch": 2.1903140786294752, + "grad_norm": 1.957423210144043, + "learning_rate": 5e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7574009895324707, + "num_tokens": 516273742.0, + "step": 19945 + }, + { + "epoch": 2.1904238963320886, + "grad_norm": 2.1654059886932373, + "learning_rate": 5e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7704918384552002, + "num_tokens": 516299831.0, + "step": 19946 + }, + { + "epoch": 2.1905337140347023, + "grad_norm": 2.070519208908081, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7534854412078857, + "num_tokens": 516327810.0, + "step": 19947 + }, + { + "epoch": 2.190643531737316, + "grad_norm": 2.282325029373169, + "learning_rate": 5e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7740868926048279, + "num_tokens": 516351269.0, + "step": 19948 + }, + { + "epoch": 2.19075334943993, + "grad_norm": 1.750685691833496, + "learning_rate": 5e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7572168111801147, + "num_tokens": 516388963.0, + "step": 19949 + }, + { + "epoch": 2.1908631671425436, + "grad_norm": 1.993491768836975, + "learning_rate": 5e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.7499290704727173, + "num_tokens": 516418673.0, + "step": 19950 + }, + { + "epoch": 2.190972984845157, + "grad_norm": 1.9956655502319336, + "learning_rate": 5e-06, + "loss": 0.6454, + "mean_token_accuracy": 0.7850096821784973, + "num_tokens": 516444031.0, + "step": 19951 + }, + { + "epoch": 2.1910828025477707, + "grad_norm": 1.886643409729004, + "learning_rate": 5e-06, + "loss": 0.6696, + "mean_token_accuracy": 0.776005208492279, + "num_tokens": 516473639.0, + "step": 19952 + }, + { + "epoch": 2.1911926202503844, + "grad_norm": 2.0346527099609375, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7687891721725464, + "num_tokens": 516500461.0, + "step": 19953 + }, + { + "epoch": 2.191302437952998, + "grad_norm": 2.298312187194824, + "learning_rate": 5e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7565064430236816, + "num_tokens": 516521485.0, + "step": 19954 + }, + { + "epoch": 2.1914122556556115, + "grad_norm": 2.2196803092956543, + "learning_rate": 5e-06, + "loss": 0.6661, + "mean_token_accuracy": 0.7776167392730713, + "num_tokens": 516544193.0, + "step": 19955 + }, + { + "epoch": 2.1915220733582252, + "grad_norm": 2.2122457027435303, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7553229928016663, + "num_tokens": 516567783.0, + "step": 19956 + }, + { + "epoch": 2.191631891060839, + "grad_norm": 1.9531413316726685, + "learning_rate": 5e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7406328320503235, + "num_tokens": 516598326.0, + "step": 19957 + }, + { + "epoch": 2.1917417087634528, + "grad_norm": 2.0813519954681396, + "learning_rate": 5e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7772605419158936, + "num_tokens": 516623457.0, + "step": 19958 + }, + { + "epoch": 2.1918515264660665, + "grad_norm": 2.0185630321502686, + "learning_rate": 5e-06, + "loss": 0.701, + "mean_token_accuracy": 0.7743908166885376, + "num_tokens": 516651309.0, + "step": 19959 + }, + { + "epoch": 2.19196134416868, + "grad_norm": 2.291185140609741, + "learning_rate": 5e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7710428237915039, + "num_tokens": 516674557.0, + "step": 19960 + }, + { + "epoch": 2.1920711618712936, + "grad_norm": 1.958719253540039, + "learning_rate": 5e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7523012757301331, + "num_tokens": 516704530.0, + "step": 19961 + }, + { + "epoch": 2.1921809795739073, + "grad_norm": 1.9475394487380981, + "learning_rate": 5e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7717534899711609, + "num_tokens": 516732065.0, + "step": 19962 + }, + { + "epoch": 2.192290797276521, + "grad_norm": 1.867667317390442, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7593589425086975, + "num_tokens": 516764251.0, + "step": 19963 + }, + { + "epoch": 2.1924006149791344, + "grad_norm": 2.086313247680664, + "learning_rate": 5e-06, + "loss": 0.6216, + "mean_token_accuracy": 0.7922565340995789, + "num_tokens": 516788943.0, + "step": 19964 + }, + { + "epoch": 2.192510432681748, + "grad_norm": 2.4052064418792725, + "learning_rate": 5e-06, + "loss": 0.6556, + "mean_token_accuracy": 0.78162682056427, + "num_tokens": 516810854.0, + "step": 19965 + }, + { + "epoch": 2.192620250384362, + "grad_norm": 1.9194682836532593, + "learning_rate": 5e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7688157558441162, + "num_tokens": 516838867.0, + "step": 19966 + }, + { + "epoch": 2.1927300680869757, + "grad_norm": 1.941156268119812, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7577852010726929, + "num_tokens": 516871451.0, + "step": 19967 + }, + { + "epoch": 2.1928398857895894, + "grad_norm": 1.9376736879348755, + "learning_rate": 5e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7459152340888977, + "num_tokens": 516902805.0, + "step": 19968 + }, + { + "epoch": 2.1929497034922028, + "grad_norm": 2.0239100456237793, + "learning_rate": 5e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7641361951828003, + "num_tokens": 516929370.0, + "step": 19969 + }, + { + "epoch": 2.1930595211948165, + "grad_norm": 2.4137096405029297, + "learning_rate": 5e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.7911692261695862, + "num_tokens": 516948186.0, + "step": 19970 + }, + { + "epoch": 2.1931693388974303, + "grad_norm": 2.0914664268493652, + "learning_rate": 5e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7545727491378784, + "num_tokens": 516976983.0, + "step": 19971 + }, + { + "epoch": 2.193279156600044, + "grad_norm": 2.2012879848480225, + "learning_rate": 5e-06, + "loss": 0.6202, + "mean_token_accuracy": 0.793563961982727, + "num_tokens": 516997554.0, + "step": 19972 + }, + { + "epoch": 2.193388974302658, + "grad_norm": 2.1249327659606934, + "learning_rate": 5e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.7855323553085327, + "num_tokens": 517024194.0, + "step": 19973 + }, + { + "epoch": 2.193498792005271, + "grad_norm": 2.311885118484497, + "learning_rate": 5e-06, + "loss": 0.7007, + "mean_token_accuracy": 0.7675620317459106, + "num_tokens": 517045283.0, + "step": 19974 + }, + { + "epoch": 2.193608609707885, + "grad_norm": 2.647132396697998, + "learning_rate": 5e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7622246742248535, + "num_tokens": 517075722.0, + "step": 19975 + }, + { + "epoch": 2.1937184274104986, + "grad_norm": 1.919062614440918, + "learning_rate": 5e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7674760818481445, + "num_tokens": 517106759.0, + "step": 19976 + }, + { + "epoch": 2.1938282451131124, + "grad_norm": 2.170966148376465, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7700480222702026, + "num_tokens": 517129454.0, + "step": 19977 + }, + { + "epoch": 2.193938062815726, + "grad_norm": 1.9628225564956665, + "learning_rate": 5e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7666217088699341, + "num_tokens": 517160482.0, + "step": 19978 + }, + { + "epoch": 2.1940478805183394, + "grad_norm": 1.9253145456314087, + "learning_rate": 5e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7578442096710205, + "num_tokens": 517191031.0, + "step": 19979 + }, + { + "epoch": 2.194157698220953, + "grad_norm": 2.102328300476074, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7791622877120972, + "num_tokens": 517215580.0, + "step": 19980 + }, + { + "epoch": 2.194267515923567, + "grad_norm": 2.2457735538482666, + "learning_rate": 5e-06, + "loss": 0.6703, + "mean_token_accuracy": 0.7830671072006226, + "num_tokens": 517237998.0, + "step": 19981 + }, + { + "epoch": 2.1943773336261807, + "grad_norm": 2.1422970294952393, + "learning_rate": 5e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.7813739776611328, + "num_tokens": 517262735.0, + "step": 19982 + }, + { + "epoch": 2.194487151328794, + "grad_norm": 2.1164519786834717, + "learning_rate": 5e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7681744694709778, + "num_tokens": 517289201.0, + "step": 19983 + }, + { + "epoch": 2.1945969690314078, + "grad_norm": 2.0061757564544678, + "learning_rate": 5e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7671399116516113, + "num_tokens": 517316229.0, + "step": 19984 + }, + { + "epoch": 2.1947067867340215, + "grad_norm": 1.9774926900863647, + "learning_rate": 5e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7696483135223389, + "num_tokens": 517345846.0, + "step": 19985 + }, + { + "epoch": 2.1948166044366353, + "grad_norm": 2.0684118270874023, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7682641744613647, + "num_tokens": 517371207.0, + "step": 19986 + }, + { + "epoch": 2.194926422139249, + "grad_norm": 2.7220404148101807, + "learning_rate": 5e-06, + "loss": 0.6471, + "mean_token_accuracy": 0.7892780303955078, + "num_tokens": 517390073.0, + "step": 19987 + }, + { + "epoch": 2.1950362398418624, + "grad_norm": 2.0554966926574707, + "learning_rate": 5e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7612559795379639, + "num_tokens": 517417044.0, + "step": 19988 + }, + { + "epoch": 2.195146057544476, + "grad_norm": 2.204857587814331, + "learning_rate": 5e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.7883008718490601, + "num_tokens": 517439830.0, + "step": 19989 + }, + { + "epoch": 2.19525587524709, + "grad_norm": 2.096349000930786, + "learning_rate": 5e-06, + "loss": 0.7931, + "mean_token_accuracy": 0.7499574422836304, + "num_tokens": 517464844.0, + "step": 19990 + }, + { + "epoch": 2.1953656929497036, + "grad_norm": 2.1445202827453613, + "learning_rate": 5e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.7883986234664917, + "num_tokens": 517487229.0, + "step": 19991 + }, + { + "epoch": 2.195475510652317, + "grad_norm": 2.0654215812683105, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7526441812515259, + "num_tokens": 517516138.0, + "step": 19992 + }, + { + "epoch": 2.1955853283549307, + "grad_norm": 2.1277265548706055, + "learning_rate": 5e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.7723273634910583, + "num_tokens": 517540208.0, + "step": 19993 + }, + { + "epoch": 2.1956951460575445, + "grad_norm": 2.4372498989105225, + "learning_rate": 5e-06, + "loss": 0.656, + "mean_token_accuracy": 0.7847398519515991, + "num_tokens": 517561144.0, + "step": 19994 + }, + { + "epoch": 2.195804963760158, + "grad_norm": 2.235408306121826, + "learning_rate": 5e-06, + "loss": 0.6151, + "mean_token_accuracy": 0.7866053581237793, + "num_tokens": 517583824.0, + "step": 19995 + }, + { + "epoch": 2.195914781462772, + "grad_norm": 2.0919790267944336, + "learning_rate": 5e-06, + "loss": 0.6743, + "mean_token_accuracy": 0.7788208723068237, + "num_tokens": 517609407.0, + "step": 19996 + }, + { + "epoch": 2.1960245991653853, + "grad_norm": 2.1703834533691406, + "learning_rate": 5e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.777114987373352, + "num_tokens": 517633145.0, + "step": 19997 + }, + { + "epoch": 2.196134416867999, + "grad_norm": 2.088838815689087, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7609057426452637, + "num_tokens": 517656481.0, + "step": 19998 + }, + { + "epoch": 2.196244234570613, + "grad_norm": 2.234713315963745, + "learning_rate": 5e-06, + "loss": 0.6716, + "mean_token_accuracy": 0.7834168076515198, + "num_tokens": 517680919.0, + "step": 19999 + }, + { + "epoch": 2.1963540522732266, + "grad_norm": 1.9818296432495117, + "learning_rate": 5e-06, + "loss": 0.654, + "mean_token_accuracy": 0.7822399139404297, + "num_tokens": 517707654.0, + "step": 20000 + }, + { + "epoch": 2.1964638699758403, + "grad_norm": 2.0371005535125732, + "learning_rate": 5e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7633357048034668, + "num_tokens": 517735250.0, + "step": 20001 + }, + { + "epoch": 2.1965736876784536, + "grad_norm": 2.124730110168457, + "learning_rate": 5e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7529373168945312, + "num_tokens": 517761048.0, + "step": 20002 + }, + { + "epoch": 2.1966835053810674, + "grad_norm": 2.1919050216674805, + "learning_rate": 5e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.7739862203598022, + "num_tokens": 517785621.0, + "step": 20003 + }, + { + "epoch": 2.196793323083681, + "grad_norm": 2.170761823654175, + "learning_rate": 5e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.7714976668357849, + "num_tokens": 517809447.0, + "step": 20004 + }, + { + "epoch": 2.196903140786295, + "grad_norm": 2.4035685062408447, + "learning_rate": 5e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7782927751541138, + "num_tokens": 517831308.0, + "step": 20005 + }, + { + "epoch": 2.197012958488908, + "grad_norm": 2.0693411827087402, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7556170225143433, + "num_tokens": 517856911.0, + "step": 20006 + }, + { + "epoch": 2.197122776191522, + "grad_norm": 2.0875906944274902, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.767224907875061, + "num_tokens": 517883622.0, + "step": 20007 + }, + { + "epoch": 2.1972325938941357, + "grad_norm": 2.0029032230377197, + "learning_rate": 5e-06, + "loss": 0.707, + "mean_token_accuracy": 0.7614583969116211, + "num_tokens": 517911840.0, + "step": 20008 + }, + { + "epoch": 2.1973424115967495, + "grad_norm": 2.321786403656006, + "learning_rate": 5e-06, + "loss": 0.6665, + "mean_token_accuracy": 0.7875281572341919, + "num_tokens": 517934084.0, + "step": 20009 + }, + { + "epoch": 2.1974522292993632, + "grad_norm": 1.9826306104660034, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.774377167224884, + "num_tokens": 517963130.0, + "step": 20010 + }, + { + "epoch": 2.1975620470019765, + "grad_norm": 2.308436155319214, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7752054333686829, + "num_tokens": 517987338.0, + "step": 20011 + }, + { + "epoch": 2.1976718647045903, + "grad_norm": 1.873306393623352, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7671632766723633, + "num_tokens": 518018222.0, + "step": 20012 + }, + { + "epoch": 2.197781682407204, + "grad_norm": 2.4435253143310547, + "learning_rate": 5e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.7744188904762268, + "num_tokens": 518038704.0, + "step": 20013 + }, + { + "epoch": 2.197891500109818, + "grad_norm": 2.4176478385925293, + "learning_rate": 5e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.7913247346878052, + "num_tokens": 518057379.0, + "step": 20014 + }, + { + "epoch": 2.198001317812431, + "grad_norm": 2.1485657691955566, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7601554989814758, + "num_tokens": 518085291.0, + "step": 20015 + }, + { + "epoch": 2.198111135515045, + "grad_norm": 2.1821537017822266, + "learning_rate": 5e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7757224440574646, + "num_tokens": 518110375.0, + "step": 20016 + }, + { + "epoch": 2.1982209532176586, + "grad_norm": 2.0647497177124023, + "learning_rate": 5e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.7819880247116089, + "num_tokens": 518134233.0, + "step": 20017 + }, + { + "epoch": 2.1983307709202724, + "grad_norm": 2.3896279335021973, + "learning_rate": 5e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.7736489772796631, + "num_tokens": 518154078.0, + "step": 20018 + }, + { + "epoch": 2.198440588622886, + "grad_norm": 2.0827958583831787, + "learning_rate": 5e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.7695910334587097, + "num_tokens": 518178345.0, + "step": 20019 + }, + { + "epoch": 2.1985504063254995, + "grad_norm": 2.0341553688049316, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7623684406280518, + "num_tokens": 518204564.0, + "step": 20020 + }, + { + "epoch": 2.1986602240281132, + "grad_norm": 1.9660283327102661, + "learning_rate": 5e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7724725008010864, + "num_tokens": 518236327.0, + "step": 20021 + }, + { + "epoch": 2.198770041730727, + "grad_norm": 2.335948944091797, + "learning_rate": 5e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7771247625350952, + "num_tokens": 518258953.0, + "step": 20022 + }, + { + "epoch": 2.1988798594333407, + "grad_norm": 2.494913101196289, + "learning_rate": 5e-06, + "loss": 0.6286, + "mean_token_accuracy": 0.7849016785621643, + "num_tokens": 518278835.0, + "step": 20023 + }, + { + "epoch": 2.1989896771359545, + "grad_norm": 2.2280726432800293, + "learning_rate": 5e-06, + "loss": 0.6115, + "mean_token_accuracy": 0.7912657260894775, + "num_tokens": 518300849.0, + "step": 20024 + }, + { + "epoch": 2.199099494838568, + "grad_norm": 1.8908228874206543, + "learning_rate": 5e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7739635109901428, + "num_tokens": 518330747.0, + "step": 20025 + }, + { + "epoch": 2.1992093125411816, + "grad_norm": 2.028825044631958, + "learning_rate": 5e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7750107049942017, + "num_tokens": 518356383.0, + "step": 20026 + }, + { + "epoch": 2.1993191302437953, + "grad_norm": 2.165351390838623, + "learning_rate": 5e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7704878449440002, + "num_tokens": 518382748.0, + "step": 20027 + }, + { + "epoch": 2.199428947946409, + "grad_norm": 2.127208709716797, + "learning_rate": 5e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7642791271209717, + "num_tokens": 518407374.0, + "step": 20028 + }, + { + "epoch": 2.199538765649023, + "grad_norm": 2.2668707370758057, + "learning_rate": 5e-06, + "loss": 0.6455, + "mean_token_accuracy": 0.7826852798461914, + "num_tokens": 518429676.0, + "step": 20029 + }, + { + "epoch": 2.199648583351636, + "grad_norm": 2.3687660694122314, + "learning_rate": 5e-06, + "loss": 0.6439, + "mean_token_accuracy": 0.7871791124343872, + "num_tokens": 518449198.0, + "step": 20030 + }, + { + "epoch": 2.19975840105425, + "grad_norm": 2.0994341373443604, + "learning_rate": 5e-06, + "loss": 0.6522, + "mean_token_accuracy": 0.7845991849899292, + "num_tokens": 518473769.0, + "step": 20031 + }, + { + "epoch": 2.1998682187568637, + "grad_norm": 1.9531078338623047, + "learning_rate": 5e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.7764066457748413, + "num_tokens": 518501636.0, + "step": 20032 + }, + { + "epoch": 2.1999780364594774, + "grad_norm": 1.9422601461410522, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7523101568222046, + "num_tokens": 518532225.0, + "step": 20033 + }, + { + "epoch": 2.2000878541620907, + "grad_norm": 2.0642590522766113, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7601475119590759, + "num_tokens": 518560418.0, + "step": 20034 + }, + { + "epoch": 2.2001976718647045, + "grad_norm": 1.878129482269287, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7513579726219177, + "num_tokens": 518592332.0, + "step": 20035 + }, + { + "epoch": 2.2003074895673183, + "grad_norm": 2.1190552711486816, + "learning_rate": 5e-06, + "loss": 0.6526, + "mean_token_accuracy": 0.7951381802558899, + "num_tokens": 518615382.0, + "step": 20036 + }, + { + "epoch": 2.200417307269932, + "grad_norm": 2.170853853225708, + "learning_rate": 5e-06, + "loss": 0.6531, + "mean_token_accuracy": 0.7836840152740479, + "num_tokens": 518638366.0, + "step": 20037 + }, + { + "epoch": 2.2005271249725458, + "grad_norm": 2.174316644668579, + "learning_rate": 5e-06, + "loss": 0.6491, + "mean_token_accuracy": 0.7901571989059448, + "num_tokens": 518663512.0, + "step": 20038 + }, + { + "epoch": 2.200636942675159, + "grad_norm": 2.672050952911377, + "learning_rate": 5e-06, + "loss": 0.6097, + "mean_token_accuracy": 0.7941721677780151, + "num_tokens": 518681544.0, + "step": 20039 + }, + { + "epoch": 2.200746760377773, + "grad_norm": 2.3398399353027344, + "learning_rate": 5e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7749415636062622, + "num_tokens": 518703825.0, + "step": 20040 + }, + { + "epoch": 2.2008565780803866, + "grad_norm": 1.9044214487075806, + "learning_rate": 5e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7592924237251282, + "num_tokens": 518734425.0, + "step": 20041 + }, + { + "epoch": 2.2009663957830004, + "grad_norm": 1.8318482637405396, + "learning_rate": 5e-06, + "loss": 0.6692, + "mean_token_accuracy": 0.774975061416626, + "num_tokens": 518767170.0, + "step": 20042 + }, + { + "epoch": 2.2010762134856137, + "grad_norm": 2.3547205924987793, + "learning_rate": 5e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.7786288261413574, + "num_tokens": 518790190.0, + "step": 20043 + }, + { + "epoch": 2.2011860311882274, + "grad_norm": 2.358914613723755, + "learning_rate": 5e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.7861406803131104, + "num_tokens": 518810282.0, + "step": 20044 + }, + { + "epoch": 2.201295848890841, + "grad_norm": 2.0492782592773438, + "learning_rate": 5e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7630633115768433, + "num_tokens": 518836180.0, + "step": 20045 + }, + { + "epoch": 2.201405666593455, + "grad_norm": 2.034515857696533, + "learning_rate": 5e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.7891274690628052, + "num_tokens": 518862543.0, + "step": 20046 + }, + { + "epoch": 2.2015154842960687, + "grad_norm": 2.0335240364074707, + "learning_rate": 5e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7776445150375366, + "num_tokens": 518889738.0, + "step": 20047 + }, + { + "epoch": 2.201625301998682, + "grad_norm": 2.2246391773223877, + "learning_rate": 5e-06, + "loss": 0.6432, + "mean_token_accuracy": 0.7874918580055237, + "num_tokens": 518912680.0, + "step": 20048 + }, + { + "epoch": 2.2017351197012958, + "grad_norm": 2.192929744720459, + "learning_rate": 5e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7571370601654053, + "num_tokens": 518939662.0, + "step": 20049 + }, + { + "epoch": 2.2018449374039095, + "grad_norm": 2.2354323863983154, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7673352956771851, + "num_tokens": 518964151.0, + "step": 20050 + }, + { + "epoch": 2.2019547551065233, + "grad_norm": 1.9010432958602905, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7624636888504028, + "num_tokens": 518995185.0, + "step": 20051 + }, + { + "epoch": 2.202064572809137, + "grad_norm": 2.008634090423584, + "learning_rate": 5e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7505191564559937, + "num_tokens": 519024907.0, + "step": 20052 + }, + { + "epoch": 2.2021743905117503, + "grad_norm": 2.098973512649536, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7505825161933899, + "num_tokens": 519050375.0, + "step": 20053 + }, + { + "epoch": 2.202284208214364, + "grad_norm": 1.9109240770339966, + "learning_rate": 5e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7494333982467651, + "num_tokens": 519082509.0, + "step": 20054 + }, + { + "epoch": 2.202394025916978, + "grad_norm": 2.0973899364471436, + "learning_rate": 5e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.7756022810935974, + "num_tokens": 519108296.0, + "step": 20055 + }, + { + "epoch": 2.2025038436195916, + "grad_norm": 2.335733413696289, + "learning_rate": 5e-06, + "loss": 0.664, + "mean_token_accuracy": 0.7825027704238892, + "num_tokens": 519130168.0, + "step": 20056 + }, + { + "epoch": 2.2026136613222054, + "grad_norm": 2.1638476848602295, + "learning_rate": 5e-06, + "loss": 0.634, + "mean_token_accuracy": 0.789139449596405, + "num_tokens": 519154656.0, + "step": 20057 + }, + { + "epoch": 2.2027234790248187, + "grad_norm": 2.5551533699035645, + "learning_rate": 5e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.7917894124984741, + "num_tokens": 519173801.0, + "step": 20058 + }, + { + "epoch": 2.2028332967274324, + "grad_norm": 2.269285202026367, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7754818201065063, + "num_tokens": 519194109.0, + "step": 20059 + }, + { + "epoch": 2.202943114430046, + "grad_norm": 1.9249595403671265, + "learning_rate": 5e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7751283049583435, + "num_tokens": 519223098.0, + "step": 20060 + }, + { + "epoch": 2.20305293213266, + "grad_norm": 2.1749985218048096, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7779651880264282, + "num_tokens": 519245243.0, + "step": 20061 + }, + { + "epoch": 2.2031627498352733, + "grad_norm": 2.2800304889678955, + "learning_rate": 5e-06, + "loss": 0.655, + "mean_token_accuracy": 0.7822747230529785, + "num_tokens": 519267927.0, + "step": 20062 + }, + { + "epoch": 2.203272567537887, + "grad_norm": 1.952632188796997, + "learning_rate": 5e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7345523834228516, + "num_tokens": 519298779.0, + "step": 20063 + }, + { + "epoch": 2.203382385240501, + "grad_norm": 1.9291189908981323, + "learning_rate": 5e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.7772535681724548, + "num_tokens": 519327895.0, + "step": 20064 + }, + { + "epoch": 2.2034922029431145, + "grad_norm": 1.876248836517334, + "learning_rate": 5e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7714481949806213, + "num_tokens": 519358454.0, + "step": 20065 + }, + { + "epoch": 2.2036020206457283, + "grad_norm": 1.9800734519958496, + "learning_rate": 5e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7396359443664551, + "num_tokens": 519389168.0, + "step": 20066 + }, + { + "epoch": 2.2037118383483416, + "grad_norm": 2.3218250274658203, + "learning_rate": 5e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8063744306564331, + "num_tokens": 519408980.0, + "step": 20067 + }, + { + "epoch": 2.2038216560509554, + "grad_norm": 2.0698986053466797, + "learning_rate": 5e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.7739996910095215, + "num_tokens": 519439080.0, + "step": 20068 + }, + { + "epoch": 2.203931473753569, + "grad_norm": 2.107614278793335, + "learning_rate": 5e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.7988266944885254, + "num_tokens": 519461310.0, + "step": 20069 + }, + { + "epoch": 2.204041291456183, + "grad_norm": 2.0167531967163086, + "learning_rate": 5e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7628254294395447, + "num_tokens": 519485124.0, + "step": 20070 + }, + { + "epoch": 2.204151109158796, + "grad_norm": 2.1082963943481445, + "learning_rate": 5e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7576429843902588, + "num_tokens": 519512434.0, + "step": 20071 + }, + { + "epoch": 2.20426092686141, + "grad_norm": 2.201456069946289, + "learning_rate": 5e-06, + "loss": 0.6223, + "mean_token_accuracy": 0.7911489009857178, + "num_tokens": 519535091.0, + "step": 20072 + }, + { + "epoch": 2.2043707445640237, + "grad_norm": 2.1571974754333496, + "learning_rate": 5e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7498473525047302, + "num_tokens": 519562762.0, + "step": 20073 + }, + { + "epoch": 2.2044805622666375, + "grad_norm": 2.004464626312256, + "learning_rate": 5e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.7852851152420044, + "num_tokens": 519588640.0, + "step": 20074 + }, + { + "epoch": 2.2045903799692512, + "grad_norm": 1.9463140964508057, + "learning_rate": 5e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7707725763320923, + "num_tokens": 519616032.0, + "step": 20075 + }, + { + "epoch": 2.2047001976718645, + "grad_norm": 2.153693914413452, + "learning_rate": 5e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.7696471810340881, + "num_tokens": 519641257.0, + "step": 20076 + }, + { + "epoch": 2.2048100153744783, + "grad_norm": 1.9999566078186035, + "learning_rate": 5e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.7807778716087341, + "num_tokens": 519666809.0, + "step": 20077 + }, + { + "epoch": 2.204919833077092, + "grad_norm": 1.8940982818603516, + "learning_rate": 5e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7285153269767761, + "num_tokens": 519697108.0, + "step": 20078 + }, + { + "epoch": 2.205029650779706, + "grad_norm": 1.9196689128875732, + "learning_rate": 5e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7549782395362854, + "num_tokens": 519728800.0, + "step": 20079 + }, + { + "epoch": 2.2051394684823196, + "grad_norm": 2.1167848110198975, + "learning_rate": 5e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7722838521003723, + "num_tokens": 519753910.0, + "step": 20080 + }, + { + "epoch": 2.205249286184933, + "grad_norm": 2.034972667694092, + "learning_rate": 5e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7653716802597046, + "num_tokens": 519780650.0, + "step": 20081 + }, + { + "epoch": 2.2053591038875466, + "grad_norm": 1.8850592374801636, + "learning_rate": 5e-06, + "loss": 0.6355, + "mean_token_accuracy": 0.7886903285980225, + "num_tokens": 519809727.0, + "step": 20082 + }, + { + "epoch": 2.2054689215901604, + "grad_norm": 2.078848123550415, + "learning_rate": 5e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7697967290878296, + "num_tokens": 519836509.0, + "step": 20083 + }, + { + "epoch": 2.205578739292774, + "grad_norm": 1.9145866632461548, + "learning_rate": 5e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7520112991333008, + "num_tokens": 519867556.0, + "step": 20084 + }, + { + "epoch": 2.2056885569953875, + "grad_norm": 2.1755363941192627, + "learning_rate": 5e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7594179511070251, + "num_tokens": 519892825.0, + "step": 20085 + }, + { + "epoch": 2.205798374698001, + "grad_norm": 2.0667481422424316, + "learning_rate": 5e-06, + "loss": 0.659, + "mean_token_accuracy": 0.781748354434967, + "num_tokens": 519917745.0, + "step": 20086 + }, + { + "epoch": 2.205908192400615, + "grad_norm": 2.2007694244384766, + "learning_rate": 5e-06, + "loss": 0.6212, + "mean_token_accuracy": 0.7920085787773132, + "num_tokens": 519941902.0, + "step": 20087 + }, + { + "epoch": 2.2060180101032287, + "grad_norm": 2.0260653495788574, + "learning_rate": 5e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7662550210952759, + "num_tokens": 519969216.0, + "step": 20088 + }, + { + "epoch": 2.2061278278058425, + "grad_norm": 1.8879891633987427, + "learning_rate": 5e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7582939863204956, + "num_tokens": 519998377.0, + "step": 20089 + }, + { + "epoch": 2.206237645508456, + "grad_norm": 1.964606761932373, + "learning_rate": 5e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7638492584228516, + "num_tokens": 520028883.0, + "step": 20090 + }, + { + "epoch": 2.2063474632110696, + "grad_norm": 2.151801347732544, + "learning_rate": 5e-06, + "loss": 0.6036, + "mean_token_accuracy": 0.7922751307487488, + "num_tokens": 520051630.0, + "step": 20091 + }, + { + "epoch": 2.2064572809136833, + "grad_norm": 1.9608123302459717, + "learning_rate": 5e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.769761323928833, + "num_tokens": 520078499.0, + "step": 20092 + }, + { + "epoch": 2.206567098616297, + "grad_norm": 2.081739902496338, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7746191024780273, + "num_tokens": 520103136.0, + "step": 20093 + }, + { + "epoch": 2.2066769163189104, + "grad_norm": 2.2976794242858887, + "learning_rate": 5e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7646231651306152, + "num_tokens": 520126448.0, + "step": 20094 + }, + { + "epoch": 2.206786734021524, + "grad_norm": 1.9881404638290405, + "learning_rate": 5e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7565355896949768, + "num_tokens": 520158079.0, + "step": 20095 + }, + { + "epoch": 2.206896551724138, + "grad_norm": 1.9466382265090942, + "learning_rate": 5e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.7858706116676331, + "num_tokens": 520186514.0, + "step": 20096 + }, + { + "epoch": 2.2070063694267517, + "grad_norm": 2.07395601272583, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7705250978469849, + "num_tokens": 520212961.0, + "step": 20097 + }, + { + "epoch": 2.2071161871293654, + "grad_norm": 2.1988022327423096, + "learning_rate": 5e-06, + "loss": 0.78, + "mean_token_accuracy": 0.7511780261993408, + "num_tokens": 520236696.0, + "step": 20098 + }, + { + "epoch": 2.2072260048319787, + "grad_norm": 2.0726194381713867, + "learning_rate": 5e-06, + "loss": 0.6571, + "mean_token_accuracy": 0.7884700894355774, + "num_tokens": 520263578.0, + "step": 20099 + }, + { + "epoch": 2.2073358225345925, + "grad_norm": 2.0710291862487793, + "learning_rate": 5e-06, + "loss": 0.6608, + "mean_token_accuracy": 0.7792730927467346, + "num_tokens": 520291746.0, + "step": 20100 + }, + { + "epoch": 2.2074456402372062, + "grad_norm": 2.3412528038024902, + "learning_rate": 5e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7773104310035706, + "num_tokens": 520312063.0, + "step": 20101 + }, + { + "epoch": 2.20755545793982, + "grad_norm": 2.1043291091918945, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7764272689819336, + "num_tokens": 520338071.0, + "step": 20102 + }, + { + "epoch": 2.2076652756424338, + "grad_norm": 2.236217975616455, + "learning_rate": 5e-06, + "loss": 0.683, + "mean_token_accuracy": 0.7780355215072632, + "num_tokens": 520361784.0, + "step": 20103 + }, + { + "epoch": 2.207775093345047, + "grad_norm": 1.9930354356765747, + "learning_rate": 5e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7599722146987915, + "num_tokens": 520389188.0, + "step": 20104 + }, + { + "epoch": 2.207884911047661, + "grad_norm": 2.0480616092681885, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7602280378341675, + "num_tokens": 520416792.0, + "step": 20105 + }, + { + "epoch": 2.2079947287502746, + "grad_norm": 2.050637722015381, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7635107040405273, + "num_tokens": 520444732.0, + "step": 20106 + }, + { + "epoch": 2.2081045464528883, + "grad_norm": 2.258805751800537, + "learning_rate": 5e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.7786208391189575, + "num_tokens": 520466005.0, + "step": 20107 + }, + { + "epoch": 2.208214364155502, + "grad_norm": 2.0828139781951904, + "learning_rate": 5e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.7848873138427734, + "num_tokens": 520492821.0, + "step": 20108 + }, + { + "epoch": 2.2083241818581154, + "grad_norm": 2.0048186779022217, + "learning_rate": 5e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.7973884344100952, + "num_tokens": 520518834.0, + "step": 20109 + }, + { + "epoch": 2.208433999560729, + "grad_norm": 1.9532992839813232, + "learning_rate": 5e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7616592645645142, + "num_tokens": 520549992.0, + "step": 20110 + }, + { + "epoch": 2.208543817263343, + "grad_norm": 2.4089739322662354, + "learning_rate": 5e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.8023440837860107, + "num_tokens": 520571319.0, + "step": 20111 + }, + { + "epoch": 2.2086536349659567, + "grad_norm": 2.061465263366699, + "learning_rate": 5e-06, + "loss": 0.6301, + "mean_token_accuracy": 0.7885708808898926, + "num_tokens": 520593751.0, + "step": 20112 + }, + { + "epoch": 2.20876345266857, + "grad_norm": 1.982237458229065, + "learning_rate": 5e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7630259394645691, + "num_tokens": 520620823.0, + "step": 20113 + }, + { + "epoch": 2.2088732703711838, + "grad_norm": 2.254871368408203, + "learning_rate": 5e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7612740993499756, + "num_tokens": 520644148.0, + "step": 20114 + }, + { + "epoch": 2.2089830880737975, + "grad_norm": 2.120012044906616, + "learning_rate": 5e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7476093769073486, + "num_tokens": 520673028.0, + "step": 20115 + }, + { + "epoch": 2.2090929057764113, + "grad_norm": 2.409576892852783, + "learning_rate": 5e-06, + "loss": 0.6026, + "mean_token_accuracy": 0.7892006635665894, + "num_tokens": 520693074.0, + "step": 20116 + }, + { + "epoch": 2.209202723479025, + "grad_norm": 2.265104293823242, + "learning_rate": 5e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7533244490623474, + "num_tokens": 520716820.0, + "step": 20117 + }, + { + "epoch": 2.2093125411816383, + "grad_norm": 2.300551176071167, + "learning_rate": 5e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.7802493572235107, + "num_tokens": 520740372.0, + "step": 20118 + }, + { + "epoch": 2.209422358884252, + "grad_norm": 2.156855583190918, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7774290442466736, + "num_tokens": 520764700.0, + "step": 20119 + }, + { + "epoch": 2.209532176586866, + "grad_norm": 2.1538264751434326, + "learning_rate": 5e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7666788101196289, + "num_tokens": 520790220.0, + "step": 20120 + }, + { + "epoch": 2.2096419942894796, + "grad_norm": 2.14253306388855, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7531014084815979, + "num_tokens": 520818086.0, + "step": 20121 + }, + { + "epoch": 2.209751811992093, + "grad_norm": 2.0552735328674316, + "learning_rate": 5e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7495626211166382, + "num_tokens": 520846096.0, + "step": 20122 + }, + { + "epoch": 2.2098616296947067, + "grad_norm": 2.1943776607513428, + "learning_rate": 5e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.7807246446609497, + "num_tokens": 520869833.0, + "step": 20123 + }, + { + "epoch": 2.2099714473973204, + "grad_norm": 1.975494623184204, + "learning_rate": 5e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7503505945205688, + "num_tokens": 520895998.0, + "step": 20124 + }, + { + "epoch": 2.210081265099934, + "grad_norm": 2.0241172313690186, + "learning_rate": 5e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7708460092544556, + "num_tokens": 520920802.0, + "step": 20125 + }, + { + "epoch": 2.210191082802548, + "grad_norm": 2.3121542930603027, + "learning_rate": 5e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.7916061282157898, + "num_tokens": 520941873.0, + "step": 20126 + }, + { + "epoch": 2.2103009005051613, + "grad_norm": 2.201564311981201, + "learning_rate": 5e-06, + "loss": 0.6408, + "mean_token_accuracy": 0.7875945568084717, + "num_tokens": 520963387.0, + "step": 20127 + }, + { + "epoch": 2.210410718207775, + "grad_norm": 2.041801691055298, + "learning_rate": 5e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7525496482849121, + "num_tokens": 520989135.0, + "step": 20128 + }, + { + "epoch": 2.2105205359103888, + "grad_norm": 2.1042208671569824, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7726684808731079, + "num_tokens": 521013996.0, + "step": 20129 + }, + { + "epoch": 2.2106303536130025, + "grad_norm": 2.038757562637329, + "learning_rate": 5e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.7988215684890747, + "num_tokens": 521038355.0, + "step": 20130 + }, + { + "epoch": 2.2107401713156163, + "grad_norm": 2.5181691646575928, + "learning_rate": 5e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.7731155157089233, + "num_tokens": 521060622.0, + "step": 20131 + }, + { + "epoch": 2.2108499890182296, + "grad_norm": 2.19506573677063, + "learning_rate": 5e-06, + "loss": 0.6556, + "mean_token_accuracy": 0.780174970626831, + "num_tokens": 521084660.0, + "step": 20132 + }, + { + "epoch": 2.2109598067208434, + "grad_norm": 2.0959739685058594, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7715717554092407, + "num_tokens": 521110133.0, + "step": 20133 + }, + { + "epoch": 2.211069624423457, + "grad_norm": 2.1212306022644043, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7729165554046631, + "num_tokens": 521135773.0, + "step": 20134 + }, + { + "epoch": 2.211179442126071, + "grad_norm": 2.178546190261841, + "learning_rate": 5e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7705599069595337, + "num_tokens": 521160717.0, + "step": 20135 + }, + { + "epoch": 2.211289259828684, + "grad_norm": 2.3229258060455322, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.75974440574646, + "num_tokens": 521183339.0, + "step": 20136 + }, + { + "epoch": 2.211399077531298, + "grad_norm": 2.348040819168091, + "learning_rate": 5e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7621126174926758, + "num_tokens": 521207841.0, + "step": 20137 + }, + { + "epoch": 2.2115088952339117, + "grad_norm": 2.3853681087493896, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7744455337524414, + "num_tokens": 521229242.0, + "step": 20138 + }, + { + "epoch": 2.2116187129365255, + "grad_norm": 2.1208393573760986, + "learning_rate": 5e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.7662485837936401, + "num_tokens": 521255871.0, + "step": 20139 + }, + { + "epoch": 2.211728530639139, + "grad_norm": 2.34332537651062, + "learning_rate": 5e-06, + "loss": 0.6555, + "mean_token_accuracy": 0.7794530391693115, + "num_tokens": 521277231.0, + "step": 20140 + }, + { + "epoch": 2.2118383483417525, + "grad_norm": 1.9036331176757812, + "learning_rate": 5e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.746384859085083, + "num_tokens": 521310270.0, + "step": 20141 + }, + { + "epoch": 2.2119481660443663, + "grad_norm": 1.938049077987671, + "learning_rate": 5e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7610772848129272, + "num_tokens": 521340251.0, + "step": 20142 + }, + { + "epoch": 2.21205798374698, + "grad_norm": 2.0943305492401123, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7681816816329956, + "num_tokens": 521365973.0, + "step": 20143 + }, + { + "epoch": 2.212167801449594, + "grad_norm": 1.84293532371521, + "learning_rate": 5e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7629631161689758, + "num_tokens": 521395806.0, + "step": 20144 + }, + { + "epoch": 2.212277619152207, + "grad_norm": 2.1417157649993896, + "learning_rate": 5e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7644470930099487, + "num_tokens": 521422179.0, + "step": 20145 + }, + { + "epoch": 2.212387436854821, + "grad_norm": 2.172422170639038, + "learning_rate": 5e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.781562328338623, + "num_tokens": 521445226.0, + "step": 20146 + }, + { + "epoch": 2.2124972545574346, + "grad_norm": 2.0136160850524902, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7695817351341248, + "num_tokens": 521473875.0, + "step": 20147 + }, + { + "epoch": 2.2126070722600484, + "grad_norm": 2.145320415496826, + "learning_rate": 5e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7615969777107239, + "num_tokens": 521498590.0, + "step": 20148 + }, + { + "epoch": 2.212716889962662, + "grad_norm": 2.115095615386963, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7727954387664795, + "num_tokens": 521524510.0, + "step": 20149 + }, + { + "epoch": 2.2128267076652754, + "grad_norm": 2.047186851501465, + "learning_rate": 5e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7612540125846863, + "num_tokens": 521553245.0, + "step": 20150 + }, + { + "epoch": 2.212936525367889, + "grad_norm": 1.8995287418365479, + "learning_rate": 5e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.7794427871704102, + "num_tokens": 521584286.0, + "step": 20151 + }, + { + "epoch": 2.213046343070503, + "grad_norm": 1.9195548295974731, + "learning_rate": 5e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7625170350074768, + "num_tokens": 521614352.0, + "step": 20152 + }, + { + "epoch": 2.2131561607731167, + "grad_norm": 1.8875731229782104, + "learning_rate": 5e-06, + "loss": 0.6357, + "mean_token_accuracy": 0.7867857217788696, + "num_tokens": 521641056.0, + "step": 20153 + }, + { + "epoch": 2.2132659784757305, + "grad_norm": 2.0481488704681396, + "learning_rate": 5e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7680750489234924, + "num_tokens": 521668751.0, + "step": 20154 + }, + { + "epoch": 2.213375796178344, + "grad_norm": 1.9443961381912231, + "learning_rate": 5e-06, + "loss": 0.6671, + "mean_token_accuracy": 0.7770137786865234, + "num_tokens": 521698397.0, + "step": 20155 + }, + { + "epoch": 2.2134856138809575, + "grad_norm": 2.110934019088745, + "learning_rate": 5e-06, + "loss": 0.6729, + "mean_token_accuracy": 0.7742777466773987, + "num_tokens": 521721973.0, + "step": 20156 + }, + { + "epoch": 2.2135954315835713, + "grad_norm": 2.2688753604888916, + "learning_rate": 5e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.7415342330932617, + "num_tokens": 521747447.0, + "step": 20157 + }, + { + "epoch": 2.213705249286185, + "grad_norm": 2.1750173568725586, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.7731006741523743, + "num_tokens": 521772383.0, + "step": 20158 + }, + { + "epoch": 2.213815066988799, + "grad_norm": 2.1828582286834717, + "learning_rate": 5e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7738360166549683, + "num_tokens": 521798177.0, + "step": 20159 + }, + { + "epoch": 2.213924884691412, + "grad_norm": 1.8438540697097778, + "learning_rate": 5e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7573216557502747, + "num_tokens": 521829321.0, + "step": 20160 + }, + { + "epoch": 2.214034702394026, + "grad_norm": 2.2305471897125244, + "learning_rate": 5e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7629529237747192, + "num_tokens": 521857426.0, + "step": 20161 + }, + { + "epoch": 2.2141445200966396, + "grad_norm": 2.0348048210144043, + "learning_rate": 5e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7687798738479614, + "num_tokens": 521886063.0, + "step": 20162 + }, + { + "epoch": 2.2142543377992534, + "grad_norm": 2.0684027671813965, + "learning_rate": 5e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7555937767028809, + "num_tokens": 521913855.0, + "step": 20163 + }, + { + "epoch": 2.2143641555018667, + "grad_norm": 2.036689519882202, + "learning_rate": 5e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7823949456214905, + "num_tokens": 521940575.0, + "step": 20164 + }, + { + "epoch": 2.2144739732044805, + "grad_norm": 2.2903175354003906, + "learning_rate": 5e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.755662739276886, + "num_tokens": 521965451.0, + "step": 20165 + }, + { + "epoch": 2.2145837909070942, + "grad_norm": 1.9218741655349731, + "learning_rate": 5e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7691457271575928, + "num_tokens": 521997210.0, + "step": 20166 + }, + { + "epoch": 2.214693608609708, + "grad_norm": 2.0490293502807617, + "learning_rate": 5e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.7620625495910645, + "num_tokens": 522024623.0, + "step": 20167 + }, + { + "epoch": 2.2148034263123217, + "grad_norm": 2.176274061203003, + "learning_rate": 5e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.757506251335144, + "num_tokens": 522052306.0, + "step": 20168 + }, + { + "epoch": 2.214913244014935, + "grad_norm": 2.0040292739868164, + "learning_rate": 5e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7624457478523254, + "num_tokens": 522077891.0, + "step": 20169 + }, + { + "epoch": 2.215023061717549, + "grad_norm": 1.9355206489562988, + "learning_rate": 5e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7725189924240112, + "num_tokens": 522105578.0, + "step": 20170 + }, + { + "epoch": 2.2151328794201626, + "grad_norm": 2.0059542655944824, + "learning_rate": 5e-06, + "loss": 0.6147, + "mean_token_accuracy": 0.7990267872810364, + "num_tokens": 522131814.0, + "step": 20171 + }, + { + "epoch": 2.2152426971227763, + "grad_norm": 2.469569683074951, + "learning_rate": 5e-06, + "loss": 0.6551, + "mean_token_accuracy": 0.7839831709861755, + "num_tokens": 522150791.0, + "step": 20172 + }, + { + "epoch": 2.2153525148253896, + "grad_norm": 2.3319461345672607, + "learning_rate": 5e-06, + "loss": 0.6258, + "mean_token_accuracy": 0.7906487584114075, + "num_tokens": 522171431.0, + "step": 20173 + }, + { + "epoch": 2.2154623325280034, + "grad_norm": 2.0856773853302, + "learning_rate": 5e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.7690563201904297, + "num_tokens": 522197278.0, + "step": 20174 + }, + { + "epoch": 2.215572150230617, + "grad_norm": 2.058967113494873, + "learning_rate": 5e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7514902353286743, + "num_tokens": 522224526.0, + "step": 20175 + }, + { + "epoch": 2.215681967933231, + "grad_norm": 2.114748954772949, + "learning_rate": 5e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.750558078289032, + "num_tokens": 522250703.0, + "step": 20176 + }, + { + "epoch": 2.2157917856358447, + "grad_norm": 2.104151964187622, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7639579772949219, + "num_tokens": 522276950.0, + "step": 20177 + }, + { + "epoch": 2.215901603338458, + "grad_norm": 2.3561017513275146, + "learning_rate": 5e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7623516321182251, + "num_tokens": 522300309.0, + "step": 20178 + }, + { + "epoch": 2.2160114210410717, + "grad_norm": 1.8179923295974731, + "learning_rate": 5e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.7556571364402771, + "num_tokens": 522331492.0, + "step": 20179 + }, + { + "epoch": 2.2161212387436855, + "grad_norm": 2.142141103744507, + "learning_rate": 5e-06, + "loss": 0.6309, + "mean_token_accuracy": 0.7866284251213074, + "num_tokens": 522355727.0, + "step": 20180 + }, + { + "epoch": 2.2162310564462993, + "grad_norm": 1.9790382385253906, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7620147466659546, + "num_tokens": 522384766.0, + "step": 20181 + }, + { + "epoch": 2.216340874148913, + "grad_norm": 2.1142265796661377, + "learning_rate": 5e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7755361795425415, + "num_tokens": 522408931.0, + "step": 20182 + }, + { + "epoch": 2.2164506918515263, + "grad_norm": 2.0408778190612793, + "learning_rate": 5e-06, + "loss": 0.5477, + "mean_token_accuracy": 0.8067167401313782, + "num_tokens": 522433513.0, + "step": 20183 + }, + { + "epoch": 2.21656050955414, + "grad_norm": 2.1293325424194336, + "learning_rate": 5e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7715173959732056, + "num_tokens": 522458638.0, + "step": 20184 + }, + { + "epoch": 2.216670327256754, + "grad_norm": 2.3458073139190674, + "learning_rate": 5e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7794169187545776, + "num_tokens": 522479394.0, + "step": 20185 + }, + { + "epoch": 2.2167801449593676, + "grad_norm": 2.1836469173431396, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7761587500572205, + "num_tokens": 522504289.0, + "step": 20186 + }, + { + "epoch": 2.216889962661981, + "grad_norm": 2.40959095954895, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7661665678024292, + "num_tokens": 522524989.0, + "step": 20187 + }, + { + "epoch": 2.2169997803645947, + "grad_norm": 2.327240467071533, + "learning_rate": 5e-06, + "loss": 0.6539, + "mean_token_accuracy": 0.7778998613357544, + "num_tokens": 522547417.0, + "step": 20188 + }, + { + "epoch": 2.2171095980672084, + "grad_norm": 1.9493114948272705, + "learning_rate": 5e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.7870659828186035, + "num_tokens": 522578371.0, + "step": 20189 + }, + { + "epoch": 2.217219415769822, + "grad_norm": 1.894666075706482, + "learning_rate": 5e-06, + "loss": 0.6682, + "mean_token_accuracy": 0.7829161286354065, + "num_tokens": 522608156.0, + "step": 20190 + }, + { + "epoch": 2.217329233472436, + "grad_norm": 2.1027822494506836, + "learning_rate": 5e-06, + "loss": 0.6788, + "mean_token_accuracy": 0.7728084325790405, + "num_tokens": 522637144.0, + "step": 20191 + }, + { + "epoch": 2.2174390511750492, + "grad_norm": 1.9664771556854248, + "learning_rate": 5e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.7535971403121948, + "num_tokens": 522666610.0, + "step": 20192 + }, + { + "epoch": 2.217548868877663, + "grad_norm": 1.9691425561904907, + "learning_rate": 5e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.7842347025871277, + "num_tokens": 522691811.0, + "step": 20193 + }, + { + "epoch": 2.2176586865802768, + "grad_norm": 2.0785531997680664, + "learning_rate": 5e-06, + "loss": 0.6728, + "mean_token_accuracy": 0.7765907049179077, + "num_tokens": 522716061.0, + "step": 20194 + }, + { + "epoch": 2.2177685042828905, + "grad_norm": 2.03334641456604, + "learning_rate": 5e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.7775293588638306, + "num_tokens": 522742898.0, + "step": 20195 + }, + { + "epoch": 2.217878321985504, + "grad_norm": 1.9033501148223877, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7679564952850342, + "num_tokens": 522772295.0, + "step": 20196 + }, + { + "epoch": 2.2179881396881176, + "grad_norm": 2.1336822509765625, + "learning_rate": 5e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.780159056186676, + "num_tokens": 522796330.0, + "step": 20197 + }, + { + "epoch": 2.2180979573907313, + "grad_norm": 1.963411808013916, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7571632862091064, + "num_tokens": 522827672.0, + "step": 20198 + }, + { + "epoch": 2.218207775093345, + "grad_norm": 2.079350471496582, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7785631418228149, + "num_tokens": 522852874.0, + "step": 20199 + }, + { + "epoch": 2.218317592795959, + "grad_norm": 2.129199743270874, + "learning_rate": 5e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7614650130271912, + "num_tokens": 522878748.0, + "step": 20200 + }, + { + "epoch": 2.218427410498572, + "grad_norm": 2.1273574829101562, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7677359580993652, + "num_tokens": 522905915.0, + "step": 20201 + }, + { + "epoch": 2.218537228201186, + "grad_norm": 1.9806410074234009, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7632256150245667, + "num_tokens": 522937872.0, + "step": 20202 + }, + { + "epoch": 2.2186470459037997, + "grad_norm": 2.035159111022949, + "learning_rate": 5e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7720077037811279, + "num_tokens": 522964434.0, + "step": 20203 + }, + { + "epoch": 2.2187568636064134, + "grad_norm": 2.175734043121338, + "learning_rate": 5e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7527973651885986, + "num_tokens": 522993220.0, + "step": 20204 + }, + { + "epoch": 2.218866681309027, + "grad_norm": 2.1039156913757324, + "learning_rate": 5e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7783186435699463, + "num_tokens": 523017929.0, + "step": 20205 + }, + { + "epoch": 2.2189764990116405, + "grad_norm": 2.0813963413238525, + "learning_rate": 5e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7473429441452026, + "num_tokens": 523042501.0, + "step": 20206 + }, + { + "epoch": 2.2190863167142543, + "grad_norm": 2.2103240489959717, + "learning_rate": 5e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7590174674987793, + "num_tokens": 523070644.0, + "step": 20207 + }, + { + "epoch": 2.219196134416868, + "grad_norm": 1.986672282218933, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7664821743965149, + "num_tokens": 523098730.0, + "step": 20208 + }, + { + "epoch": 2.219305952119482, + "grad_norm": 2.354405641555786, + "learning_rate": 5e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.8011810779571533, + "num_tokens": 523118736.0, + "step": 20209 + }, + { + "epoch": 2.2194157698220955, + "grad_norm": 2.207469940185547, + "learning_rate": 5e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7582079172134399, + "num_tokens": 523143955.0, + "step": 20210 + }, + { + "epoch": 2.219525587524709, + "grad_norm": 2.2611265182495117, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7714648842811584, + "num_tokens": 523166578.0, + "step": 20211 + }, + { + "epoch": 2.2196354052273226, + "grad_norm": 1.978930950164795, + "learning_rate": 5e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.757215142250061, + "num_tokens": 523195255.0, + "step": 20212 + }, + { + "epoch": 2.2197452229299364, + "grad_norm": 2.114197015762329, + "learning_rate": 5e-06, + "loss": 0.6754, + "mean_token_accuracy": 0.7794123888015747, + "num_tokens": 523220469.0, + "step": 20213 + }, + { + "epoch": 2.21985504063255, + "grad_norm": 1.974349021911621, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7734203338623047, + "num_tokens": 523248713.0, + "step": 20214 + }, + { + "epoch": 2.2199648583351634, + "grad_norm": 2.101593017578125, + "learning_rate": 5e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7794466018676758, + "num_tokens": 523275493.0, + "step": 20215 + }, + { + "epoch": 2.220074676037777, + "grad_norm": 2.1732778549194336, + "learning_rate": 5e-06, + "loss": 0.6603, + "mean_token_accuracy": 0.7785183787345886, + "num_tokens": 523297611.0, + "step": 20216 + }, + { + "epoch": 2.220184493740391, + "grad_norm": 2.0690462589263916, + "learning_rate": 5e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7627360820770264, + "num_tokens": 523324533.0, + "step": 20217 + }, + { + "epoch": 2.2202943114430047, + "grad_norm": 2.4231536388397217, + "learning_rate": 5e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7615844011306763, + "num_tokens": 523350866.0, + "step": 20218 + }, + { + "epoch": 2.2204041291456185, + "grad_norm": 2.132845401763916, + "learning_rate": 5e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7792753577232361, + "num_tokens": 523374282.0, + "step": 20219 + }, + { + "epoch": 2.220513946848232, + "grad_norm": 2.048691749572754, + "learning_rate": 5e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.771530032157898, + "num_tokens": 523400724.0, + "step": 20220 + }, + { + "epoch": 2.2206237645508455, + "grad_norm": 2.032188653945923, + "learning_rate": 5e-06, + "loss": 0.6213, + "mean_token_accuracy": 0.7872945666313171, + "num_tokens": 523424824.0, + "step": 20221 + }, + { + "epoch": 2.2207335822534593, + "grad_norm": 2.236894130706787, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7710398435592651, + "num_tokens": 523450286.0, + "step": 20222 + }, + { + "epoch": 2.220843399956073, + "grad_norm": 2.0613460540771484, + "learning_rate": 5e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.768750786781311, + "num_tokens": 523475544.0, + "step": 20223 + }, + { + "epoch": 2.2209532176586864, + "grad_norm": 2.300740957260132, + "learning_rate": 5e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.7790061831474304, + "num_tokens": 523496623.0, + "step": 20224 + }, + { + "epoch": 2.2210630353613, + "grad_norm": 1.9061079025268555, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7795366644859314, + "num_tokens": 523523320.0, + "step": 20225 + }, + { + "epoch": 2.221172853063914, + "grad_norm": 1.9739142656326294, + "learning_rate": 5e-06, + "loss": 0.7169, + "mean_token_accuracy": 0.7686642408370972, + "num_tokens": 523550959.0, + "step": 20226 + }, + { + "epoch": 2.2212826707665276, + "grad_norm": 2.0415396690368652, + "learning_rate": 5e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.7677599191665649, + "num_tokens": 523578348.0, + "step": 20227 + }, + { + "epoch": 2.2213924884691414, + "grad_norm": 2.1955442428588867, + "learning_rate": 5e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7678845524787903, + "num_tokens": 523602940.0, + "step": 20228 + }, + { + "epoch": 2.2215023061717547, + "grad_norm": 1.9417716264724731, + "learning_rate": 5e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.8031244277954102, + "num_tokens": 523630475.0, + "step": 20229 + }, + { + "epoch": 2.2216121238743685, + "grad_norm": 2.158928632736206, + "learning_rate": 5e-06, + "loss": 0.7147, + "mean_token_accuracy": 0.7671434879302979, + "num_tokens": 523654641.0, + "step": 20230 + }, + { + "epoch": 2.221721941576982, + "grad_norm": 1.882834553718567, + "learning_rate": 5e-06, + "loss": 0.683, + "mean_token_accuracy": 0.7733935713768005, + "num_tokens": 523683768.0, + "step": 20231 + }, + { + "epoch": 2.221831759279596, + "grad_norm": 2.1269595623016357, + "learning_rate": 5e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7577427625656128, + "num_tokens": 523705974.0, + "step": 20232 + }, + { + "epoch": 2.2219415769822097, + "grad_norm": 2.244863510131836, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7807410955429077, + "num_tokens": 523732105.0, + "step": 20233 + }, + { + "epoch": 2.222051394684823, + "grad_norm": 2.021038293838501, + "learning_rate": 5e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.7556120157241821, + "num_tokens": 523759980.0, + "step": 20234 + }, + { + "epoch": 2.222161212387437, + "grad_norm": 2.252988576889038, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7730593085289001, + "num_tokens": 523785979.0, + "step": 20235 + }, + { + "epoch": 2.2222710300900506, + "grad_norm": 1.9197945594787598, + "learning_rate": 5e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7372959852218628, + "num_tokens": 523820139.0, + "step": 20236 + }, + { + "epoch": 2.2223808477926643, + "grad_norm": 2.101557731628418, + "learning_rate": 5e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.7658475637435913, + "num_tokens": 523846645.0, + "step": 20237 + }, + { + "epoch": 2.222490665495278, + "grad_norm": 2.312807321548462, + "learning_rate": 5e-06, + "loss": 0.624, + "mean_token_accuracy": 0.7881641983985901, + "num_tokens": 523867884.0, + "step": 20238 + }, + { + "epoch": 2.2226004831978914, + "grad_norm": 1.9613306522369385, + "learning_rate": 5e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7620068192481995, + "num_tokens": 523896690.0, + "step": 20239 + }, + { + "epoch": 2.222710300900505, + "grad_norm": 2.227550983428955, + "learning_rate": 5e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7570689916610718, + "num_tokens": 523921346.0, + "step": 20240 + }, + { + "epoch": 2.222820118603119, + "grad_norm": 2.165654182434082, + "learning_rate": 5e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7681170701980591, + "num_tokens": 523946247.0, + "step": 20241 + }, + { + "epoch": 2.2229299363057327, + "grad_norm": 2.1519381999969482, + "learning_rate": 5e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7721354365348816, + "num_tokens": 523971392.0, + "step": 20242 + }, + { + "epoch": 2.223039754008346, + "grad_norm": 2.1337926387786865, + "learning_rate": 5e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7702451944351196, + "num_tokens": 523994798.0, + "step": 20243 + }, + { + "epoch": 2.2231495717109597, + "grad_norm": 1.8303368091583252, + "learning_rate": 5e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7355278730392456, + "num_tokens": 524028890.0, + "step": 20244 + }, + { + "epoch": 2.2232593894135735, + "grad_norm": 2.220925807952881, + "learning_rate": 5e-06, + "loss": 0.673, + "mean_token_accuracy": 0.7765599489212036, + "num_tokens": 524051542.0, + "step": 20245 + }, + { + "epoch": 2.2233692071161872, + "grad_norm": 1.8952676057815552, + "learning_rate": 5e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7771238684654236, + "num_tokens": 524081011.0, + "step": 20246 + }, + { + "epoch": 2.223479024818801, + "grad_norm": 2.0763497352600098, + "learning_rate": 5e-06, + "loss": 0.6335, + "mean_token_accuracy": 0.7890808582305908, + "num_tokens": 524107669.0, + "step": 20247 + }, + { + "epoch": 2.2235888425214143, + "grad_norm": 2.293642044067383, + "learning_rate": 5e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7644562721252441, + "num_tokens": 524131772.0, + "step": 20248 + }, + { + "epoch": 2.223698660224028, + "grad_norm": 1.720880389213562, + "learning_rate": 5e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7563711404800415, + "num_tokens": 524167365.0, + "step": 20249 + }, + { + "epoch": 2.223808477926642, + "grad_norm": 1.9728292226791382, + "learning_rate": 5e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7833073735237122, + "num_tokens": 524193321.0, + "step": 20250 + }, + { + "epoch": 2.2239182956292556, + "grad_norm": 2.0690135955810547, + "learning_rate": 5e-06, + "loss": 0.6758, + "mean_token_accuracy": 0.777520477771759, + "num_tokens": 524219078.0, + "step": 20251 + }, + { + "epoch": 2.224028113331869, + "grad_norm": 1.8686352968215942, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7541124224662781, + "num_tokens": 524250219.0, + "step": 20252 + }, + { + "epoch": 2.2241379310344827, + "grad_norm": 2.400453805923462, + "learning_rate": 5e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7588510513305664, + "num_tokens": 524272370.0, + "step": 20253 + }, + { + "epoch": 2.2242477487370964, + "grad_norm": 2.485775947570801, + "learning_rate": 5e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.7839246988296509, + "num_tokens": 524292111.0, + "step": 20254 + }, + { + "epoch": 2.22435756643971, + "grad_norm": 1.9836134910583496, + "learning_rate": 5e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.76694655418396, + "num_tokens": 524318280.0, + "step": 20255 + }, + { + "epoch": 2.224467384142324, + "grad_norm": 1.9247647523880005, + "learning_rate": 5e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7734147310256958, + "num_tokens": 524343415.0, + "step": 20256 + }, + { + "epoch": 2.2245772018449372, + "grad_norm": 2.2789089679718018, + "learning_rate": 5e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.7910244464874268, + "num_tokens": 524364772.0, + "step": 20257 + }, + { + "epoch": 2.224687019547551, + "grad_norm": 1.993809461593628, + "learning_rate": 5e-06, + "loss": 0.7007, + "mean_token_accuracy": 0.7666252851486206, + "num_tokens": 524392032.0, + "step": 20258 + }, + { + "epoch": 2.2247968372501647, + "grad_norm": 1.879726529121399, + "learning_rate": 5e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7753224968910217, + "num_tokens": 524420307.0, + "step": 20259 + }, + { + "epoch": 2.2249066549527785, + "grad_norm": 2.1870779991149902, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7631653547286987, + "num_tokens": 524445796.0, + "step": 20260 + }, + { + "epoch": 2.2250164726553923, + "grad_norm": 2.0439751148223877, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7647206783294678, + "num_tokens": 524473333.0, + "step": 20261 + }, + { + "epoch": 2.2251262903580056, + "grad_norm": 2.004822015762329, + "learning_rate": 5e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7518309354782104, + "num_tokens": 524503210.0, + "step": 20262 + }, + { + "epoch": 2.2252361080606193, + "grad_norm": 2.0213983058929443, + "learning_rate": 5e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.7766129970550537, + "num_tokens": 524529101.0, + "step": 20263 + }, + { + "epoch": 2.225345925763233, + "grad_norm": 2.2215285301208496, + "learning_rate": 5e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7496869564056396, + "num_tokens": 524553807.0, + "step": 20264 + }, + { + "epoch": 2.225455743465847, + "grad_norm": 2.071061134338379, + "learning_rate": 5e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7736866474151611, + "num_tokens": 524580509.0, + "step": 20265 + }, + { + "epoch": 2.22556556116846, + "grad_norm": 1.91298246383667, + "learning_rate": 5e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7468389272689819, + "num_tokens": 524611850.0, + "step": 20266 + }, + { + "epoch": 2.225675378871074, + "grad_norm": 2.190592050552368, + "learning_rate": 5e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.7701992392539978, + "num_tokens": 524635650.0, + "step": 20267 + }, + { + "epoch": 2.2257851965736877, + "grad_norm": 2.038914203643799, + "learning_rate": 5e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7715336084365845, + "num_tokens": 524662803.0, + "step": 20268 + }, + { + "epoch": 2.2258950142763014, + "grad_norm": 1.994309663772583, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7618438005447388, + "num_tokens": 524691160.0, + "step": 20269 + }, + { + "epoch": 2.226004831978915, + "grad_norm": 2.163747549057007, + "learning_rate": 5e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7623864412307739, + "num_tokens": 524715695.0, + "step": 20270 + }, + { + "epoch": 2.2261146496815285, + "grad_norm": 1.946839451789856, + "learning_rate": 5e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7535730600357056, + "num_tokens": 524745451.0, + "step": 20271 + }, + { + "epoch": 2.2262244673841423, + "grad_norm": 2.2104218006134033, + "learning_rate": 5e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.7950783371925354, + "num_tokens": 524767918.0, + "step": 20272 + }, + { + "epoch": 2.226334285086756, + "grad_norm": 2.1215498447418213, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7536777257919312, + "num_tokens": 524794084.0, + "step": 20273 + }, + { + "epoch": 2.2264441027893698, + "grad_norm": 2.268120765686035, + "learning_rate": 5e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7760535478591919, + "num_tokens": 524815269.0, + "step": 20274 + }, + { + "epoch": 2.226553920491983, + "grad_norm": 2.1262335777282715, + "learning_rate": 5e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7688350677490234, + "num_tokens": 524840567.0, + "step": 20275 + }, + { + "epoch": 2.226663738194597, + "grad_norm": 2.2878780364990234, + "learning_rate": 5e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7820665240287781, + "num_tokens": 524860790.0, + "step": 20276 + }, + { + "epoch": 2.2267735558972106, + "grad_norm": 1.8946242332458496, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7790998220443726, + "num_tokens": 524889510.0, + "step": 20277 + }, + { + "epoch": 2.2268833735998244, + "grad_norm": 1.9000383615493774, + "learning_rate": 5e-06, + "loss": 0.6194, + "mean_token_accuracy": 0.7960582375526428, + "num_tokens": 524917067.0, + "step": 20278 + }, + { + "epoch": 2.226993191302438, + "grad_norm": 2.1357641220092773, + "learning_rate": 5e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7648127675056458, + "num_tokens": 524942939.0, + "step": 20279 + }, + { + "epoch": 2.2271030090050514, + "grad_norm": 2.1553704738616943, + "learning_rate": 5e-06, + "loss": 0.6466, + "mean_token_accuracy": 0.7914490699768066, + "num_tokens": 524968586.0, + "step": 20280 + }, + { + "epoch": 2.227212826707665, + "grad_norm": 2.3100552558898926, + "learning_rate": 5e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.776417076587677, + "num_tokens": 524992204.0, + "step": 20281 + }, + { + "epoch": 2.227322644410279, + "grad_norm": 1.9715217351913452, + "learning_rate": 5e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7631952166557312, + "num_tokens": 525019558.0, + "step": 20282 + }, + { + "epoch": 2.2274324621128927, + "grad_norm": 2.159681797027588, + "learning_rate": 5e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.775990903377533, + "num_tokens": 525043394.0, + "step": 20283 + }, + { + "epoch": 2.2275422798155065, + "grad_norm": 2.393402576446533, + "learning_rate": 5e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7758786082267761, + "num_tokens": 525066916.0, + "step": 20284 + }, + { + "epoch": 2.2276520975181198, + "grad_norm": 2.233295202255249, + "learning_rate": 5e-06, + "loss": 0.6836, + "mean_token_accuracy": 0.776723325252533, + "num_tokens": 525090438.0, + "step": 20285 + }, + { + "epoch": 2.2277619152207335, + "grad_norm": 1.884255051612854, + "learning_rate": 5e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7686290144920349, + "num_tokens": 525120534.0, + "step": 20286 + }, + { + "epoch": 2.2278717329233473, + "grad_norm": 2.330763339996338, + "learning_rate": 5e-06, + "loss": 0.707, + "mean_token_accuracy": 0.7703762054443359, + "num_tokens": 525143741.0, + "step": 20287 + }, + { + "epoch": 2.227981550625961, + "grad_norm": 2.1858036518096924, + "learning_rate": 5e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.7764469385147095, + "num_tokens": 525166838.0, + "step": 20288 + }, + { + "epoch": 2.228091368328575, + "grad_norm": 1.9118146896362305, + "learning_rate": 5e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.763399600982666, + "num_tokens": 525199074.0, + "step": 20289 + }, + { + "epoch": 2.228201186031188, + "grad_norm": 2.103043794631958, + "learning_rate": 5e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7827707529067993, + "num_tokens": 525223589.0, + "step": 20290 + }, + { + "epoch": 2.228311003733802, + "grad_norm": 1.9227641820907593, + "learning_rate": 5e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7543061971664429, + "num_tokens": 525251931.0, + "step": 20291 + }, + { + "epoch": 2.2284208214364156, + "grad_norm": 2.013195753097534, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7560127973556519, + "num_tokens": 525279518.0, + "step": 20292 + }, + { + "epoch": 2.2285306391390294, + "grad_norm": 1.8674428462982178, + "learning_rate": 5e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.758621096611023, + "num_tokens": 525310122.0, + "step": 20293 + }, + { + "epoch": 2.2286404568416427, + "grad_norm": 2.2473196983337402, + "learning_rate": 5e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8047187328338623, + "num_tokens": 525332397.0, + "step": 20294 + }, + { + "epoch": 2.2287502745442564, + "grad_norm": 2.1657211780548096, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7720836997032166, + "num_tokens": 525357844.0, + "step": 20295 + }, + { + "epoch": 2.22886009224687, + "grad_norm": 1.9718883037567139, + "learning_rate": 5e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7613263130187988, + "num_tokens": 525389609.0, + "step": 20296 + }, + { + "epoch": 2.228969909949484, + "grad_norm": 2.171370029449463, + "learning_rate": 5e-06, + "loss": 0.6325, + "mean_token_accuracy": 0.7881640195846558, + "num_tokens": 525413779.0, + "step": 20297 + }, + { + "epoch": 2.2290797276520977, + "grad_norm": 1.9973547458648682, + "learning_rate": 5e-06, + "loss": 0.762, + "mean_token_accuracy": 0.7499800324440002, + "num_tokens": 525443537.0, + "step": 20298 + }, + { + "epoch": 2.229189545354711, + "grad_norm": 1.9647501707077026, + "learning_rate": 5e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.760330319404602, + "num_tokens": 525473189.0, + "step": 20299 + }, + { + "epoch": 2.229299363057325, + "grad_norm": 2.161961078643799, + "learning_rate": 5e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.7842848300933838, + "num_tokens": 525495594.0, + "step": 20300 + }, + { + "epoch": 2.2294091807599385, + "grad_norm": 2.1119184494018555, + "learning_rate": 5e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.746487021446228, + "num_tokens": 525525794.0, + "step": 20301 + }, + { + "epoch": 2.2295189984625523, + "grad_norm": 2.220099925994873, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7652350664138794, + "num_tokens": 525551070.0, + "step": 20302 + }, + { + "epoch": 2.2296288161651656, + "grad_norm": 2.1151490211486816, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7678694725036621, + "num_tokens": 525575348.0, + "step": 20303 + }, + { + "epoch": 2.2297386338677794, + "grad_norm": 2.1272852420806885, + "learning_rate": 5e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.7633042931556702, + "num_tokens": 525601907.0, + "step": 20304 + }, + { + "epoch": 2.229848451570393, + "grad_norm": 1.7852731943130493, + "learning_rate": 5e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7431498765945435, + "num_tokens": 525635744.0, + "step": 20305 + }, + { + "epoch": 2.229958269273007, + "grad_norm": 1.8163816928863525, + "learning_rate": 5e-06, + "loss": 0.6512, + "mean_token_accuracy": 0.7826886773109436, + "num_tokens": 525664910.0, + "step": 20306 + }, + { + "epoch": 2.2300680869756206, + "grad_norm": 2.336559295654297, + "learning_rate": 5e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7577477693557739, + "num_tokens": 525687541.0, + "step": 20307 + }, + { + "epoch": 2.230177904678234, + "grad_norm": 2.1983041763305664, + "learning_rate": 5e-06, + "loss": 0.6754, + "mean_token_accuracy": 0.7807466387748718, + "num_tokens": 525710207.0, + "step": 20308 + }, + { + "epoch": 2.2302877223808477, + "grad_norm": 2.3998632431030273, + "learning_rate": 5e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7706663608551025, + "num_tokens": 525731327.0, + "step": 20309 + }, + { + "epoch": 2.2303975400834615, + "grad_norm": 2.0946638584136963, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7615658044815063, + "num_tokens": 525757795.0, + "step": 20310 + }, + { + "epoch": 2.2305073577860752, + "grad_norm": 1.958951473236084, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7572166919708252, + "num_tokens": 525786782.0, + "step": 20311 + }, + { + "epoch": 2.230617175488689, + "grad_norm": 1.8059006929397583, + "learning_rate": 5e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.7985430955886841, + "num_tokens": 525816905.0, + "step": 20312 + }, + { + "epoch": 2.2307269931913023, + "grad_norm": 2.006885528564453, + "learning_rate": 5e-06, + "loss": 0.602, + "mean_token_accuracy": 0.8019248247146606, + "num_tokens": 525841545.0, + "step": 20313 + }, + { + "epoch": 2.230836810893916, + "grad_norm": 1.9495347738265991, + "learning_rate": 5e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7317235469818115, + "num_tokens": 525874649.0, + "step": 20314 + }, + { + "epoch": 2.23094662859653, + "grad_norm": 1.8797179460525513, + "learning_rate": 5e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7614352107048035, + "num_tokens": 525904495.0, + "step": 20315 + }, + { + "epoch": 2.2310564462991436, + "grad_norm": 1.866417646408081, + "learning_rate": 5e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.762161374092102, + "num_tokens": 525936522.0, + "step": 20316 + }, + { + "epoch": 2.231166264001757, + "grad_norm": 2.0531725883483887, + "learning_rate": 5e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7637553215026855, + "num_tokens": 525963788.0, + "step": 20317 + }, + { + "epoch": 2.2312760817043706, + "grad_norm": 2.3698618412017822, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7753999829292297, + "num_tokens": 525987735.0, + "step": 20318 + }, + { + "epoch": 2.2313858994069844, + "grad_norm": 2.1357476711273193, + "learning_rate": 5e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7657558917999268, + "num_tokens": 526014743.0, + "step": 20319 + }, + { + "epoch": 2.231495717109598, + "grad_norm": 2.256525754928589, + "learning_rate": 5e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7666699886322021, + "num_tokens": 526036214.0, + "step": 20320 + }, + { + "epoch": 2.231605534812212, + "grad_norm": 2.331657648086548, + "learning_rate": 5e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7718414068222046, + "num_tokens": 526057205.0, + "step": 20321 + }, + { + "epoch": 2.2317153525148252, + "grad_norm": 2.015345573425293, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7572005987167358, + "num_tokens": 526087606.0, + "step": 20322 + }, + { + "epoch": 2.231825170217439, + "grad_norm": 2.107454538345337, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7599774599075317, + "num_tokens": 526114311.0, + "step": 20323 + }, + { + "epoch": 2.2319349879200527, + "grad_norm": 2.081613779067993, + "learning_rate": 5e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7663396596908569, + "num_tokens": 526140658.0, + "step": 20324 + }, + { + "epoch": 2.2320448056226665, + "grad_norm": 2.0665411949157715, + "learning_rate": 5e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7686734199523926, + "num_tokens": 526165235.0, + "step": 20325 + }, + { + "epoch": 2.23215462332528, + "grad_norm": 2.126133918762207, + "learning_rate": 5e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.7721915245056152, + "num_tokens": 526191820.0, + "step": 20326 + }, + { + "epoch": 2.2322644410278936, + "grad_norm": 2.201861619949341, + "learning_rate": 5e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7625407576560974, + "num_tokens": 526218700.0, + "step": 20327 + }, + { + "epoch": 2.2323742587305073, + "grad_norm": 2.137056589126587, + "learning_rate": 5e-06, + "loss": 0.6703, + "mean_token_accuracy": 0.775740921497345, + "num_tokens": 526244239.0, + "step": 20328 + }, + { + "epoch": 2.232484076433121, + "grad_norm": 2.159315586090088, + "learning_rate": 5e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.7731432318687439, + "num_tokens": 526266744.0, + "step": 20329 + }, + { + "epoch": 2.232593894135735, + "grad_norm": 2.002953290939331, + "learning_rate": 5e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.7604530453681946, + "num_tokens": 526294412.0, + "step": 20330 + }, + { + "epoch": 2.232703711838348, + "grad_norm": 2.1150920391082764, + "learning_rate": 5e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7591229677200317, + "num_tokens": 526321409.0, + "step": 20331 + }, + { + "epoch": 2.232813529540962, + "grad_norm": 1.940474033355713, + "learning_rate": 5e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7529157996177673, + "num_tokens": 526350927.0, + "step": 20332 + }, + { + "epoch": 2.2329233472435757, + "grad_norm": 2.1481924057006836, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7581436634063721, + "num_tokens": 526377553.0, + "step": 20333 + }, + { + "epoch": 2.2330331649461894, + "grad_norm": 2.1304426193237305, + "learning_rate": 5e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7623006701469421, + "num_tokens": 526401780.0, + "step": 20334 + }, + { + "epoch": 2.233142982648803, + "grad_norm": 2.244523048400879, + "learning_rate": 5e-06, + "loss": 0.6366, + "mean_token_accuracy": 0.792816162109375, + "num_tokens": 526423017.0, + "step": 20335 + }, + { + "epoch": 2.2332528003514165, + "grad_norm": 2.235344171524048, + "learning_rate": 5e-06, + "loss": 0.6388, + "mean_token_accuracy": 0.7941775321960449, + "num_tokens": 526445188.0, + "step": 20336 + }, + { + "epoch": 2.2333626180540302, + "grad_norm": 2.0484020709991455, + "learning_rate": 5e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7635897397994995, + "num_tokens": 526472526.0, + "step": 20337 + }, + { + "epoch": 2.233472435756644, + "grad_norm": 2.1912455558776855, + "learning_rate": 5e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.7636581659317017, + "num_tokens": 526497501.0, + "step": 20338 + }, + { + "epoch": 2.2335822534592578, + "grad_norm": 2.009503126144409, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7566558718681335, + "num_tokens": 526527060.0, + "step": 20339 + }, + { + "epoch": 2.2336920711618715, + "grad_norm": 2.3091976642608643, + "learning_rate": 5e-06, + "loss": 0.6836, + "mean_token_accuracy": 0.776208221912384, + "num_tokens": 526549644.0, + "step": 20340 + }, + { + "epoch": 2.233801888864485, + "grad_norm": 2.323063850402832, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7696508169174194, + "num_tokens": 526570545.0, + "step": 20341 + }, + { + "epoch": 2.2339117065670986, + "grad_norm": 2.081918716430664, + "learning_rate": 5e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7714307308197021, + "num_tokens": 526596761.0, + "step": 20342 + }, + { + "epoch": 2.2340215242697123, + "grad_norm": 2.041710376739502, + "learning_rate": 5e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7581640481948853, + "num_tokens": 526626127.0, + "step": 20343 + }, + { + "epoch": 2.234131341972326, + "grad_norm": 2.1015625, + "learning_rate": 5e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7591904401779175, + "num_tokens": 526653447.0, + "step": 20344 + }, + { + "epoch": 2.2342411596749394, + "grad_norm": 2.1025819778442383, + "learning_rate": 5e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7714753746986389, + "num_tokens": 526677319.0, + "step": 20345 + }, + { + "epoch": 2.234350977377553, + "grad_norm": 2.0727756023406982, + "learning_rate": 5e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.751349687576294, + "num_tokens": 526705302.0, + "step": 20346 + }, + { + "epoch": 2.234460795080167, + "grad_norm": 2.303196430206299, + "learning_rate": 5e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7729254961013794, + "num_tokens": 526728698.0, + "step": 20347 + }, + { + "epoch": 2.2345706127827807, + "grad_norm": 2.1490564346313477, + "learning_rate": 5e-06, + "loss": 0.6528, + "mean_token_accuracy": 0.7892395853996277, + "num_tokens": 526750817.0, + "step": 20348 + }, + { + "epoch": 2.2346804304853944, + "grad_norm": 2.296741247177124, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7586807012557983, + "num_tokens": 526774950.0, + "step": 20349 + }, + { + "epoch": 2.2347902481880078, + "grad_norm": 2.1091439723968506, + "learning_rate": 5e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.7872231006622314, + "num_tokens": 526799520.0, + "step": 20350 + }, + { + "epoch": 2.2349000658906215, + "grad_norm": 1.798870325088501, + "learning_rate": 5e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7306872010231018, + "num_tokens": 526834500.0, + "step": 20351 + }, + { + "epoch": 2.2350098835932353, + "grad_norm": 2.136460781097412, + "learning_rate": 5e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7686886787414551, + "num_tokens": 526859269.0, + "step": 20352 + }, + { + "epoch": 2.235119701295849, + "grad_norm": 2.1415650844573975, + "learning_rate": 5e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.743625819683075, + "num_tokens": 526883907.0, + "step": 20353 + }, + { + "epoch": 2.2352295189984623, + "grad_norm": 2.2778873443603516, + "learning_rate": 5e-06, + "loss": 0.6821, + "mean_token_accuracy": 0.7721048593521118, + "num_tokens": 526907451.0, + "step": 20354 + }, + { + "epoch": 2.235339336701076, + "grad_norm": 2.4323060512542725, + "learning_rate": 5e-06, + "loss": 0.718, + "mean_token_accuracy": 0.771142840385437, + "num_tokens": 526929656.0, + "step": 20355 + }, + { + "epoch": 2.23544915440369, + "grad_norm": 2.2300984859466553, + "learning_rate": 5e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.7726041078567505, + "num_tokens": 526952002.0, + "step": 20356 + }, + { + "epoch": 2.2355589721063036, + "grad_norm": 2.342416286468506, + "learning_rate": 5e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.7725074291229248, + "num_tokens": 526974864.0, + "step": 20357 + }, + { + "epoch": 2.2356687898089174, + "grad_norm": 1.7833539247512817, + "learning_rate": 5e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7690370678901672, + "num_tokens": 527006740.0, + "step": 20358 + }, + { + "epoch": 2.2357786075115307, + "grad_norm": 2.329704999923706, + "learning_rate": 5e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.7720336318016052, + "num_tokens": 527029986.0, + "step": 20359 + }, + { + "epoch": 2.2358884252141444, + "grad_norm": 2.50323486328125, + "learning_rate": 5e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8049776554107666, + "num_tokens": 527048286.0, + "step": 20360 + }, + { + "epoch": 2.235998242916758, + "grad_norm": 1.9425623416900635, + "learning_rate": 5e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7678039073944092, + "num_tokens": 527075886.0, + "step": 20361 + }, + { + "epoch": 2.236108060619372, + "grad_norm": 2.294687509536743, + "learning_rate": 5e-06, + "loss": 0.6445, + "mean_token_accuracy": 0.7910355925559998, + "num_tokens": 527096296.0, + "step": 20362 + }, + { + "epoch": 2.2362178783219857, + "grad_norm": 1.9893969297409058, + "learning_rate": 5e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7611725330352783, + "num_tokens": 527126308.0, + "step": 20363 + }, + { + "epoch": 2.236327696024599, + "grad_norm": 2.327568292617798, + "learning_rate": 5e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.7705518007278442, + "num_tokens": 527148138.0, + "step": 20364 + }, + { + "epoch": 2.236437513727213, + "grad_norm": 2.1343913078308105, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7563171982765198, + "num_tokens": 527175033.0, + "step": 20365 + }, + { + "epoch": 2.2365473314298265, + "grad_norm": 2.202965259552002, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7649230360984802, + "num_tokens": 527200190.0, + "step": 20366 + }, + { + "epoch": 2.2366571491324403, + "grad_norm": 2.2221922874450684, + "learning_rate": 5e-06, + "loss": 0.6485, + "mean_token_accuracy": 0.7833237648010254, + "num_tokens": 527225186.0, + "step": 20367 + }, + { + "epoch": 2.2367669668350536, + "grad_norm": 2.0187978744506836, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7615765333175659, + "num_tokens": 527253588.0, + "step": 20368 + }, + { + "epoch": 2.2368767845376674, + "grad_norm": 1.8716509342193604, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.772552490234375, + "num_tokens": 527281791.0, + "step": 20369 + }, + { + "epoch": 2.236986602240281, + "grad_norm": 2.1483445167541504, + "learning_rate": 5e-06, + "loss": 0.6373, + "mean_token_accuracy": 0.7774548530578613, + "num_tokens": 527305354.0, + "step": 20370 + }, + { + "epoch": 2.237096419942895, + "grad_norm": 2.136237859725952, + "learning_rate": 5e-06, + "loss": 0.659, + "mean_token_accuracy": 0.7803217172622681, + "num_tokens": 527331312.0, + "step": 20371 + }, + { + "epoch": 2.2372062376455086, + "grad_norm": 1.9379463195800781, + "learning_rate": 5e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7615848779678345, + "num_tokens": 527361277.0, + "step": 20372 + }, + { + "epoch": 2.237316055348122, + "grad_norm": 2.1570956707000732, + "learning_rate": 5e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7692506909370422, + "num_tokens": 527387281.0, + "step": 20373 + }, + { + "epoch": 2.2374258730507357, + "grad_norm": 2.194183111190796, + "learning_rate": 5e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7640317678451538, + "num_tokens": 527412683.0, + "step": 20374 + }, + { + "epoch": 2.2375356907533495, + "grad_norm": 2.369817018508911, + "learning_rate": 5e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.7922440767288208, + "num_tokens": 527434325.0, + "step": 20375 + }, + { + "epoch": 2.237645508455963, + "grad_norm": 1.9419647455215454, + "learning_rate": 5e-06, + "loss": 0.6587, + "mean_token_accuracy": 0.7780004143714905, + "num_tokens": 527462687.0, + "step": 20376 + }, + { + "epoch": 2.2377553261585765, + "grad_norm": 2.1567447185516357, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7479753494262695, + "num_tokens": 527490048.0, + "step": 20377 + }, + { + "epoch": 2.2378651438611903, + "grad_norm": 2.737002372741699, + "learning_rate": 5e-06, + "loss": 0.5893, + "mean_token_accuracy": 0.801601767539978, + "num_tokens": 527506257.0, + "step": 20378 + }, + { + "epoch": 2.237974961563804, + "grad_norm": 2.1449153423309326, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7821849584579468, + "num_tokens": 527531900.0, + "step": 20379 + }, + { + "epoch": 2.238084779266418, + "grad_norm": 2.066892147064209, + "learning_rate": 5e-06, + "loss": 0.6122, + "mean_token_accuracy": 0.7925461530685425, + "num_tokens": 527556819.0, + "step": 20380 + }, + { + "epoch": 2.2381945969690316, + "grad_norm": 2.310549736022949, + "learning_rate": 5e-06, + "loss": 0.691, + "mean_token_accuracy": 0.7755038738250732, + "num_tokens": 527578703.0, + "step": 20381 + }, + { + "epoch": 2.238304414671645, + "grad_norm": 2.0277533531188965, + "learning_rate": 5e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7627922296524048, + "num_tokens": 527605539.0, + "step": 20382 + }, + { + "epoch": 2.2384142323742586, + "grad_norm": 2.329848051071167, + "learning_rate": 5e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7565343379974365, + "num_tokens": 527629494.0, + "step": 20383 + }, + { + "epoch": 2.2385240500768724, + "grad_norm": 2.052259922027588, + "learning_rate": 5e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.774770975112915, + "num_tokens": 527655860.0, + "step": 20384 + }, + { + "epoch": 2.238633867779486, + "grad_norm": 2.3182387351989746, + "learning_rate": 5e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7716715335845947, + "num_tokens": 527679382.0, + "step": 20385 + }, + { + "epoch": 2.2387436854821, + "grad_norm": 2.178715705871582, + "learning_rate": 5e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7535065412521362, + "num_tokens": 527708708.0, + "step": 20386 + }, + { + "epoch": 2.238853503184713, + "grad_norm": 2.1927826404571533, + "learning_rate": 5e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.7772289514541626, + "num_tokens": 527732075.0, + "step": 20387 + }, + { + "epoch": 2.238963320887327, + "grad_norm": 1.9575116634368896, + "learning_rate": 5e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7568977475166321, + "num_tokens": 527760797.0, + "step": 20388 + }, + { + "epoch": 2.2390731385899407, + "grad_norm": 2.1410272121429443, + "learning_rate": 5e-06, + "loss": 0.6375, + "mean_token_accuracy": 0.7873556613922119, + "num_tokens": 527785705.0, + "step": 20389 + }, + { + "epoch": 2.2391829562925545, + "grad_norm": 2.0767507553100586, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7479639053344727, + "num_tokens": 527815053.0, + "step": 20390 + }, + { + "epoch": 2.2392927739951682, + "grad_norm": 2.0129783153533936, + "learning_rate": 5e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7750756144523621, + "num_tokens": 527842821.0, + "step": 20391 + }, + { + "epoch": 2.2394025916977816, + "grad_norm": 2.182997941970825, + "learning_rate": 5e-06, + "loss": 0.6821, + "mean_token_accuracy": 0.7777231335639954, + "num_tokens": 527866544.0, + "step": 20392 + }, + { + "epoch": 2.2395124094003953, + "grad_norm": 2.04301381111145, + "learning_rate": 5e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7664685249328613, + "num_tokens": 527894531.0, + "step": 20393 + }, + { + "epoch": 2.239622227103009, + "grad_norm": 1.9289538860321045, + "learning_rate": 5e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.755347490310669, + "num_tokens": 527923636.0, + "step": 20394 + }, + { + "epoch": 2.239732044805623, + "grad_norm": 1.983378291130066, + "learning_rate": 5e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.7791407108306885, + "num_tokens": 527950554.0, + "step": 20395 + }, + { + "epoch": 2.239841862508236, + "grad_norm": 2.214669942855835, + "learning_rate": 5e-06, + "loss": 0.6109, + "mean_token_accuracy": 0.7961690425872803, + "num_tokens": 527972553.0, + "step": 20396 + }, + { + "epoch": 2.23995168021085, + "grad_norm": 2.0723533630371094, + "learning_rate": 5e-06, + "loss": 0.6453, + "mean_token_accuracy": 0.7790602445602417, + "num_tokens": 527997248.0, + "step": 20397 + }, + { + "epoch": 2.2400614979134637, + "grad_norm": 2.5416698455810547, + "learning_rate": 5e-06, + "loss": 0.6709, + "mean_token_accuracy": 0.7771977186203003, + "num_tokens": 528015842.0, + "step": 20398 + }, + { + "epoch": 2.2401713156160774, + "grad_norm": 1.9058619737625122, + "learning_rate": 5e-06, + "loss": 0.7303, + "mean_token_accuracy": 0.7677072882652283, + "num_tokens": 528046356.0, + "step": 20399 + }, + { + "epoch": 2.240281133318691, + "grad_norm": 1.8772506713867188, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.777401864528656, + "num_tokens": 528078214.0, + "step": 20400 + }, + { + "epoch": 2.2403909510213045, + "grad_norm": 1.963242769241333, + "learning_rate": 5e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7424180507659912, + "num_tokens": 528107249.0, + "step": 20401 + }, + { + "epoch": 2.2405007687239182, + "grad_norm": 2.3040261268615723, + "learning_rate": 5e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7766681909561157, + "num_tokens": 528130132.0, + "step": 20402 + }, + { + "epoch": 2.240610586426532, + "grad_norm": 2.268439292907715, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7638533115386963, + "num_tokens": 528151735.0, + "step": 20403 + }, + { + "epoch": 2.2407204041291457, + "grad_norm": 2.0386481285095215, + "learning_rate": 5e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7695525884628296, + "num_tokens": 528177933.0, + "step": 20404 + }, + { + "epoch": 2.240830221831759, + "grad_norm": 1.7096494436264038, + "learning_rate": 5e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.7560377717018127, + "num_tokens": 528214596.0, + "step": 20405 + }, + { + "epoch": 2.240940039534373, + "grad_norm": 2.247211217880249, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7620514631271362, + "num_tokens": 528240198.0, + "step": 20406 + }, + { + "epoch": 2.2410498572369866, + "grad_norm": 2.145909309387207, + "learning_rate": 5e-06, + "loss": 0.662, + "mean_token_accuracy": 0.78120356798172, + "num_tokens": 528265095.0, + "step": 20407 + }, + { + "epoch": 2.2411596749396003, + "grad_norm": 2.075597047805786, + "learning_rate": 5e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7556418180465698, + "num_tokens": 528291221.0, + "step": 20408 + }, + { + "epoch": 2.241269492642214, + "grad_norm": 2.2714555263519287, + "learning_rate": 5e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.7692063450813293, + "num_tokens": 528314792.0, + "step": 20409 + }, + { + "epoch": 2.2413793103448274, + "grad_norm": 2.0285744667053223, + "learning_rate": 5e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7483569979667664, + "num_tokens": 528341086.0, + "step": 20410 + }, + { + "epoch": 2.241489128047441, + "grad_norm": 2.1954078674316406, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7654609680175781, + "num_tokens": 528366459.0, + "step": 20411 + }, + { + "epoch": 2.241598945750055, + "grad_norm": 1.9737515449523926, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7589582800865173, + "num_tokens": 528394629.0, + "step": 20412 + }, + { + "epoch": 2.2417087634526687, + "grad_norm": 2.0708091259002686, + "learning_rate": 5e-06, + "loss": 0.6826, + "mean_token_accuracy": 0.7785261273384094, + "num_tokens": 528422648.0, + "step": 20413 + }, + { + "epoch": 2.2418185811552824, + "grad_norm": 1.9289562702178955, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7763562202453613, + "num_tokens": 528453218.0, + "step": 20414 + }, + { + "epoch": 2.2419283988578957, + "grad_norm": 2.0159850120544434, + "learning_rate": 5e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7538295388221741, + "num_tokens": 528481632.0, + "step": 20415 + }, + { + "epoch": 2.2420382165605095, + "grad_norm": 2.236553192138672, + "learning_rate": 5e-06, + "loss": 0.6656, + "mean_token_accuracy": 0.7751693725585938, + "num_tokens": 528504691.0, + "step": 20416 + }, + { + "epoch": 2.2421480342631233, + "grad_norm": 2.2256855964660645, + "learning_rate": 5e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7684653401374817, + "num_tokens": 528529977.0, + "step": 20417 + }, + { + "epoch": 2.242257851965737, + "grad_norm": 2.127429246902466, + "learning_rate": 5e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7647638916969299, + "num_tokens": 528556189.0, + "step": 20418 + }, + { + "epoch": 2.2423676696683508, + "grad_norm": 2.169468641281128, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7551068067550659, + "num_tokens": 528582351.0, + "step": 20419 + }, + { + "epoch": 2.242477487370964, + "grad_norm": 2.1426472663879395, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7540525197982788, + "num_tokens": 528608718.0, + "step": 20420 + }, + { + "epoch": 2.242587305073578, + "grad_norm": 1.8984230756759644, + "learning_rate": 5e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7623659372329712, + "num_tokens": 528641405.0, + "step": 20421 + }, + { + "epoch": 2.2426971227761916, + "grad_norm": 2.009838581085205, + "learning_rate": 5e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.7753068208694458, + "num_tokens": 528665164.0, + "step": 20422 + }, + { + "epoch": 2.2428069404788054, + "grad_norm": 1.8230725526809692, + "learning_rate": 5e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7488653659820557, + "num_tokens": 528697903.0, + "step": 20423 + }, + { + "epoch": 2.2429167581814187, + "grad_norm": 2.514605760574341, + "learning_rate": 5e-06, + "loss": 0.652, + "mean_token_accuracy": 0.7795297503471375, + "num_tokens": 528715782.0, + "step": 20424 + }, + { + "epoch": 2.2430265758840324, + "grad_norm": 1.792496919631958, + "learning_rate": 5e-06, + "loss": 0.6886, + "mean_token_accuracy": 0.7779415845870972, + "num_tokens": 528748432.0, + "step": 20425 + }, + { + "epoch": 2.243136393586646, + "grad_norm": 2.0886950492858887, + "learning_rate": 5e-06, + "loss": 0.6277, + "mean_token_accuracy": 0.7925686240196228, + "num_tokens": 528772997.0, + "step": 20426 + }, + { + "epoch": 2.24324621128926, + "grad_norm": 1.981788158416748, + "learning_rate": 5e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.76926589012146, + "num_tokens": 528802420.0, + "step": 20427 + }, + { + "epoch": 2.2433560289918737, + "grad_norm": 2.216290235519409, + "learning_rate": 5e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7797766923904419, + "num_tokens": 528825765.0, + "step": 20428 + }, + { + "epoch": 2.243465846694487, + "grad_norm": 1.7883408069610596, + "learning_rate": 5e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7224266529083252, + "num_tokens": 528865792.0, + "step": 20429 + }, + { + "epoch": 2.2435756643971008, + "grad_norm": 2.0575335025787354, + "learning_rate": 5e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.746562123298645, + "num_tokens": 528893972.0, + "step": 20430 + }, + { + "epoch": 2.2436854820997145, + "grad_norm": 2.3071560859680176, + "learning_rate": 5e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.7716464996337891, + "num_tokens": 528916899.0, + "step": 20431 + }, + { + "epoch": 2.2437952998023283, + "grad_norm": 2.2691292762756348, + "learning_rate": 5e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7658141255378723, + "num_tokens": 528940296.0, + "step": 20432 + }, + { + "epoch": 2.2439051175049416, + "grad_norm": 2.0607924461364746, + "learning_rate": 5e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7633463144302368, + "num_tokens": 528966136.0, + "step": 20433 + }, + { + "epoch": 2.2440149352075554, + "grad_norm": 1.836782693862915, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7633378505706787, + "num_tokens": 528997213.0, + "step": 20434 + }, + { + "epoch": 2.244124752910169, + "grad_norm": 2.117021322250366, + "learning_rate": 5e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.7722684144973755, + "num_tokens": 529023824.0, + "step": 20435 + }, + { + "epoch": 2.244234570612783, + "grad_norm": 1.9585518836975098, + "learning_rate": 5e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7543845176696777, + "num_tokens": 529050767.0, + "step": 20436 + }, + { + "epoch": 2.2443443883153966, + "grad_norm": 1.948984146118164, + "learning_rate": 5e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7604260444641113, + "num_tokens": 529080408.0, + "step": 20437 + }, + { + "epoch": 2.24445420601801, + "grad_norm": 1.967470645904541, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7582591772079468, + "num_tokens": 529106850.0, + "step": 20438 + }, + { + "epoch": 2.2445640237206237, + "grad_norm": 2.0691285133361816, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7560349106788635, + "num_tokens": 529134176.0, + "step": 20439 + }, + { + "epoch": 2.2446738414232374, + "grad_norm": 1.9625060558319092, + "learning_rate": 5e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7471874952316284, + "num_tokens": 529164332.0, + "step": 20440 + }, + { + "epoch": 2.244783659125851, + "grad_norm": 2.1317949295043945, + "learning_rate": 5e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7638404965400696, + "num_tokens": 529189408.0, + "step": 20441 + }, + { + "epoch": 2.244893476828465, + "grad_norm": 1.9112694263458252, + "learning_rate": 5e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.756472110748291, + "num_tokens": 529219342.0, + "step": 20442 + }, + { + "epoch": 2.2450032945310783, + "grad_norm": 1.898038625717163, + "learning_rate": 5e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7500137686729431, + "num_tokens": 529252486.0, + "step": 20443 + }, + { + "epoch": 2.245113112233692, + "grad_norm": 2.0495450496673584, + "learning_rate": 5e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.7650287747383118, + "num_tokens": 529279389.0, + "step": 20444 + }, + { + "epoch": 2.245222929936306, + "grad_norm": 1.970949649810791, + "learning_rate": 5e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7710318565368652, + "num_tokens": 529310475.0, + "step": 20445 + }, + { + "epoch": 2.2453327476389195, + "grad_norm": 2.082937240600586, + "learning_rate": 5e-06, + "loss": 0.6443, + "mean_token_accuracy": 0.781775951385498, + "num_tokens": 529336926.0, + "step": 20446 + }, + { + "epoch": 2.245442565341533, + "grad_norm": 2.1761958599090576, + "learning_rate": 5e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7678683996200562, + "num_tokens": 529361984.0, + "step": 20447 + }, + { + "epoch": 2.2455523830441466, + "grad_norm": 2.165713310241699, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7621690630912781, + "num_tokens": 529387127.0, + "step": 20448 + }, + { + "epoch": 2.2456622007467604, + "grad_norm": 2.1851816177368164, + "learning_rate": 5e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7606417536735535, + "num_tokens": 529411728.0, + "step": 20449 + }, + { + "epoch": 2.245772018449374, + "grad_norm": 2.1036996841430664, + "learning_rate": 5e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.7689807415008545, + "num_tokens": 529437262.0, + "step": 20450 + }, + { + "epoch": 2.245881836151988, + "grad_norm": 2.0621259212493896, + "learning_rate": 5e-06, + "loss": 0.6439, + "mean_token_accuracy": 0.7904155254364014, + "num_tokens": 529461459.0, + "step": 20451 + }, + { + "epoch": 2.245991653854601, + "grad_norm": 1.9971760511398315, + "learning_rate": 5e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.7434360980987549, + "num_tokens": 529489832.0, + "step": 20452 + }, + { + "epoch": 2.246101471557215, + "grad_norm": 2.2052061557769775, + "learning_rate": 5e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.762160062789917, + "num_tokens": 529514105.0, + "step": 20453 + }, + { + "epoch": 2.2462112892598287, + "grad_norm": 2.411196231842041, + "learning_rate": 5e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.7718465328216553, + "num_tokens": 529536132.0, + "step": 20454 + }, + { + "epoch": 2.2463211069624425, + "grad_norm": 1.9726238250732422, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7624477744102478, + "num_tokens": 529564112.0, + "step": 20455 + }, + { + "epoch": 2.246430924665056, + "grad_norm": 2.096562623977661, + "learning_rate": 5e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7580685615539551, + "num_tokens": 529588459.0, + "step": 20456 + }, + { + "epoch": 2.2465407423676695, + "grad_norm": 1.9542182683944702, + "learning_rate": 5e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7371681928634644, + "num_tokens": 529620741.0, + "step": 20457 + }, + { + "epoch": 2.2466505600702833, + "grad_norm": 1.9708468914031982, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7707446217536926, + "num_tokens": 529649646.0, + "step": 20458 + }, + { + "epoch": 2.246760377772897, + "grad_norm": 2.137141466140747, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7664083242416382, + "num_tokens": 529675809.0, + "step": 20459 + }, + { + "epoch": 2.246870195475511, + "grad_norm": 2.2014386653900146, + "learning_rate": 5e-06, + "loss": 0.7007, + "mean_token_accuracy": 0.7651710510253906, + "num_tokens": 529699958.0, + "step": 20460 + }, + { + "epoch": 2.246980013178124, + "grad_norm": 1.662419319152832, + "learning_rate": 5e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7478873133659363, + "num_tokens": 529738671.0, + "step": 20461 + }, + { + "epoch": 2.247089830880738, + "grad_norm": 2.1545722484588623, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7412636280059814, + "num_tokens": 529764707.0, + "step": 20462 + }, + { + "epoch": 2.2471996485833516, + "grad_norm": 2.0553722381591797, + "learning_rate": 5e-06, + "loss": 0.637, + "mean_token_accuracy": 0.7884894609451294, + "num_tokens": 529790320.0, + "step": 20463 + }, + { + "epoch": 2.2473094662859654, + "grad_norm": 2.203685760498047, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7665380239486694, + "num_tokens": 529814173.0, + "step": 20464 + }, + { + "epoch": 2.247419283988579, + "grad_norm": 2.2547032833099365, + "learning_rate": 5e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.7737904787063599, + "num_tokens": 529837458.0, + "step": 20465 + }, + { + "epoch": 2.2475291016911925, + "grad_norm": 2.1462745666503906, + "learning_rate": 5e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7564154863357544, + "num_tokens": 529862437.0, + "step": 20466 + }, + { + "epoch": 2.2476389193938062, + "grad_norm": 2.1270875930786133, + "learning_rate": 5e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7478447556495667, + "num_tokens": 529887576.0, + "step": 20467 + }, + { + "epoch": 2.24774873709642, + "grad_norm": 1.9558581113815308, + "learning_rate": 5e-06, + "loss": 0.666, + "mean_token_accuracy": 0.7815393805503845, + "num_tokens": 529913388.0, + "step": 20468 + }, + { + "epoch": 2.2478585547990337, + "grad_norm": 2.119584321975708, + "learning_rate": 5e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7550812363624573, + "num_tokens": 529937717.0, + "step": 20469 + }, + { + "epoch": 2.2479683725016475, + "grad_norm": 2.2049248218536377, + "learning_rate": 5e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7715185880661011, + "num_tokens": 529962534.0, + "step": 20470 + }, + { + "epoch": 2.248078190204261, + "grad_norm": 2.3055479526519775, + "learning_rate": 5e-06, + "loss": 0.6662, + "mean_token_accuracy": 0.7856884002685547, + "num_tokens": 529985062.0, + "step": 20471 + }, + { + "epoch": 2.2481880079068746, + "grad_norm": 2.0454375743865967, + "learning_rate": 5e-06, + "loss": 0.6843, + "mean_token_accuracy": 0.7791796922683716, + "num_tokens": 530010223.0, + "step": 20472 + }, + { + "epoch": 2.2482978256094883, + "grad_norm": 2.005403518676758, + "learning_rate": 5e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7767344117164612, + "num_tokens": 530037177.0, + "step": 20473 + }, + { + "epoch": 2.248407643312102, + "grad_norm": 2.1826469898223877, + "learning_rate": 5e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7701042294502258, + "num_tokens": 530060419.0, + "step": 20474 + }, + { + "epoch": 2.2485174610147154, + "grad_norm": 2.0802114009857178, + "learning_rate": 5e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.7760511636734009, + "num_tokens": 530084574.0, + "step": 20475 + }, + { + "epoch": 2.248627278717329, + "grad_norm": 2.037020683288574, + "learning_rate": 5e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7653340101242065, + "num_tokens": 530113951.0, + "step": 20476 + }, + { + "epoch": 2.248737096419943, + "grad_norm": 1.8204618692398071, + "learning_rate": 5e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.7409100532531738, + "num_tokens": 530148018.0, + "step": 20477 + }, + { + "epoch": 2.2488469141225567, + "grad_norm": 2.3198306560516357, + "learning_rate": 5e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.764931321144104, + "num_tokens": 530171184.0, + "step": 20478 + }, + { + "epoch": 2.2489567318251704, + "grad_norm": 1.9014657735824585, + "learning_rate": 5e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.7811237573623657, + "num_tokens": 530198862.0, + "step": 20479 + }, + { + "epoch": 2.2490665495277837, + "grad_norm": 2.118239164352417, + "learning_rate": 5e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.7890873551368713, + "num_tokens": 530225115.0, + "step": 20480 + }, + { + "epoch": 2.2491763672303975, + "grad_norm": 2.458469867706299, + "learning_rate": 5e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7803807258605957, + "num_tokens": 530247385.0, + "step": 20481 + }, + { + "epoch": 2.2492861849330112, + "grad_norm": 1.893315076828003, + "learning_rate": 5e-06, + "loss": 0.6767, + "mean_token_accuracy": 0.7808526754379272, + "num_tokens": 530277393.0, + "step": 20482 + }, + { + "epoch": 2.249396002635625, + "grad_norm": 2.130005121231079, + "learning_rate": 5e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.7820783853530884, + "num_tokens": 530302336.0, + "step": 20483 + }, + { + "epoch": 2.2495058203382383, + "grad_norm": 1.9765079021453857, + "learning_rate": 5e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7554044127464294, + "num_tokens": 530330443.0, + "step": 20484 + }, + { + "epoch": 2.249615638040852, + "grad_norm": 2.217466354370117, + "learning_rate": 5e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.755671501159668, + "num_tokens": 530354809.0, + "step": 20485 + }, + { + "epoch": 2.249725455743466, + "grad_norm": 1.8455171585083008, + "learning_rate": 5e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7378513813018799, + "num_tokens": 530384716.0, + "step": 20486 + }, + { + "epoch": 2.2498352734460796, + "grad_norm": 2.2121617794036865, + "learning_rate": 5e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.7682881355285645, + "num_tokens": 530410295.0, + "step": 20487 + }, + { + "epoch": 2.2499450911486933, + "grad_norm": 2.064399242401123, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7513461112976074, + "num_tokens": 530438704.0, + "step": 20488 + }, + { + "epoch": 2.2500549088513067, + "grad_norm": 2.288141965866089, + "learning_rate": 5e-06, + "loss": 0.6337, + "mean_token_accuracy": 0.7858996391296387, + "num_tokens": 530459665.0, + "step": 20489 + }, + { + "epoch": 2.2501647265539204, + "grad_norm": 2.1719632148742676, + "learning_rate": 5e-06, + "loss": 0.6431, + "mean_token_accuracy": 0.78751540184021, + "num_tokens": 530482192.0, + "step": 20490 + }, + { + "epoch": 2.250274544256534, + "grad_norm": 1.957472562789917, + "learning_rate": 5e-06, + "loss": 0.6547, + "mean_token_accuracy": 0.7857805490493774, + "num_tokens": 530509075.0, + "step": 20491 + }, + { + "epoch": 2.250384361959148, + "grad_norm": 1.9811681509017944, + "learning_rate": 5e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.745140552520752, + "num_tokens": 530539021.0, + "step": 20492 + }, + { + "epoch": 2.2504941796617617, + "grad_norm": 1.9748482704162598, + "learning_rate": 5e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.7856078743934631, + "num_tokens": 530565171.0, + "step": 20493 + }, + { + "epoch": 2.250603997364375, + "grad_norm": 2.268533229827881, + "learning_rate": 5e-06, + "loss": 0.6565, + "mean_token_accuracy": 0.7800697088241577, + "num_tokens": 530588297.0, + "step": 20494 + }, + { + "epoch": 2.2507138150669888, + "grad_norm": 2.112741231918335, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7737308740615845, + "num_tokens": 530612327.0, + "step": 20495 + }, + { + "epoch": 2.2508236327696025, + "grad_norm": 2.046501398086548, + "learning_rate": 5e-06, + "loss": 0.6515, + "mean_token_accuracy": 0.7853047251701355, + "num_tokens": 530636735.0, + "step": 20496 + }, + { + "epoch": 2.2509334504722163, + "grad_norm": 2.024672031402588, + "learning_rate": 5e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7728574872016907, + "num_tokens": 530662581.0, + "step": 20497 + }, + { + "epoch": 2.25104326817483, + "grad_norm": 2.1298696994781494, + "learning_rate": 5e-06, + "loss": 0.6955, + "mean_token_accuracy": 0.7771638035774231, + "num_tokens": 530686537.0, + "step": 20498 + }, + { + "epoch": 2.2511530858774433, + "grad_norm": 2.1581292152404785, + "learning_rate": 5e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7716498374938965, + "num_tokens": 530710694.0, + "step": 20499 + }, + { + "epoch": 2.251262903580057, + "grad_norm": 2.1109790802001953, + "learning_rate": 5e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7448883056640625, + "num_tokens": 530736355.0, + "step": 20500 + }, + { + "epoch": 2.251372721282671, + "grad_norm": 1.881932020187378, + "learning_rate": 5e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7432961463928223, + "num_tokens": 530768565.0, + "step": 20501 + }, + { + "epoch": 2.2514825389852846, + "grad_norm": 2.4142918586730957, + "learning_rate": 5e-06, + "loss": 0.6504, + "mean_token_accuracy": 0.7812299728393555, + "num_tokens": 530788741.0, + "step": 20502 + }, + { + "epoch": 2.251592356687898, + "grad_norm": 2.1342756748199463, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7458460330963135, + "num_tokens": 530816824.0, + "step": 20503 + }, + { + "epoch": 2.2517021743905117, + "grad_norm": 2.1354243755340576, + "learning_rate": 5e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.7859034538269043, + "num_tokens": 530841473.0, + "step": 20504 + }, + { + "epoch": 2.2518119920931254, + "grad_norm": 2.322786331176758, + "learning_rate": 5e-06, + "loss": 0.6565, + "mean_token_accuracy": 0.7853026390075684, + "num_tokens": 530861435.0, + "step": 20505 + }, + { + "epoch": 2.251921809795739, + "grad_norm": 2.0166168212890625, + "learning_rate": 5e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.7754154205322266, + "num_tokens": 530888085.0, + "step": 20506 + }, + { + "epoch": 2.2520316274983525, + "grad_norm": 2.0264010429382324, + "learning_rate": 5e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.788044810295105, + "num_tokens": 530914203.0, + "step": 20507 + }, + { + "epoch": 2.2521414452009663, + "grad_norm": 1.9089674949645996, + "learning_rate": 5e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7616103887557983, + "num_tokens": 530943033.0, + "step": 20508 + }, + { + "epoch": 2.25225126290358, + "grad_norm": 2.1862783432006836, + "learning_rate": 5e-06, + "loss": 0.6597, + "mean_token_accuracy": 0.789568305015564, + "num_tokens": 530965575.0, + "step": 20509 + }, + { + "epoch": 2.2523610806061938, + "grad_norm": 2.2891845703125, + "learning_rate": 5e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7699241638183594, + "num_tokens": 530988747.0, + "step": 20510 + }, + { + "epoch": 2.2524708983088075, + "grad_norm": 2.2265868186950684, + "learning_rate": 5e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7558670043945312, + "num_tokens": 531012072.0, + "step": 20511 + }, + { + "epoch": 2.252580716011421, + "grad_norm": 2.2230217456817627, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7523167729377747, + "num_tokens": 531036090.0, + "step": 20512 + }, + { + "epoch": 2.2526905337140346, + "grad_norm": 1.8413561582565308, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.769758939743042, + "num_tokens": 531068599.0, + "step": 20513 + }, + { + "epoch": 2.2528003514166484, + "grad_norm": 2.079061985015869, + "learning_rate": 5e-06, + "loss": 0.6015, + "mean_token_accuracy": 0.7991507053375244, + "num_tokens": 531092550.0, + "step": 20514 + }, + { + "epoch": 2.252910169119262, + "grad_norm": 2.014988660812378, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7659599781036377, + "num_tokens": 531117415.0, + "step": 20515 + }, + { + "epoch": 2.253019986821876, + "grad_norm": 2.144803047180176, + "learning_rate": 5e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.7779809832572937, + "num_tokens": 531139843.0, + "step": 20516 + }, + { + "epoch": 2.253129804524489, + "grad_norm": 1.973380446434021, + "learning_rate": 5e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.7737913727760315, + "num_tokens": 531167681.0, + "step": 20517 + }, + { + "epoch": 2.253239622227103, + "grad_norm": 2.215791940689087, + "learning_rate": 5e-06, + "loss": 0.625, + "mean_token_accuracy": 0.7926679849624634, + "num_tokens": 531190841.0, + "step": 20518 + }, + { + "epoch": 2.2533494399297167, + "grad_norm": 2.118295192718506, + "learning_rate": 5e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.768261194229126, + "num_tokens": 531216193.0, + "step": 20519 + }, + { + "epoch": 2.2534592576323305, + "grad_norm": 2.093158483505249, + "learning_rate": 5e-06, + "loss": 0.6442, + "mean_token_accuracy": 0.784611701965332, + "num_tokens": 531241506.0, + "step": 20520 + }, + { + "epoch": 2.253569075334944, + "grad_norm": 2.4455015659332275, + "learning_rate": 5e-06, + "loss": 0.6617, + "mean_token_accuracy": 0.7748551368713379, + "num_tokens": 531260057.0, + "step": 20521 + }, + { + "epoch": 2.2536788930375575, + "grad_norm": 1.9838694334030151, + "learning_rate": 5e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7686352729797363, + "num_tokens": 531288516.0, + "step": 20522 + }, + { + "epoch": 2.2537887107401713, + "grad_norm": 2.0541293621063232, + "learning_rate": 5e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7655285000801086, + "num_tokens": 531315494.0, + "step": 20523 + }, + { + "epoch": 2.253898528442785, + "grad_norm": 2.1495368480682373, + "learning_rate": 5e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7706284523010254, + "num_tokens": 531340653.0, + "step": 20524 + }, + { + "epoch": 2.254008346145399, + "grad_norm": 2.00380539894104, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7631979584693909, + "num_tokens": 531369782.0, + "step": 20525 + }, + { + "epoch": 2.254118163848012, + "grad_norm": 2.1094305515289307, + "learning_rate": 5e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7521234154701233, + "num_tokens": 531397907.0, + "step": 20526 + }, + { + "epoch": 2.254227981550626, + "grad_norm": 2.0084762573242188, + "learning_rate": 5e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7609531879425049, + "num_tokens": 531425994.0, + "step": 20527 + }, + { + "epoch": 2.2543377992532396, + "grad_norm": 1.828403353691101, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7589671015739441, + "num_tokens": 531455915.0, + "step": 20528 + }, + { + "epoch": 2.2544476169558534, + "grad_norm": 2.1937520503997803, + "learning_rate": 5e-06, + "loss": 0.672, + "mean_token_accuracy": 0.7762660980224609, + "num_tokens": 531479546.0, + "step": 20529 + }, + { + "epoch": 2.2545574346584667, + "grad_norm": 1.8555312156677246, + "learning_rate": 5e-06, + "loss": 0.6518, + "mean_token_accuracy": 0.7819069027900696, + "num_tokens": 531511396.0, + "step": 20530 + }, + { + "epoch": 2.2546672523610805, + "grad_norm": 2.008115530014038, + "learning_rate": 5e-06, + "loss": 0.6689, + "mean_token_accuracy": 0.7752922773361206, + "num_tokens": 531541074.0, + "step": 20531 + }, + { + "epoch": 2.254777070063694, + "grad_norm": 2.138561487197876, + "learning_rate": 5e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.757570743560791, + "num_tokens": 531566456.0, + "step": 20532 + }, + { + "epoch": 2.254886887766308, + "grad_norm": 1.9727369546890259, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7534599900245667, + "num_tokens": 531596795.0, + "step": 20533 + }, + { + "epoch": 2.2549967054689217, + "grad_norm": 2.4464221000671387, + "learning_rate": 5e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.7744967937469482, + "num_tokens": 531616808.0, + "step": 20534 + }, + { + "epoch": 2.255106523171535, + "grad_norm": 2.1796562671661377, + "learning_rate": 5e-06, + "loss": 0.6195, + "mean_token_accuracy": 0.791063666343689, + "num_tokens": 531638801.0, + "step": 20535 + }, + { + "epoch": 2.255216340874149, + "grad_norm": 2.1959497928619385, + "learning_rate": 5e-06, + "loss": 0.6461, + "mean_token_accuracy": 0.7913321256637573, + "num_tokens": 531661399.0, + "step": 20536 + }, + { + "epoch": 2.2553261585767626, + "grad_norm": 1.9237529039382935, + "learning_rate": 5e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.787638247013092, + "num_tokens": 531691218.0, + "step": 20537 + }, + { + "epoch": 2.2554359762793763, + "grad_norm": 2.489934206008911, + "learning_rate": 5e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7713031768798828, + "num_tokens": 531711822.0, + "step": 20538 + }, + { + "epoch": 2.25554579398199, + "grad_norm": 2.3065390586853027, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.768923282623291, + "num_tokens": 531736242.0, + "step": 20539 + }, + { + "epoch": 2.2556556116846034, + "grad_norm": 1.939412236213684, + "learning_rate": 5e-06, + "loss": 0.603, + "mean_token_accuracy": 0.7979302406311035, + "num_tokens": 531764305.0, + "step": 20540 + }, + { + "epoch": 2.255765429387217, + "grad_norm": 2.414238214492798, + "learning_rate": 5e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.7946770787239075, + "num_tokens": 531783492.0, + "step": 20541 + }, + { + "epoch": 2.255875247089831, + "grad_norm": 2.28125262260437, + "learning_rate": 5e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.7962450981140137, + "num_tokens": 531803718.0, + "step": 20542 + }, + { + "epoch": 2.2559850647924446, + "grad_norm": 2.0849990844726562, + "learning_rate": 5e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.7779439687728882, + "num_tokens": 531831939.0, + "step": 20543 + }, + { + "epoch": 2.2560948824950584, + "grad_norm": 2.103201150894165, + "learning_rate": 5e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7539829015731812, + "num_tokens": 531858299.0, + "step": 20544 + }, + { + "epoch": 2.2562047001976717, + "grad_norm": 1.8110347986221313, + "learning_rate": 5e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7591849565505981, + "num_tokens": 531892255.0, + "step": 20545 + }, + { + "epoch": 2.2563145179002855, + "grad_norm": 2.022167921066284, + "learning_rate": 5e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.760650634765625, + "num_tokens": 531921068.0, + "step": 20546 + }, + { + "epoch": 2.2564243356028992, + "grad_norm": 2.257533550262451, + "learning_rate": 5e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.7702673077583313, + "num_tokens": 531942936.0, + "step": 20547 + }, + { + "epoch": 2.256534153305513, + "grad_norm": 2.021958351135254, + "learning_rate": 5e-06, + "loss": 0.672, + "mean_token_accuracy": 0.7822571396827698, + "num_tokens": 531969597.0, + "step": 20548 + }, + { + "epoch": 2.2566439710081267, + "grad_norm": 2.4140350818634033, + "learning_rate": 5e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.775364875793457, + "num_tokens": 531991098.0, + "step": 20549 + }, + { + "epoch": 2.25675378871074, + "grad_norm": 2.280888795852661, + "learning_rate": 5e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7621382474899292, + "num_tokens": 532016456.0, + "step": 20550 + }, + { + "epoch": 2.256863606413354, + "grad_norm": 1.8389021158218384, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7508400678634644, + "num_tokens": 532049115.0, + "step": 20551 + }, + { + "epoch": 2.2569734241159676, + "grad_norm": 2.219331979751587, + "learning_rate": 5e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7633296847343445, + "num_tokens": 532074327.0, + "step": 20552 + }, + { + "epoch": 2.2570832418185813, + "grad_norm": 2.1872847080230713, + "learning_rate": 5e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7764421701431274, + "num_tokens": 532099436.0, + "step": 20553 + }, + { + "epoch": 2.2571930595211946, + "grad_norm": 2.0472936630249023, + "learning_rate": 5e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.7775588035583496, + "num_tokens": 532125520.0, + "step": 20554 + }, + { + "epoch": 2.2573028772238084, + "grad_norm": 2.0486927032470703, + "learning_rate": 5e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.765891432762146, + "num_tokens": 532152800.0, + "step": 20555 + }, + { + "epoch": 2.257412694926422, + "grad_norm": 2.2719805240631104, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7829427123069763, + "num_tokens": 532173613.0, + "step": 20556 + }, + { + "epoch": 2.257522512629036, + "grad_norm": 1.9500372409820557, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7611449956893921, + "num_tokens": 532202695.0, + "step": 20557 + }, + { + "epoch": 2.2576323303316492, + "grad_norm": 1.8796489238739014, + "learning_rate": 5e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7731961011886597, + "num_tokens": 532232983.0, + "step": 20558 + }, + { + "epoch": 2.257742148034263, + "grad_norm": 1.8931429386138916, + "learning_rate": 5e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7349869608879089, + "num_tokens": 532263535.0, + "step": 20559 + }, + { + "epoch": 2.2578519657368767, + "grad_norm": 2.1444315910339355, + "learning_rate": 5e-06, + "loss": 0.6826, + "mean_token_accuracy": 0.7769071459770203, + "num_tokens": 532287089.0, + "step": 20560 + }, + { + "epoch": 2.2579617834394905, + "grad_norm": 2.0696792602539062, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7661316394805908, + "num_tokens": 532313868.0, + "step": 20561 + }, + { + "epoch": 2.2580716011421043, + "grad_norm": 2.2811970710754395, + "learning_rate": 5e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.772953987121582, + "num_tokens": 532335086.0, + "step": 20562 + }, + { + "epoch": 2.2581814188447176, + "grad_norm": 2.2142882347106934, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.7655267715454102, + "num_tokens": 532357911.0, + "step": 20563 + }, + { + "epoch": 2.2582912365473313, + "grad_norm": 2.1717369556427, + "learning_rate": 5e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7715960741043091, + "num_tokens": 532386950.0, + "step": 20564 + }, + { + "epoch": 2.258401054249945, + "grad_norm": 2.1561059951782227, + "learning_rate": 5e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7599484324455261, + "num_tokens": 532414135.0, + "step": 20565 + }, + { + "epoch": 2.258510871952559, + "grad_norm": 1.9168422222137451, + "learning_rate": 5e-06, + "loss": 0.6292, + "mean_token_accuracy": 0.7968303561210632, + "num_tokens": 532442260.0, + "step": 20566 + }, + { + "epoch": 2.2586206896551726, + "grad_norm": 2.0945968627929688, + "learning_rate": 5e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.756679356098175, + "num_tokens": 532467155.0, + "step": 20567 + }, + { + "epoch": 2.258730507357786, + "grad_norm": 2.2371561527252197, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7547239661216736, + "num_tokens": 532493195.0, + "step": 20568 + }, + { + "epoch": 2.2588403250603997, + "grad_norm": 2.0376710891723633, + "learning_rate": 5e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7573609352111816, + "num_tokens": 532520031.0, + "step": 20569 + }, + { + "epoch": 2.2589501427630134, + "grad_norm": 2.231933116912842, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.7739526033401489, + "num_tokens": 532544913.0, + "step": 20570 + }, + { + "epoch": 2.259059960465627, + "grad_norm": 1.8773220777511597, + "learning_rate": 5e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7727104425430298, + "num_tokens": 532573829.0, + "step": 20571 + }, + { + "epoch": 2.259169778168241, + "grad_norm": 1.965973973274231, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7442413568496704, + "num_tokens": 532601191.0, + "step": 20572 + }, + { + "epoch": 2.2592795958708543, + "grad_norm": 1.9669204950332642, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7436116337776184, + "num_tokens": 532632149.0, + "step": 20573 + }, + { + "epoch": 2.259389413573468, + "grad_norm": 2.0555355548858643, + "learning_rate": 5e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7654057741165161, + "num_tokens": 532656669.0, + "step": 20574 + }, + { + "epoch": 2.2594992312760818, + "grad_norm": 2.3575551509857178, + "learning_rate": 5e-06, + "loss": 0.6508, + "mean_token_accuracy": 0.784778892993927, + "num_tokens": 532676083.0, + "step": 20575 + }, + { + "epoch": 2.2596090489786955, + "grad_norm": 2.337581157684326, + "learning_rate": 5e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.7863496541976929, + "num_tokens": 532696435.0, + "step": 20576 + }, + { + "epoch": 2.259718866681309, + "grad_norm": 2.0970427989959717, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7616367936134338, + "num_tokens": 532722279.0, + "step": 20577 + }, + { + "epoch": 2.2598286843839226, + "grad_norm": 2.2487504482269287, + "learning_rate": 5e-06, + "loss": 0.6065, + "mean_token_accuracy": 0.7902660965919495, + "num_tokens": 532743739.0, + "step": 20578 + }, + { + "epoch": 2.2599385020865363, + "grad_norm": 2.155369997024536, + "learning_rate": 5e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8027071952819824, + "num_tokens": 532764952.0, + "step": 20579 + }, + { + "epoch": 2.26004831978915, + "grad_norm": 2.261247158050537, + "learning_rate": 5e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7765381336212158, + "num_tokens": 532786289.0, + "step": 20580 + }, + { + "epoch": 2.260158137491764, + "grad_norm": 2.151587724685669, + "learning_rate": 5e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.7800359725952148, + "num_tokens": 532808225.0, + "step": 20581 + }, + { + "epoch": 2.260267955194377, + "grad_norm": 2.1802806854248047, + "learning_rate": 5e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.7677487134933472, + "num_tokens": 532835723.0, + "step": 20582 + }, + { + "epoch": 2.260377772896991, + "grad_norm": 2.392193078994751, + "learning_rate": 5e-06, + "loss": 0.601, + "mean_token_accuracy": 0.7953782081604004, + "num_tokens": 532855877.0, + "step": 20583 + }, + { + "epoch": 2.2604875905996047, + "grad_norm": 2.2703909873962402, + "learning_rate": 5e-06, + "loss": 0.6739, + "mean_token_accuracy": 0.781912088394165, + "num_tokens": 532877067.0, + "step": 20584 + }, + { + "epoch": 2.2605974083022184, + "grad_norm": 1.9885615110397339, + "learning_rate": 5e-06, + "loss": 0.6733, + "mean_token_accuracy": 0.7773740887641907, + "num_tokens": 532906410.0, + "step": 20585 + }, + { + "epoch": 2.2607072260048318, + "grad_norm": 2.4602670669555664, + "learning_rate": 5e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.7904142141342163, + "num_tokens": 532925598.0, + "step": 20586 + }, + { + "epoch": 2.2608170437074455, + "grad_norm": 1.9681105613708496, + "learning_rate": 5e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7688943147659302, + "num_tokens": 532953353.0, + "step": 20587 + }, + { + "epoch": 2.2609268614100593, + "grad_norm": 1.9327032566070557, + "learning_rate": 5e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7720228433609009, + "num_tokens": 532982947.0, + "step": 20588 + }, + { + "epoch": 2.261036679112673, + "grad_norm": 2.2472476959228516, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7498272657394409, + "num_tokens": 533005667.0, + "step": 20589 + }, + { + "epoch": 2.261146496815287, + "grad_norm": 2.1001136302948, + "learning_rate": 5e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7689779996871948, + "num_tokens": 533031132.0, + "step": 20590 + }, + { + "epoch": 2.2612563145179, + "grad_norm": 2.1637203693389893, + "learning_rate": 5e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7585071325302124, + "num_tokens": 533054840.0, + "step": 20591 + }, + { + "epoch": 2.261366132220514, + "grad_norm": 2.026170015335083, + "learning_rate": 5e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7599952816963196, + "num_tokens": 533084858.0, + "step": 20592 + }, + { + "epoch": 2.2614759499231276, + "grad_norm": 2.162748336791992, + "learning_rate": 5e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.7799906730651855, + "num_tokens": 533108952.0, + "step": 20593 + }, + { + "epoch": 2.2615857676257414, + "grad_norm": 2.158367156982422, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.784332811832428, + "num_tokens": 533133505.0, + "step": 20594 + }, + { + "epoch": 2.261695585328355, + "grad_norm": 2.413055181503296, + "learning_rate": 5e-06, + "loss": 0.6888, + "mean_token_accuracy": 0.7796351313591003, + "num_tokens": 533155039.0, + "step": 20595 + }, + { + "epoch": 2.2618054030309684, + "grad_norm": 2.18381404876709, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7672094106674194, + "num_tokens": 533180111.0, + "step": 20596 + }, + { + "epoch": 2.261915220733582, + "grad_norm": 2.0868959426879883, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7644056081771851, + "num_tokens": 533207728.0, + "step": 20597 + }, + { + "epoch": 2.262025038436196, + "grad_norm": 2.1213979721069336, + "learning_rate": 5e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7671205997467041, + "num_tokens": 533235113.0, + "step": 20598 + }, + { + "epoch": 2.2621348561388097, + "grad_norm": 2.183821439743042, + "learning_rate": 5e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7823886275291443, + "num_tokens": 533260583.0, + "step": 20599 + }, + { + "epoch": 2.2622446738414235, + "grad_norm": 2.118001937866211, + "learning_rate": 5e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7461354732513428, + "num_tokens": 533289453.0, + "step": 20600 + }, + { + "epoch": 2.262354491544037, + "grad_norm": 2.5030341148376465, + "learning_rate": 5e-06, + "loss": 0.5766, + "mean_token_accuracy": 0.8062772750854492, + "num_tokens": 533307417.0, + "step": 20601 + }, + { + "epoch": 2.2624643092466505, + "grad_norm": 2.2210352420806885, + "learning_rate": 5e-06, + "loss": 0.6197, + "mean_token_accuracy": 0.7943888306617737, + "num_tokens": 533328791.0, + "step": 20602 + }, + { + "epoch": 2.2625741269492643, + "grad_norm": 2.2904837131500244, + "learning_rate": 5e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7757370471954346, + "num_tokens": 533352806.0, + "step": 20603 + }, + { + "epoch": 2.262683944651878, + "grad_norm": 2.0143589973449707, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7730696201324463, + "num_tokens": 533385257.0, + "step": 20604 + }, + { + "epoch": 2.2627937623544914, + "grad_norm": 1.9368622303009033, + "learning_rate": 5e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7587506175041199, + "num_tokens": 533414897.0, + "step": 20605 + }, + { + "epoch": 2.262903580057105, + "grad_norm": 2.010777235031128, + "learning_rate": 5e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7743561267852783, + "num_tokens": 533443738.0, + "step": 20606 + }, + { + "epoch": 2.263013397759719, + "grad_norm": 2.0365195274353027, + "learning_rate": 5e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.780981183052063, + "num_tokens": 533470548.0, + "step": 20607 + }, + { + "epoch": 2.2631232154623326, + "grad_norm": 2.233262777328491, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.761915922164917, + "num_tokens": 533493570.0, + "step": 20608 + }, + { + "epoch": 2.263233033164946, + "grad_norm": 2.2595934867858887, + "learning_rate": 5e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7703275680541992, + "num_tokens": 533518486.0, + "step": 20609 + }, + { + "epoch": 2.2633428508675597, + "grad_norm": 1.9125345945358276, + "learning_rate": 5e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7658582925796509, + "num_tokens": 533547811.0, + "step": 20610 + }, + { + "epoch": 2.2634526685701735, + "grad_norm": 2.3113574981689453, + "learning_rate": 5e-06, + "loss": 0.665, + "mean_token_accuracy": 0.7845699191093445, + "num_tokens": 533570129.0, + "step": 20611 + }, + { + "epoch": 2.263562486272787, + "grad_norm": 2.110560894012451, + "learning_rate": 5e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.750938355922699, + "num_tokens": 533596830.0, + "step": 20612 + }, + { + "epoch": 2.263672303975401, + "grad_norm": 1.959672451019287, + "learning_rate": 5e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.765792191028595, + "num_tokens": 533623506.0, + "step": 20613 + }, + { + "epoch": 2.2637821216780143, + "grad_norm": 2.351283550262451, + "learning_rate": 5e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.7930585145950317, + "num_tokens": 533642407.0, + "step": 20614 + }, + { + "epoch": 2.263891939380628, + "grad_norm": 2.1572933197021484, + "learning_rate": 5e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7445347309112549, + "num_tokens": 533669217.0, + "step": 20615 + }, + { + "epoch": 2.264001757083242, + "grad_norm": 2.324932098388672, + "learning_rate": 5e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7602438926696777, + "num_tokens": 533691577.0, + "step": 20616 + }, + { + "epoch": 2.2641115747858556, + "grad_norm": 2.1019294261932373, + "learning_rate": 5e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7442308068275452, + "num_tokens": 533717654.0, + "step": 20617 + }, + { + "epoch": 2.2642213924884693, + "grad_norm": 2.1716904640197754, + "learning_rate": 5e-06, + "loss": 0.689, + "mean_token_accuracy": 0.7769006490707397, + "num_tokens": 533742289.0, + "step": 20618 + }, + { + "epoch": 2.2643312101910826, + "grad_norm": 2.0294289588928223, + "learning_rate": 5e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.778702437877655, + "num_tokens": 533767705.0, + "step": 20619 + }, + { + "epoch": 2.2644410278936964, + "grad_norm": 1.9174174070358276, + "learning_rate": 5e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7654927968978882, + "num_tokens": 533796554.0, + "step": 20620 + }, + { + "epoch": 2.26455084559631, + "grad_norm": 2.030663013458252, + "learning_rate": 5e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.768093466758728, + "num_tokens": 533823276.0, + "step": 20621 + }, + { + "epoch": 2.264660663298924, + "grad_norm": 2.434661865234375, + "learning_rate": 5e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.7728649973869324, + "num_tokens": 533843473.0, + "step": 20622 + }, + { + "epoch": 2.2647704810015377, + "grad_norm": 2.0811073780059814, + "learning_rate": 5e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7596031427383423, + "num_tokens": 533871938.0, + "step": 20623 + }, + { + "epoch": 2.264880298704151, + "grad_norm": 1.8660063743591309, + "learning_rate": 5e-06, + "loss": 0.6431, + "mean_token_accuracy": 0.7851558923721313, + "num_tokens": 533899947.0, + "step": 20624 + }, + { + "epoch": 2.2649901164067647, + "grad_norm": 2.1028149127960205, + "learning_rate": 5e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.763740062713623, + "num_tokens": 533926896.0, + "step": 20625 + }, + { + "epoch": 2.2650999341093785, + "grad_norm": 2.3220102787017822, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7645635604858398, + "num_tokens": 533949568.0, + "step": 20626 + }, + { + "epoch": 2.2652097518119922, + "grad_norm": 1.9849101305007935, + "learning_rate": 5e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.758156418800354, + "num_tokens": 533976400.0, + "step": 20627 + }, + { + "epoch": 2.265319569514606, + "grad_norm": 2.154513359069824, + "learning_rate": 5e-06, + "loss": 0.6462, + "mean_token_accuracy": 0.7815470695495605, + "num_tokens": 533998150.0, + "step": 20628 + }, + { + "epoch": 2.2654293872172193, + "grad_norm": 2.179396867752075, + "learning_rate": 5e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.7773659229278564, + "num_tokens": 534020140.0, + "step": 20629 + }, + { + "epoch": 2.265539204919833, + "grad_norm": 2.6025402545928955, + "learning_rate": 5e-06, + "loss": 0.5917, + "mean_token_accuracy": 0.7962013483047485, + "num_tokens": 534037547.0, + "step": 20630 + }, + { + "epoch": 2.265649022622447, + "grad_norm": 2.0944483280181885, + "learning_rate": 5e-06, + "loss": 0.6793, + "mean_token_accuracy": 0.7781764268875122, + "num_tokens": 534062972.0, + "step": 20631 + }, + { + "epoch": 2.2657588403250606, + "grad_norm": 1.905566930770874, + "learning_rate": 5e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7716038227081299, + "num_tokens": 534094692.0, + "step": 20632 + }, + { + "epoch": 2.265868658027674, + "grad_norm": 2.1480026245117188, + "learning_rate": 5e-06, + "loss": 0.5719, + "mean_token_accuracy": 0.8033079504966736, + "num_tokens": 534115786.0, + "step": 20633 + }, + { + "epoch": 2.2659784757302877, + "grad_norm": 2.096221685409546, + "learning_rate": 5e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.7786736488342285, + "num_tokens": 534141560.0, + "step": 20634 + }, + { + "epoch": 2.2660882934329014, + "grad_norm": 2.103300094604492, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7521008849143982, + "num_tokens": 534168041.0, + "step": 20635 + }, + { + "epoch": 2.266198111135515, + "grad_norm": 1.958375096321106, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.754687488079071, + "num_tokens": 534201120.0, + "step": 20636 + }, + { + "epoch": 2.2663079288381285, + "grad_norm": 2.332007646560669, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7533349990844727, + "num_tokens": 534224130.0, + "step": 20637 + }, + { + "epoch": 2.2664177465407422, + "grad_norm": 2.0703370571136475, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7630252838134766, + "num_tokens": 534250655.0, + "step": 20638 + }, + { + "epoch": 2.266527564243356, + "grad_norm": 2.1949353218078613, + "learning_rate": 5e-06, + "loss": 0.5993, + "mean_token_accuracy": 0.7998579144477844, + "num_tokens": 534276022.0, + "step": 20639 + }, + { + "epoch": 2.2666373819459698, + "grad_norm": 2.303954839706421, + "learning_rate": 5e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7578380107879639, + "num_tokens": 534300402.0, + "step": 20640 + }, + { + "epoch": 2.2667471996485835, + "grad_norm": 2.0672264099121094, + "learning_rate": 5e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7520885467529297, + "num_tokens": 534326371.0, + "step": 20641 + }, + { + "epoch": 2.266857017351197, + "grad_norm": 2.1531081199645996, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7755125761032104, + "num_tokens": 534349747.0, + "step": 20642 + }, + { + "epoch": 2.2669668350538106, + "grad_norm": 2.2458620071411133, + "learning_rate": 5e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.77739417552948, + "num_tokens": 534373022.0, + "step": 20643 + }, + { + "epoch": 2.2670766527564243, + "grad_norm": 2.1573493480682373, + "learning_rate": 5e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.7747764587402344, + "num_tokens": 534396762.0, + "step": 20644 + }, + { + "epoch": 2.267186470459038, + "grad_norm": 2.1318235397338867, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7803962230682373, + "num_tokens": 534419285.0, + "step": 20645 + }, + { + "epoch": 2.267296288161652, + "grad_norm": 2.291576862335205, + "learning_rate": 5e-06, + "loss": 0.6449, + "mean_token_accuracy": 0.781165361404419, + "num_tokens": 534443690.0, + "step": 20646 + }, + { + "epoch": 2.267406105864265, + "grad_norm": 1.912166714668274, + "learning_rate": 5e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7646802663803101, + "num_tokens": 534478506.0, + "step": 20647 + }, + { + "epoch": 2.267515923566879, + "grad_norm": 2.020393133163452, + "learning_rate": 5e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7739495038986206, + "num_tokens": 534505787.0, + "step": 20648 + }, + { + "epoch": 2.2676257412694927, + "grad_norm": 2.096346855163574, + "learning_rate": 5e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7360681295394897, + "num_tokens": 534533058.0, + "step": 20649 + }, + { + "epoch": 2.2677355589721064, + "grad_norm": 2.194397449493408, + "learning_rate": 5e-06, + "loss": 0.6663, + "mean_token_accuracy": 0.7880414724349976, + "num_tokens": 534557019.0, + "step": 20650 + }, + { + "epoch": 2.26784537667472, + "grad_norm": 2.10781192779541, + "learning_rate": 5e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7672743797302246, + "num_tokens": 534585327.0, + "step": 20651 + }, + { + "epoch": 2.2679551943773335, + "grad_norm": 2.3777236938476562, + "learning_rate": 5e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7620654106140137, + "num_tokens": 534608697.0, + "step": 20652 + }, + { + "epoch": 2.2680650120799473, + "grad_norm": 2.395165205001831, + "learning_rate": 5e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.7896676063537598, + "num_tokens": 534627974.0, + "step": 20653 + }, + { + "epoch": 2.268174829782561, + "grad_norm": 2.0885257720947266, + "learning_rate": 5e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7711079120635986, + "num_tokens": 534653606.0, + "step": 20654 + }, + { + "epoch": 2.2682846474851748, + "grad_norm": 2.023406505584717, + "learning_rate": 5e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7594404220581055, + "num_tokens": 534680467.0, + "step": 20655 + }, + { + "epoch": 2.268394465187788, + "grad_norm": 2.123861789703369, + "learning_rate": 5e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.7749333381652832, + "num_tokens": 534706814.0, + "step": 20656 + }, + { + "epoch": 2.268504282890402, + "grad_norm": 2.0232276916503906, + "learning_rate": 5e-06, + "loss": 0.6388, + "mean_token_accuracy": 0.7842714786529541, + "num_tokens": 534733373.0, + "step": 20657 + }, + { + "epoch": 2.2686141005930156, + "grad_norm": 2.0215234756469727, + "learning_rate": 5e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7674076557159424, + "num_tokens": 534759981.0, + "step": 20658 + }, + { + "epoch": 2.2687239182956294, + "grad_norm": 1.7762959003448486, + "learning_rate": 5e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7407792806625366, + "num_tokens": 534794072.0, + "step": 20659 + }, + { + "epoch": 2.2688337359982427, + "grad_norm": 1.9326122999191284, + "learning_rate": 5e-06, + "loss": 0.7319, + "mean_token_accuracy": 0.7615557312965393, + "num_tokens": 534822189.0, + "step": 20660 + }, + { + "epoch": 2.2689435537008564, + "grad_norm": 2.184880018234253, + "learning_rate": 5e-06, + "loss": 0.765, + "mean_token_accuracy": 0.749537467956543, + "num_tokens": 534848242.0, + "step": 20661 + }, + { + "epoch": 2.26905337140347, + "grad_norm": 2.1388766765594482, + "learning_rate": 5e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.764245331287384, + "num_tokens": 534874480.0, + "step": 20662 + }, + { + "epoch": 2.269163189106084, + "grad_norm": 2.3298287391662598, + "learning_rate": 5e-06, + "loss": 0.6216, + "mean_token_accuracy": 0.7917592525482178, + "num_tokens": 534893301.0, + "step": 20663 + }, + { + "epoch": 2.2692730068086977, + "grad_norm": 2.0229647159576416, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.755516767501831, + "num_tokens": 534924061.0, + "step": 20664 + }, + { + "epoch": 2.269382824511311, + "grad_norm": 2.1152069568634033, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7652893662452698, + "num_tokens": 534948816.0, + "step": 20665 + }, + { + "epoch": 2.2694926422139248, + "grad_norm": 1.9476120471954346, + "learning_rate": 5e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.7929602861404419, + "num_tokens": 534975427.0, + "step": 20666 + }, + { + "epoch": 2.2696024599165385, + "grad_norm": 2.0472309589385986, + "learning_rate": 5e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7660505771636963, + "num_tokens": 535001818.0, + "step": 20667 + }, + { + "epoch": 2.2697122776191523, + "grad_norm": 2.0855636596679688, + "learning_rate": 5e-06, + "loss": 0.6656, + "mean_token_accuracy": 0.7790094614028931, + "num_tokens": 535028549.0, + "step": 20668 + }, + { + "epoch": 2.269822095321766, + "grad_norm": 1.9852824211120605, + "learning_rate": 5e-06, + "loss": 0.668, + "mean_token_accuracy": 0.7789486050605774, + "num_tokens": 535056507.0, + "step": 20669 + }, + { + "epoch": 2.2699319130243794, + "grad_norm": 2.2427356243133545, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7589995265007019, + "num_tokens": 535079082.0, + "step": 20670 + }, + { + "epoch": 2.270041730726993, + "grad_norm": 2.0444493293762207, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7660089731216431, + "num_tokens": 535108063.0, + "step": 20671 + }, + { + "epoch": 2.270151548429607, + "grad_norm": 1.9825680255889893, + "learning_rate": 5e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.7739347219467163, + "num_tokens": 535136782.0, + "step": 20672 + }, + { + "epoch": 2.2702613661322206, + "grad_norm": 2.046104669570923, + "learning_rate": 5e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7703170776367188, + "num_tokens": 535162048.0, + "step": 20673 + }, + { + "epoch": 2.2703711838348344, + "grad_norm": 1.979055643081665, + "learning_rate": 5e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7453115582466125, + "num_tokens": 535190944.0, + "step": 20674 + }, + { + "epoch": 2.2704810015374477, + "grad_norm": 2.1535818576812744, + "learning_rate": 5e-06, + "loss": 0.6649, + "mean_token_accuracy": 0.7730762362480164, + "num_tokens": 535215894.0, + "step": 20675 + }, + { + "epoch": 2.2705908192400615, + "grad_norm": 2.1976428031921387, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7705890536308289, + "num_tokens": 535240402.0, + "step": 20676 + }, + { + "epoch": 2.270700636942675, + "grad_norm": 2.35752534866333, + "learning_rate": 5e-06, + "loss": 0.658, + "mean_token_accuracy": 0.7843934297561646, + "num_tokens": 535262076.0, + "step": 20677 + }, + { + "epoch": 2.270810454645289, + "grad_norm": 2.1535449028015137, + "learning_rate": 5e-06, + "loss": 0.6095, + "mean_token_accuracy": 0.8006452322006226, + "num_tokens": 535285372.0, + "step": 20678 + }, + { + "epoch": 2.2709202723479027, + "grad_norm": 1.9925261735916138, + "learning_rate": 5e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.7857679128646851, + "num_tokens": 535312080.0, + "step": 20679 + }, + { + "epoch": 2.271030090050516, + "grad_norm": 2.1728203296661377, + "learning_rate": 5e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7403597831726074, + "num_tokens": 535338965.0, + "step": 20680 + }, + { + "epoch": 2.27113990775313, + "grad_norm": 1.9788755178451538, + "learning_rate": 5e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7534390687942505, + "num_tokens": 535367589.0, + "step": 20681 + }, + { + "epoch": 2.2712497254557436, + "grad_norm": 2.0464539527893066, + "learning_rate": 5e-06, + "loss": 0.6592, + "mean_token_accuracy": 0.774602472782135, + "num_tokens": 535390395.0, + "step": 20682 + }, + { + "epoch": 2.2713595431583573, + "grad_norm": 2.16729474067688, + "learning_rate": 5e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.7716031670570374, + "num_tokens": 535414614.0, + "step": 20683 + }, + { + "epoch": 2.2714693608609706, + "grad_norm": 1.8492391109466553, + "learning_rate": 5e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7764272093772888, + "num_tokens": 535441693.0, + "step": 20684 + }, + { + "epoch": 2.2715791785635844, + "grad_norm": 1.9867340326309204, + "learning_rate": 5e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7851274013519287, + "num_tokens": 535469210.0, + "step": 20685 + }, + { + "epoch": 2.271688996266198, + "grad_norm": 2.10601806640625, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.761772632598877, + "num_tokens": 535497062.0, + "step": 20686 + }, + { + "epoch": 2.271798813968812, + "grad_norm": 1.9774669408798218, + "learning_rate": 5e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7808489799499512, + "num_tokens": 535523961.0, + "step": 20687 + }, + { + "epoch": 2.271908631671425, + "grad_norm": 2.0243897438049316, + "learning_rate": 5e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.7605957388877869, + "num_tokens": 535551270.0, + "step": 20688 + }, + { + "epoch": 2.272018449374039, + "grad_norm": 2.069333553314209, + "learning_rate": 5e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7512146234512329, + "num_tokens": 535577170.0, + "step": 20689 + }, + { + "epoch": 2.2721282670766527, + "grad_norm": 2.0647265911102295, + "learning_rate": 5e-06, + "loss": 0.6282, + "mean_token_accuracy": 0.78780198097229, + "num_tokens": 535602273.0, + "step": 20690 + }, + { + "epoch": 2.2722380847792665, + "grad_norm": 1.9163652658462524, + "learning_rate": 5e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7717832326889038, + "num_tokens": 535630295.0, + "step": 20691 + }, + { + "epoch": 2.2723479024818802, + "grad_norm": 2.3366734981536865, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7602169513702393, + "num_tokens": 535655564.0, + "step": 20692 + }, + { + "epoch": 2.2724577201844935, + "grad_norm": 1.8301844596862793, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7670767903327942, + "num_tokens": 535689829.0, + "step": 20693 + }, + { + "epoch": 2.2725675378871073, + "grad_norm": 2.0086212158203125, + "learning_rate": 5e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7634187936782837, + "num_tokens": 535718710.0, + "step": 20694 + }, + { + "epoch": 2.272677355589721, + "grad_norm": 2.2435450553894043, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7788804173469543, + "num_tokens": 535741769.0, + "step": 20695 + }, + { + "epoch": 2.272787173292335, + "grad_norm": 1.9380124807357788, + "learning_rate": 5e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.7731080651283264, + "num_tokens": 535770591.0, + "step": 20696 + }, + { + "epoch": 2.2728969909949486, + "grad_norm": 2.5234603881835938, + "learning_rate": 5e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.7853415608406067, + "num_tokens": 535787916.0, + "step": 20697 + }, + { + "epoch": 2.273006808697562, + "grad_norm": 2.5515918731689453, + "learning_rate": 5e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.7838728427886963, + "num_tokens": 535805379.0, + "step": 20698 + }, + { + "epoch": 2.2731166264001756, + "grad_norm": 1.8644717931747437, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.754412055015564, + "num_tokens": 535837214.0, + "step": 20699 + }, + { + "epoch": 2.2732264441027894, + "grad_norm": 2.1755599975585938, + "learning_rate": 5e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.7612469792366028, + "num_tokens": 535863630.0, + "step": 20700 + }, + { + "epoch": 2.273336261805403, + "grad_norm": 2.1556835174560547, + "learning_rate": 5e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7577842473983765, + "num_tokens": 535890117.0, + "step": 20701 + }, + { + "epoch": 2.273446079508017, + "grad_norm": 2.074432611465454, + "learning_rate": 5e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7637194991111755, + "num_tokens": 535914935.0, + "step": 20702 + }, + { + "epoch": 2.2735558972106302, + "grad_norm": 2.0233404636383057, + "learning_rate": 5e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7631727457046509, + "num_tokens": 535940491.0, + "step": 20703 + }, + { + "epoch": 2.273665714913244, + "grad_norm": 2.10703182220459, + "learning_rate": 5e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7509610652923584, + "num_tokens": 535969384.0, + "step": 20704 + }, + { + "epoch": 2.2737755326158577, + "grad_norm": 2.3852615356445312, + "learning_rate": 5e-06, + "loss": 0.6307, + "mean_token_accuracy": 0.7890982627868652, + "num_tokens": 535989891.0, + "step": 20705 + }, + { + "epoch": 2.2738853503184715, + "grad_norm": 2.310372829437256, + "learning_rate": 5e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.7816081643104553, + "num_tokens": 536010957.0, + "step": 20706 + }, + { + "epoch": 2.273995168021085, + "grad_norm": 1.9515670537948608, + "learning_rate": 5e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7658711671829224, + "num_tokens": 536039391.0, + "step": 20707 + }, + { + "epoch": 2.2741049857236986, + "grad_norm": 1.742086410522461, + "learning_rate": 5e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.754162073135376, + "num_tokens": 536078091.0, + "step": 20708 + }, + { + "epoch": 2.2742148034263123, + "grad_norm": 2.0980803966522217, + "learning_rate": 5e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.765733540058136, + "num_tokens": 536103814.0, + "step": 20709 + }, + { + "epoch": 2.274324621128926, + "grad_norm": 2.2157821655273438, + "learning_rate": 5e-06, + "loss": 0.6955, + "mean_token_accuracy": 0.7682532072067261, + "num_tokens": 536130503.0, + "step": 20710 + }, + { + "epoch": 2.2744344388315394, + "grad_norm": 2.3161070346832275, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7651971578598022, + "num_tokens": 536154371.0, + "step": 20711 + }, + { + "epoch": 2.274544256534153, + "grad_norm": 1.8644325733184814, + "learning_rate": 5e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7722485065460205, + "num_tokens": 536184071.0, + "step": 20712 + }, + { + "epoch": 2.274654074236767, + "grad_norm": 2.3274049758911133, + "learning_rate": 5e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.7763119339942932, + "num_tokens": 536206202.0, + "step": 20713 + }, + { + "epoch": 2.2747638919393807, + "grad_norm": 1.748102068901062, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7366902828216553, + "num_tokens": 536243851.0, + "step": 20714 + }, + { + "epoch": 2.2748737096419944, + "grad_norm": 2.0730154514312744, + "learning_rate": 5e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.759996771812439, + "num_tokens": 536272872.0, + "step": 20715 + }, + { + "epoch": 2.2749835273446077, + "grad_norm": 2.2119088172912598, + "learning_rate": 5e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7654842138290405, + "num_tokens": 536295306.0, + "step": 20716 + }, + { + "epoch": 2.2750933450472215, + "grad_norm": 2.2887277603149414, + "learning_rate": 5e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7708929181098938, + "num_tokens": 536319265.0, + "step": 20717 + }, + { + "epoch": 2.2752031627498353, + "grad_norm": 2.0701708793640137, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7768926620483398, + "num_tokens": 536346316.0, + "step": 20718 + }, + { + "epoch": 2.275312980452449, + "grad_norm": 2.0156219005584717, + "learning_rate": 5e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.7823697328567505, + "num_tokens": 536374176.0, + "step": 20719 + }, + { + "epoch": 2.2754227981550628, + "grad_norm": 1.9662585258483887, + "learning_rate": 5e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.758971095085144, + "num_tokens": 536402567.0, + "step": 20720 + }, + { + "epoch": 2.275532615857676, + "grad_norm": 2.203829526901245, + "learning_rate": 5e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7674932479858398, + "num_tokens": 536427726.0, + "step": 20721 + }, + { + "epoch": 2.27564243356029, + "grad_norm": 1.985668420791626, + "learning_rate": 5e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7627302408218384, + "num_tokens": 536457747.0, + "step": 20722 + }, + { + "epoch": 2.2757522512629036, + "grad_norm": 2.0578413009643555, + "learning_rate": 5e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7605417370796204, + "num_tokens": 536486257.0, + "step": 20723 + }, + { + "epoch": 2.2758620689655173, + "grad_norm": 2.2160205841064453, + "learning_rate": 5e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.771981418132782, + "num_tokens": 536510445.0, + "step": 20724 + }, + { + "epoch": 2.275971886668131, + "grad_norm": 2.1063175201416016, + "learning_rate": 5e-06, + "loss": 0.6416, + "mean_token_accuracy": 0.7857422232627869, + "num_tokens": 536534193.0, + "step": 20725 + }, + { + "epoch": 2.2760817043707444, + "grad_norm": 1.9594734907150269, + "learning_rate": 5e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7592802047729492, + "num_tokens": 536565228.0, + "step": 20726 + }, + { + "epoch": 2.276191522073358, + "grad_norm": 2.194093704223633, + "learning_rate": 5e-06, + "loss": 0.6927, + "mean_token_accuracy": 0.7681639194488525, + "num_tokens": 536589746.0, + "step": 20727 + }, + { + "epoch": 2.276301339775972, + "grad_norm": 2.1937882900238037, + "learning_rate": 5e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.7758612632751465, + "num_tokens": 536612076.0, + "step": 20728 + }, + { + "epoch": 2.2764111574785857, + "grad_norm": 2.0862176418304443, + "learning_rate": 5e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7626678347587585, + "num_tokens": 536638297.0, + "step": 20729 + }, + { + "epoch": 2.2765209751811994, + "grad_norm": 1.9681055545806885, + "learning_rate": 5e-06, + "loss": 0.7061, + "mean_token_accuracy": 0.772790789604187, + "num_tokens": 536666576.0, + "step": 20730 + }, + { + "epoch": 2.2766307928838128, + "grad_norm": 2.2626726627349854, + "learning_rate": 5e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7608848214149475, + "num_tokens": 536691480.0, + "step": 20731 + }, + { + "epoch": 2.2767406105864265, + "grad_norm": 2.1291704177856445, + "learning_rate": 5e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.763941764831543, + "num_tokens": 536715287.0, + "step": 20732 + }, + { + "epoch": 2.2768504282890403, + "grad_norm": 2.2211451530456543, + "learning_rate": 5e-06, + "loss": 0.6702, + "mean_token_accuracy": 0.7810349464416504, + "num_tokens": 536738586.0, + "step": 20733 + }, + { + "epoch": 2.276960245991654, + "grad_norm": 1.9109071493148804, + "learning_rate": 5e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7475535869598389, + "num_tokens": 536769586.0, + "step": 20734 + }, + { + "epoch": 2.2770700636942673, + "grad_norm": 2.0391714572906494, + "learning_rate": 5e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.756409227848053, + "num_tokens": 536795349.0, + "step": 20735 + }, + { + "epoch": 2.277179881396881, + "grad_norm": 2.116089105606079, + "learning_rate": 5e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.7974604964256287, + "num_tokens": 536821269.0, + "step": 20736 + }, + { + "epoch": 2.277289699099495, + "grad_norm": 2.0014748573303223, + "learning_rate": 5e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7592931985855103, + "num_tokens": 536847928.0, + "step": 20737 + }, + { + "epoch": 2.2773995168021086, + "grad_norm": 1.9763648509979248, + "learning_rate": 5e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7657841444015503, + "num_tokens": 536877556.0, + "step": 20738 + }, + { + "epoch": 2.277509334504722, + "grad_norm": 2.1498215198516846, + "learning_rate": 5e-06, + "loss": 0.6265, + "mean_token_accuracy": 0.7933220267295837, + "num_tokens": 536900833.0, + "step": 20739 + }, + { + "epoch": 2.2776191522073357, + "grad_norm": 1.751287817955017, + "learning_rate": 5e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7570841312408447, + "num_tokens": 536934787.0, + "step": 20740 + }, + { + "epoch": 2.2777289699099494, + "grad_norm": 2.0564067363739014, + "learning_rate": 5e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7636669278144836, + "num_tokens": 536960721.0, + "step": 20741 + }, + { + "epoch": 2.277838787612563, + "grad_norm": 2.190091371536255, + "learning_rate": 5e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.7789313197135925, + "num_tokens": 536985208.0, + "step": 20742 + }, + { + "epoch": 2.277948605315177, + "grad_norm": 2.045283317565918, + "learning_rate": 5e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7560374736785889, + "num_tokens": 537014104.0, + "step": 20743 + }, + { + "epoch": 2.2780584230177903, + "grad_norm": 2.1431376934051514, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7757196426391602, + "num_tokens": 537037331.0, + "step": 20744 + }, + { + "epoch": 2.278168240720404, + "grad_norm": 2.2302117347717285, + "learning_rate": 5e-06, + "loss": 0.638, + "mean_token_accuracy": 0.7897759079933167, + "num_tokens": 537059046.0, + "step": 20745 + }, + { + "epoch": 2.278278058423018, + "grad_norm": 2.277198553085327, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7596135139465332, + "num_tokens": 537082166.0, + "step": 20746 + }, + { + "epoch": 2.2783878761256315, + "grad_norm": 2.17269229888916, + "learning_rate": 5e-06, + "loss": 0.6706, + "mean_token_accuracy": 0.7799161672592163, + "num_tokens": 537106625.0, + "step": 20747 + }, + { + "epoch": 2.2784976938282453, + "grad_norm": 1.9670166969299316, + "learning_rate": 5e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7494274377822876, + "num_tokens": 537136616.0, + "step": 20748 + }, + { + "epoch": 2.2786075115308586, + "grad_norm": 2.297663450241089, + "learning_rate": 5e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7539530992507935, + "num_tokens": 537159567.0, + "step": 20749 + }, + { + "epoch": 2.2787173292334724, + "grad_norm": 2.413581609725952, + "learning_rate": 5e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7743228077888489, + "num_tokens": 537181100.0, + "step": 20750 + }, + { + "epoch": 2.278827146936086, + "grad_norm": 1.9711863994598389, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7370532751083374, + "num_tokens": 537212530.0, + "step": 20751 + }, + { + "epoch": 2.2789369646387, + "grad_norm": 2.126119375228882, + "learning_rate": 5e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7675762176513672, + "num_tokens": 537238400.0, + "step": 20752 + }, + { + "epoch": 2.2790467823413136, + "grad_norm": 2.058076858520508, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7562728524208069, + "num_tokens": 537266352.0, + "step": 20753 + }, + { + "epoch": 2.279156600043927, + "grad_norm": 1.991148829460144, + "learning_rate": 5e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.747776985168457, + "num_tokens": 537296357.0, + "step": 20754 + }, + { + "epoch": 2.2792664177465407, + "grad_norm": 2.204742670059204, + "learning_rate": 5e-06, + "loss": 0.6959, + "mean_token_accuracy": 0.7654026746749878, + "num_tokens": 537320387.0, + "step": 20755 + }, + { + "epoch": 2.2793762354491545, + "grad_norm": 2.285196542739868, + "learning_rate": 5e-06, + "loss": 0.6579, + "mean_token_accuracy": 0.7780297994613647, + "num_tokens": 537341800.0, + "step": 20756 + }, + { + "epoch": 2.279486053151768, + "grad_norm": 2.268097162246704, + "learning_rate": 5e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7631696462631226, + "num_tokens": 537365331.0, + "step": 20757 + }, + { + "epoch": 2.2795958708543815, + "grad_norm": 2.23225998878479, + "learning_rate": 5e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.7897552251815796, + "num_tokens": 537388573.0, + "step": 20758 + }, + { + "epoch": 2.2797056885569953, + "grad_norm": 1.9592255353927612, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7642475366592407, + "num_tokens": 537418949.0, + "step": 20759 + }, + { + "epoch": 2.279815506259609, + "grad_norm": 1.9922683238983154, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7714017033576965, + "num_tokens": 537445416.0, + "step": 20760 + }, + { + "epoch": 2.279925323962223, + "grad_norm": 2.3433873653411865, + "learning_rate": 5e-06, + "loss": 0.6306, + "mean_token_accuracy": 0.7945468425750732, + "num_tokens": 537467497.0, + "step": 20761 + }, + { + "epoch": 2.2800351416648366, + "grad_norm": 2.016061305999756, + "learning_rate": 5e-06, + "loss": 0.6381, + "mean_token_accuracy": 0.7855557799339294, + "num_tokens": 537492668.0, + "step": 20762 + }, + { + "epoch": 2.28014495936745, + "grad_norm": 2.0733749866485596, + "learning_rate": 5e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.7672278881072998, + "num_tokens": 537519137.0, + "step": 20763 + }, + { + "epoch": 2.2802547770700636, + "grad_norm": 1.9378724098205566, + "learning_rate": 5e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7633246779441833, + "num_tokens": 537549461.0, + "step": 20764 + }, + { + "epoch": 2.2803645947726774, + "grad_norm": 2.1224937438964844, + "learning_rate": 5e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7684672474861145, + "num_tokens": 537573226.0, + "step": 20765 + }, + { + "epoch": 2.280474412475291, + "grad_norm": 1.9347171783447266, + "learning_rate": 5e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7609666585922241, + "num_tokens": 537605041.0, + "step": 20766 + }, + { + "epoch": 2.2805842301779045, + "grad_norm": 2.2048399448394775, + "learning_rate": 5e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7751460075378418, + "num_tokens": 537630357.0, + "step": 20767 + }, + { + "epoch": 2.280694047880518, + "grad_norm": 2.009077787399292, + "learning_rate": 5e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7475472092628479, + "num_tokens": 537660369.0, + "step": 20768 + }, + { + "epoch": 2.280803865583132, + "grad_norm": 2.354461193084717, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7726490497589111, + "num_tokens": 537679906.0, + "step": 20769 + }, + { + "epoch": 2.2809136832857457, + "grad_norm": 2.083854913711548, + "learning_rate": 5e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7665529251098633, + "num_tokens": 537703375.0, + "step": 20770 + }, + { + "epoch": 2.2810235009883595, + "grad_norm": 2.214376211166382, + "learning_rate": 5e-06, + "loss": 0.6181, + "mean_token_accuracy": 0.7882289886474609, + "num_tokens": 537726352.0, + "step": 20771 + }, + { + "epoch": 2.281133318690973, + "grad_norm": 2.320678234100342, + "learning_rate": 5e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7696220874786377, + "num_tokens": 537750134.0, + "step": 20772 + }, + { + "epoch": 2.2812431363935866, + "grad_norm": 2.003589391708374, + "learning_rate": 5e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7480697631835938, + "num_tokens": 537779083.0, + "step": 20773 + }, + { + "epoch": 2.2813529540962003, + "grad_norm": 2.180903434753418, + "learning_rate": 5e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7299829721450806, + "num_tokens": 537807355.0, + "step": 20774 + }, + { + "epoch": 2.281462771798814, + "grad_norm": 1.801802158355713, + "learning_rate": 5e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7640957832336426, + "num_tokens": 537839157.0, + "step": 20775 + }, + { + "epoch": 2.281572589501428, + "grad_norm": 2.279550075531006, + "learning_rate": 5e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7649757862091064, + "num_tokens": 537860865.0, + "step": 20776 + }, + { + "epoch": 2.281682407204041, + "grad_norm": 2.0431621074676514, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7575420141220093, + "num_tokens": 537886262.0, + "step": 20777 + }, + { + "epoch": 2.281792224906655, + "grad_norm": 1.97320556640625, + "learning_rate": 5e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7400854229927063, + "num_tokens": 537917655.0, + "step": 20778 + }, + { + "epoch": 2.2819020426092687, + "grad_norm": 2.078648805618286, + "learning_rate": 5e-06, + "loss": 0.6448, + "mean_token_accuracy": 0.7885270714759827, + "num_tokens": 537941665.0, + "step": 20779 + }, + { + "epoch": 2.2820118603118824, + "grad_norm": 2.116265058517456, + "learning_rate": 5e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7576545476913452, + "num_tokens": 537967098.0, + "step": 20780 + }, + { + "epoch": 2.282121678014496, + "grad_norm": 2.2556612491607666, + "learning_rate": 5e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7659305334091187, + "num_tokens": 537991025.0, + "step": 20781 + }, + { + "epoch": 2.2822314957171095, + "grad_norm": 1.869261384010315, + "learning_rate": 5e-06, + "loss": 0.6918, + "mean_token_accuracy": 0.7716748714447021, + "num_tokens": 538021946.0, + "step": 20782 + }, + { + "epoch": 2.2823413134197232, + "grad_norm": 1.912237524986267, + "learning_rate": 5e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7499715685844421, + "num_tokens": 538051395.0, + "step": 20783 + }, + { + "epoch": 2.282451131122337, + "grad_norm": 1.8520677089691162, + "learning_rate": 5e-06, + "loss": 0.6445, + "mean_token_accuracy": 0.7843446731567383, + "num_tokens": 538080501.0, + "step": 20784 + }, + { + "epoch": 2.2825609488249508, + "grad_norm": 2.0065078735351562, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7636004686355591, + "num_tokens": 538107504.0, + "step": 20785 + }, + { + "epoch": 2.282670766527564, + "grad_norm": 2.163410186767578, + "learning_rate": 5e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.7906356453895569, + "num_tokens": 538132103.0, + "step": 20786 + }, + { + "epoch": 2.282780584230178, + "grad_norm": 2.228555679321289, + "learning_rate": 5e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.773982048034668, + "num_tokens": 538154869.0, + "step": 20787 + }, + { + "epoch": 2.2828904019327916, + "grad_norm": 2.2630085945129395, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7703139781951904, + "num_tokens": 538178918.0, + "step": 20788 + }, + { + "epoch": 2.2830002196354053, + "grad_norm": 2.000779151916504, + "learning_rate": 5e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7615727186203003, + "num_tokens": 538207470.0, + "step": 20789 + }, + { + "epoch": 2.2831100373380186, + "grad_norm": 2.0923962593078613, + "learning_rate": 5e-06, + "loss": 0.64, + "mean_token_accuracy": 0.783714771270752, + "num_tokens": 538231155.0, + "step": 20790 + }, + { + "epoch": 2.2832198550406324, + "grad_norm": 2.0981273651123047, + "learning_rate": 5e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.7696409225463867, + "num_tokens": 538256020.0, + "step": 20791 + }, + { + "epoch": 2.283329672743246, + "grad_norm": 2.1342384815216064, + "learning_rate": 5e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.7773343324661255, + "num_tokens": 538280752.0, + "step": 20792 + }, + { + "epoch": 2.28343949044586, + "grad_norm": 2.2885501384735107, + "learning_rate": 5e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.781386137008667, + "num_tokens": 538301765.0, + "step": 20793 + }, + { + "epoch": 2.2835493081484737, + "grad_norm": 2.003584623336792, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7681152820587158, + "num_tokens": 538326957.0, + "step": 20794 + }, + { + "epoch": 2.283659125851087, + "grad_norm": 1.8584038019180298, + "learning_rate": 5e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7404898405075073, + "num_tokens": 538361443.0, + "step": 20795 + }, + { + "epoch": 2.2837689435537007, + "grad_norm": 2.0317299365997314, + "learning_rate": 5e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7602260112762451, + "num_tokens": 538387271.0, + "step": 20796 + }, + { + "epoch": 2.2838787612563145, + "grad_norm": 1.8592395782470703, + "learning_rate": 5e-06, + "loss": 0.7159, + "mean_token_accuracy": 0.7678602933883667, + "num_tokens": 538415779.0, + "step": 20797 + }, + { + "epoch": 2.2839885789589283, + "grad_norm": 2.1126530170440674, + "learning_rate": 5e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.7691866159439087, + "num_tokens": 538441581.0, + "step": 20798 + }, + { + "epoch": 2.284098396661542, + "grad_norm": 2.2938380241394043, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7607430815696716, + "num_tokens": 538465057.0, + "step": 20799 + }, + { + "epoch": 2.2842082143641553, + "grad_norm": 2.100889205932617, + "learning_rate": 5e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.76361083984375, + "num_tokens": 538490769.0, + "step": 20800 + }, + { + "epoch": 2.284318032066769, + "grad_norm": 1.9481828212738037, + "learning_rate": 5e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.760231077671051, + "num_tokens": 538519875.0, + "step": 20801 + }, + { + "epoch": 2.284427849769383, + "grad_norm": 2.1573987007141113, + "learning_rate": 5e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.8027729392051697, + "num_tokens": 538541782.0, + "step": 20802 + }, + { + "epoch": 2.2845376674719966, + "grad_norm": 1.9982597827911377, + "learning_rate": 5e-06, + "loss": 0.7367, + "mean_token_accuracy": 0.7572851181030273, + "num_tokens": 538571498.0, + "step": 20803 + }, + { + "epoch": 2.2846474851746104, + "grad_norm": 2.0201098918914795, + "learning_rate": 5e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7512449026107788, + "num_tokens": 538599558.0, + "step": 20804 + }, + { + "epoch": 2.2847573028772237, + "grad_norm": 2.3747923374176025, + "learning_rate": 5e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7669954895973206, + "num_tokens": 538620996.0, + "step": 20805 + }, + { + "epoch": 2.2848671205798374, + "grad_norm": 2.1744816303253174, + "learning_rate": 5e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.772758960723877, + "num_tokens": 538646274.0, + "step": 20806 + }, + { + "epoch": 2.284976938282451, + "grad_norm": 2.0084893703460693, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7492791414260864, + "num_tokens": 538673146.0, + "step": 20807 + }, + { + "epoch": 2.285086755985065, + "grad_norm": 2.2823996543884277, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7674932479858398, + "num_tokens": 538696364.0, + "step": 20808 + }, + { + "epoch": 2.2851965736876787, + "grad_norm": 2.159954071044922, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7547153234481812, + "num_tokens": 538720063.0, + "step": 20809 + }, + { + "epoch": 2.285306391390292, + "grad_norm": 2.1358301639556885, + "learning_rate": 5e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7697550654411316, + "num_tokens": 538747430.0, + "step": 20810 + }, + { + "epoch": 2.2854162090929058, + "grad_norm": 2.1900758743286133, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7753013372421265, + "num_tokens": 538770422.0, + "step": 20811 + }, + { + "epoch": 2.2855260267955195, + "grad_norm": 2.060276508331299, + "learning_rate": 5e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.776848316192627, + "num_tokens": 538794570.0, + "step": 20812 + }, + { + "epoch": 2.2856358444981333, + "grad_norm": 2.040858745574951, + "learning_rate": 5e-06, + "loss": 0.6756, + "mean_token_accuracy": 0.7733192443847656, + "num_tokens": 538820796.0, + "step": 20813 + }, + { + "epoch": 2.2857456622007466, + "grad_norm": 2.033681631088257, + "learning_rate": 5e-06, + "loss": 0.6742, + "mean_token_accuracy": 0.7770536541938782, + "num_tokens": 538846959.0, + "step": 20814 + }, + { + "epoch": 2.2858554799033604, + "grad_norm": 2.021652936935425, + "learning_rate": 5e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7652121782302856, + "num_tokens": 538874521.0, + "step": 20815 + }, + { + "epoch": 2.285965297605974, + "grad_norm": 2.3242392539978027, + "learning_rate": 5e-06, + "loss": 0.6767, + "mean_token_accuracy": 0.7780635356903076, + "num_tokens": 538895340.0, + "step": 20816 + }, + { + "epoch": 2.286075115308588, + "grad_norm": 2.2076315879821777, + "learning_rate": 5e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7553232908248901, + "num_tokens": 538919312.0, + "step": 20817 + }, + { + "epoch": 2.286184933011201, + "grad_norm": 2.075326681137085, + "learning_rate": 5e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7528961896896362, + "num_tokens": 538948489.0, + "step": 20818 + }, + { + "epoch": 2.286294750713815, + "grad_norm": 2.21854567527771, + "learning_rate": 5e-06, + "loss": 0.6384, + "mean_token_accuracy": 0.7884581089019775, + "num_tokens": 538973516.0, + "step": 20819 + }, + { + "epoch": 2.2864045684164287, + "grad_norm": 2.0944979190826416, + "learning_rate": 5e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7710265517234802, + "num_tokens": 538998213.0, + "step": 20820 + }, + { + "epoch": 2.2865143861190425, + "grad_norm": 2.170982599258423, + "learning_rate": 5e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7447757124900818, + "num_tokens": 539024700.0, + "step": 20821 + }, + { + "epoch": 2.286624203821656, + "grad_norm": 2.2591233253479004, + "learning_rate": 5e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7762969732284546, + "num_tokens": 539049178.0, + "step": 20822 + }, + { + "epoch": 2.2867340215242695, + "grad_norm": 2.1383228302001953, + "learning_rate": 5e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7652649283409119, + "num_tokens": 539073757.0, + "step": 20823 + }, + { + "epoch": 2.2868438392268833, + "grad_norm": 1.9996010065078735, + "learning_rate": 5e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7394323348999023, + "num_tokens": 539103873.0, + "step": 20824 + }, + { + "epoch": 2.286953656929497, + "grad_norm": 2.230290174484253, + "learning_rate": 5e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7614827156066895, + "num_tokens": 539127027.0, + "step": 20825 + }, + { + "epoch": 2.287063474632111, + "grad_norm": 1.8827409744262695, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7682145833969116, + "num_tokens": 539159575.0, + "step": 20826 + }, + { + "epoch": 2.2871732923347246, + "grad_norm": 2.047323703765869, + "learning_rate": 5e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7737541198730469, + "num_tokens": 539186076.0, + "step": 20827 + }, + { + "epoch": 2.287283110037338, + "grad_norm": 2.111593008041382, + "learning_rate": 5e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7574820518493652, + "num_tokens": 539209931.0, + "step": 20828 + }, + { + "epoch": 2.2873929277399516, + "grad_norm": 2.208144187927246, + "learning_rate": 5e-06, + "loss": 0.826, + "mean_token_accuracy": 0.747153103351593, + "num_tokens": 539235315.0, + "step": 20829 + }, + { + "epoch": 2.2875027454425654, + "grad_norm": 2.1958305835723877, + "learning_rate": 5e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7347233295440674, + "num_tokens": 539262832.0, + "step": 20830 + }, + { + "epoch": 2.287612563145179, + "grad_norm": 2.0424208641052246, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7683546543121338, + "num_tokens": 539289486.0, + "step": 20831 + }, + { + "epoch": 2.287722380847793, + "grad_norm": 2.2175562381744385, + "learning_rate": 5e-06, + "loss": 0.6712, + "mean_token_accuracy": 0.7769603729248047, + "num_tokens": 539310553.0, + "step": 20832 + }, + { + "epoch": 2.287832198550406, + "grad_norm": 1.965561032295227, + "learning_rate": 5e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.7782192230224609, + "num_tokens": 539338606.0, + "step": 20833 + }, + { + "epoch": 2.28794201625302, + "grad_norm": 2.2824554443359375, + "learning_rate": 5e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.777795672416687, + "num_tokens": 539359656.0, + "step": 20834 + }, + { + "epoch": 2.2880518339556337, + "grad_norm": 2.055619239807129, + "learning_rate": 5e-06, + "loss": 0.6787, + "mean_token_accuracy": 0.7820752859115601, + "num_tokens": 539385225.0, + "step": 20835 + }, + { + "epoch": 2.2881616516582475, + "grad_norm": 2.1935789585113525, + "learning_rate": 5e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.7763338685035706, + "num_tokens": 539406095.0, + "step": 20836 + }, + { + "epoch": 2.288271469360861, + "grad_norm": 2.067814826965332, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7494953870773315, + "num_tokens": 539434881.0, + "step": 20837 + }, + { + "epoch": 2.2883812870634745, + "grad_norm": 2.1332366466522217, + "learning_rate": 5e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7618131637573242, + "num_tokens": 539461349.0, + "step": 20838 + }, + { + "epoch": 2.2884911047660883, + "grad_norm": 2.027097463607788, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.753415584564209, + "num_tokens": 539487811.0, + "step": 20839 + }, + { + "epoch": 2.288600922468702, + "grad_norm": 2.0678224563598633, + "learning_rate": 5e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7731098532676697, + "num_tokens": 539514199.0, + "step": 20840 + }, + { + "epoch": 2.2887107401713154, + "grad_norm": 2.128965377807617, + "learning_rate": 5e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7593264579772949, + "num_tokens": 539540107.0, + "step": 20841 + }, + { + "epoch": 2.288820557873929, + "grad_norm": 2.1694412231445312, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7705060839653015, + "num_tokens": 539564234.0, + "step": 20842 + }, + { + "epoch": 2.288930375576543, + "grad_norm": 2.1147098541259766, + "learning_rate": 5e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7524904608726501, + "num_tokens": 539592543.0, + "step": 20843 + }, + { + "epoch": 2.2890401932791566, + "grad_norm": 1.8938032388687134, + "learning_rate": 5e-06, + "loss": 0.6681, + "mean_token_accuracy": 0.7758894562721252, + "num_tokens": 539620565.0, + "step": 20844 + }, + { + "epoch": 2.2891500109817704, + "grad_norm": 1.9733892679214478, + "learning_rate": 5e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.756443977355957, + "num_tokens": 539649153.0, + "step": 20845 + }, + { + "epoch": 2.2892598286843837, + "grad_norm": 2.281709671020508, + "learning_rate": 5e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7714197635650635, + "num_tokens": 539670810.0, + "step": 20846 + }, + { + "epoch": 2.2893696463869975, + "grad_norm": 2.1664910316467285, + "learning_rate": 5e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.7786449193954468, + "num_tokens": 539695310.0, + "step": 20847 + }, + { + "epoch": 2.2894794640896112, + "grad_norm": 2.119281530380249, + "learning_rate": 5e-06, + "loss": 0.6502, + "mean_token_accuracy": 0.7790459990501404, + "num_tokens": 539718428.0, + "step": 20848 + }, + { + "epoch": 2.289589281792225, + "grad_norm": 2.065126657485962, + "learning_rate": 5e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7523776888847351, + "num_tokens": 539743431.0, + "step": 20849 + }, + { + "epoch": 2.2896990994948387, + "grad_norm": 2.3804848194122314, + "learning_rate": 5e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7667360901832581, + "num_tokens": 539766262.0, + "step": 20850 + }, + { + "epoch": 2.289808917197452, + "grad_norm": 2.2075488567352295, + "learning_rate": 5e-06, + "loss": 0.6383, + "mean_token_accuracy": 0.7894284725189209, + "num_tokens": 539788507.0, + "step": 20851 + }, + { + "epoch": 2.289918734900066, + "grad_norm": 2.332124710083008, + "learning_rate": 5e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7566255331039429, + "num_tokens": 539811940.0, + "step": 20852 + }, + { + "epoch": 2.2900285526026796, + "grad_norm": 2.0858874320983887, + "learning_rate": 5e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7511851191520691, + "num_tokens": 539840957.0, + "step": 20853 + }, + { + "epoch": 2.2901383703052933, + "grad_norm": 2.091736078262329, + "learning_rate": 5e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7660632729530334, + "num_tokens": 539867769.0, + "step": 20854 + }, + { + "epoch": 2.290248188007907, + "grad_norm": 2.220383405685425, + "learning_rate": 5e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7635735273361206, + "num_tokens": 539889470.0, + "step": 20855 + }, + { + "epoch": 2.2903580057105204, + "grad_norm": 2.1772680282592773, + "learning_rate": 5e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7650091052055359, + "num_tokens": 539915001.0, + "step": 20856 + }, + { + "epoch": 2.290467823413134, + "grad_norm": 2.159179449081421, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7743483781814575, + "num_tokens": 539939793.0, + "step": 20857 + }, + { + "epoch": 2.290577641115748, + "grad_norm": 2.097181797027588, + "learning_rate": 5e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.7832211852073669, + "num_tokens": 539966014.0, + "step": 20858 + }, + { + "epoch": 2.2906874588183617, + "grad_norm": 2.4031760692596436, + "learning_rate": 5e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.765360414981842, + "num_tokens": 539987670.0, + "step": 20859 + }, + { + "epoch": 2.2907972765209754, + "grad_norm": 1.9619381427764893, + "learning_rate": 5e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7739800214767456, + "num_tokens": 540013943.0, + "step": 20860 + }, + { + "epoch": 2.2909070942235887, + "grad_norm": 1.8940062522888184, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7702664136886597, + "num_tokens": 540043879.0, + "step": 20861 + }, + { + "epoch": 2.2910169119262025, + "grad_norm": 1.8013088703155518, + "learning_rate": 5e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7623319029808044, + "num_tokens": 540076942.0, + "step": 20862 + }, + { + "epoch": 2.2911267296288162, + "grad_norm": 2.17471981048584, + "learning_rate": 5e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7598928809165955, + "num_tokens": 540102189.0, + "step": 20863 + }, + { + "epoch": 2.29123654733143, + "grad_norm": 1.9441412687301636, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7610527873039246, + "num_tokens": 540130075.0, + "step": 20864 + }, + { + "epoch": 2.2913463650340433, + "grad_norm": 2.003525495529175, + "learning_rate": 5e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7417840957641602, + "num_tokens": 540158809.0, + "step": 20865 + }, + { + "epoch": 2.291456182736657, + "grad_norm": 1.829904556274414, + "learning_rate": 5e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7715225219726562, + "num_tokens": 540191171.0, + "step": 20866 + }, + { + "epoch": 2.291566000439271, + "grad_norm": 2.0901126861572266, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7825400829315186, + "num_tokens": 540218866.0, + "step": 20867 + }, + { + "epoch": 2.2916758181418846, + "grad_norm": 1.9925392866134644, + "learning_rate": 5e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7536289095878601, + "num_tokens": 540247027.0, + "step": 20868 + }, + { + "epoch": 2.291785635844498, + "grad_norm": 2.214907169342041, + "learning_rate": 5e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7705562710762024, + "num_tokens": 540268744.0, + "step": 20869 + }, + { + "epoch": 2.2918954535471117, + "grad_norm": 2.092459201812744, + "learning_rate": 5e-06, + "loss": 0.711, + "mean_token_accuracy": 0.7606299519538879, + "num_tokens": 540296015.0, + "step": 20870 + }, + { + "epoch": 2.2920052712497254, + "grad_norm": 2.0297462940216064, + "learning_rate": 5e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8177004456520081, + "num_tokens": 540323009.0, + "step": 20871 + }, + { + "epoch": 2.292115088952339, + "grad_norm": 1.9015308618545532, + "learning_rate": 5e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7694864273071289, + "num_tokens": 540352667.0, + "step": 20872 + }, + { + "epoch": 2.292224906654953, + "grad_norm": 1.9800361394882202, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.757723331451416, + "num_tokens": 540382071.0, + "step": 20873 + }, + { + "epoch": 2.2923347243575662, + "grad_norm": 2.498044967651367, + "learning_rate": 5e-06, + "loss": 0.6687, + "mean_token_accuracy": 0.7763333916664124, + "num_tokens": 540401366.0, + "step": 20874 + }, + { + "epoch": 2.29244454206018, + "grad_norm": 2.2442636489868164, + "learning_rate": 5e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.7667026519775391, + "num_tokens": 540424334.0, + "step": 20875 + }, + { + "epoch": 2.2925543597627938, + "grad_norm": 2.112297773361206, + "learning_rate": 5e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.7740103006362915, + "num_tokens": 540450416.0, + "step": 20876 + }, + { + "epoch": 2.2926641774654075, + "grad_norm": 2.0870440006256104, + "learning_rate": 5e-06, + "loss": 0.651, + "mean_token_accuracy": 0.776605486869812, + "num_tokens": 540473342.0, + "step": 20877 + }, + { + "epoch": 2.2927739951680213, + "grad_norm": 1.979616641998291, + "learning_rate": 5e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7657038569450378, + "num_tokens": 540499155.0, + "step": 20878 + }, + { + "epoch": 2.2928838128706346, + "grad_norm": 2.041675567626953, + "learning_rate": 5e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7657456398010254, + "num_tokens": 540526652.0, + "step": 20879 + }, + { + "epoch": 2.2929936305732483, + "grad_norm": 2.4606704711914062, + "learning_rate": 5e-06, + "loss": 0.6548, + "mean_token_accuracy": 0.790928065776825, + "num_tokens": 540545411.0, + "step": 20880 + }, + { + "epoch": 2.293103448275862, + "grad_norm": 1.9516137838363647, + "learning_rate": 5e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.7734602093696594, + "num_tokens": 540570415.0, + "step": 20881 + }, + { + "epoch": 2.293213265978476, + "grad_norm": 1.9315383434295654, + "learning_rate": 5e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.759178102016449, + "num_tokens": 540599179.0, + "step": 20882 + }, + { + "epoch": 2.2933230836810896, + "grad_norm": 2.227241277694702, + "learning_rate": 5e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.7768696546554565, + "num_tokens": 540620747.0, + "step": 20883 + }, + { + "epoch": 2.293432901383703, + "grad_norm": 1.972976803779602, + "learning_rate": 5e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7545338869094849, + "num_tokens": 540649933.0, + "step": 20884 + }, + { + "epoch": 2.2935427190863167, + "grad_norm": 2.334981679916382, + "learning_rate": 5e-06, + "loss": 0.6628, + "mean_token_accuracy": 0.7799046039581299, + "num_tokens": 540671227.0, + "step": 20885 + }, + { + "epoch": 2.2936525367889304, + "grad_norm": 2.061800003051758, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7719248533248901, + "num_tokens": 540698569.0, + "step": 20886 + }, + { + "epoch": 2.293762354491544, + "grad_norm": 2.1414449214935303, + "learning_rate": 5e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.7527480721473694, + "num_tokens": 540722666.0, + "step": 20887 + }, + { + "epoch": 2.2938721721941575, + "grad_norm": 2.008817434310913, + "learning_rate": 5e-06, + "loss": 0.7095, + "mean_token_accuracy": 0.7792047262191772, + "num_tokens": 540749163.0, + "step": 20888 + }, + { + "epoch": 2.2939819898967713, + "grad_norm": 1.860016942024231, + "learning_rate": 5e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7628062963485718, + "num_tokens": 540778624.0, + "step": 20889 + }, + { + "epoch": 2.294091807599385, + "grad_norm": 2.3290224075317383, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7710970044136047, + "num_tokens": 540800755.0, + "step": 20890 + }, + { + "epoch": 2.294201625301999, + "grad_norm": 2.064476728439331, + "learning_rate": 5e-06, + "loss": 0.674, + "mean_token_accuracy": 0.7783235907554626, + "num_tokens": 540826361.0, + "step": 20891 + }, + { + "epoch": 2.294311443004612, + "grad_norm": 2.0575873851776123, + "learning_rate": 5e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7363365888595581, + "num_tokens": 540854997.0, + "step": 20892 + }, + { + "epoch": 2.294421260707226, + "grad_norm": 1.920374870300293, + "learning_rate": 5e-06, + "loss": 0.6627, + "mean_token_accuracy": 0.7800281047821045, + "num_tokens": 540880694.0, + "step": 20893 + }, + { + "epoch": 2.2945310784098396, + "grad_norm": 2.144005060195923, + "learning_rate": 5e-06, + "loss": 0.6256, + "mean_token_accuracy": 0.7929773330688477, + "num_tokens": 540906013.0, + "step": 20894 + }, + { + "epoch": 2.2946408961124534, + "grad_norm": 1.830393671989441, + "learning_rate": 5e-06, + "loss": 0.7684, + "mean_token_accuracy": 0.7577835321426392, + "num_tokens": 540939136.0, + "step": 20895 + }, + { + "epoch": 2.294750713815067, + "grad_norm": 2.131239652633667, + "learning_rate": 5e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7660171985626221, + "num_tokens": 540966269.0, + "step": 20896 + }, + { + "epoch": 2.2948605315176804, + "grad_norm": 2.2638442516326904, + "learning_rate": 5e-06, + "loss": 0.6824, + "mean_token_accuracy": 0.7797005772590637, + "num_tokens": 540987311.0, + "step": 20897 + }, + { + "epoch": 2.294970349220294, + "grad_norm": 1.9528071880340576, + "learning_rate": 5e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7537491321563721, + "num_tokens": 541017693.0, + "step": 20898 + }, + { + "epoch": 2.295080166922908, + "grad_norm": 1.938562273979187, + "learning_rate": 5e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.7697325944900513, + "num_tokens": 541044483.0, + "step": 20899 + }, + { + "epoch": 2.2951899846255217, + "grad_norm": 2.2018625736236572, + "learning_rate": 5e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.7893358469009399, + "num_tokens": 541065814.0, + "step": 20900 + }, + { + "epoch": 2.2952998023281355, + "grad_norm": 2.1335389614105225, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7774240970611572, + "num_tokens": 541091651.0, + "step": 20901 + }, + { + "epoch": 2.2954096200307488, + "grad_norm": 1.9551104307174683, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7636705636978149, + "num_tokens": 541120220.0, + "step": 20902 + }, + { + "epoch": 2.2955194377333625, + "grad_norm": 2.1421866416931152, + "learning_rate": 5e-06, + "loss": 0.6807, + "mean_token_accuracy": 0.7817182540893555, + "num_tokens": 541145064.0, + "step": 20903 + }, + { + "epoch": 2.2956292554359763, + "grad_norm": 1.9422882795333862, + "learning_rate": 5e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7460752725601196, + "num_tokens": 541174467.0, + "step": 20904 + }, + { + "epoch": 2.29573907313859, + "grad_norm": 2.389498710632324, + "learning_rate": 5e-06, + "loss": 0.6689, + "mean_token_accuracy": 0.7735077738761902, + "num_tokens": 541194720.0, + "step": 20905 + }, + { + "epoch": 2.295848890841204, + "grad_norm": 1.9849181175231934, + "learning_rate": 5e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.76271653175354, + "num_tokens": 541223224.0, + "step": 20906 + }, + { + "epoch": 2.295958708543817, + "grad_norm": 1.9202195405960083, + "learning_rate": 5e-06, + "loss": 0.6787, + "mean_token_accuracy": 0.7805453538894653, + "num_tokens": 541252571.0, + "step": 20907 + }, + { + "epoch": 2.296068526246431, + "grad_norm": 2.2601237297058105, + "learning_rate": 5e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7798072099685669, + "num_tokens": 541273622.0, + "step": 20908 + }, + { + "epoch": 2.2961783439490446, + "grad_norm": 2.2850069999694824, + "learning_rate": 5e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7553365230560303, + "num_tokens": 541296576.0, + "step": 20909 + }, + { + "epoch": 2.2962881616516584, + "grad_norm": 1.8667539358139038, + "learning_rate": 5e-06, + "loss": 0.5964, + "mean_token_accuracy": 0.7956211566925049, + "num_tokens": 541324532.0, + "step": 20910 + }, + { + "epoch": 2.296397979354272, + "grad_norm": 2.072636604309082, + "learning_rate": 5e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7703791856765747, + "num_tokens": 541350148.0, + "step": 20911 + }, + { + "epoch": 2.2965077970568855, + "grad_norm": 2.2127981185913086, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7523726224899292, + "num_tokens": 541376128.0, + "step": 20912 + }, + { + "epoch": 2.296617614759499, + "grad_norm": 2.0415115356445312, + "learning_rate": 5e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7656890153884888, + "num_tokens": 541404680.0, + "step": 20913 + }, + { + "epoch": 2.296727432462113, + "grad_norm": 2.00522518157959, + "learning_rate": 5e-06, + "loss": 0.6692, + "mean_token_accuracy": 0.7759994268417358, + "num_tokens": 541432832.0, + "step": 20914 + }, + { + "epoch": 2.2968372501647267, + "grad_norm": 2.2824318408966064, + "learning_rate": 5e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.7809233069419861, + "num_tokens": 541454905.0, + "step": 20915 + }, + { + "epoch": 2.29694706786734, + "grad_norm": 2.193410873413086, + "learning_rate": 5e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7712453603744507, + "num_tokens": 541480302.0, + "step": 20916 + }, + { + "epoch": 2.297056885569954, + "grad_norm": 2.1133291721343994, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7585191130638123, + "num_tokens": 541504934.0, + "step": 20917 + }, + { + "epoch": 2.2971667032725676, + "grad_norm": 2.0175702571868896, + "learning_rate": 5e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.7671654224395752, + "num_tokens": 541533817.0, + "step": 20918 + }, + { + "epoch": 2.2972765209751813, + "grad_norm": 2.1559131145477295, + "learning_rate": 5e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.783828854560852, + "num_tokens": 541558240.0, + "step": 20919 + }, + { + "epoch": 2.2973863386777946, + "grad_norm": 2.024894952774048, + "learning_rate": 5e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.773806095123291, + "num_tokens": 541585234.0, + "step": 20920 + }, + { + "epoch": 2.2974961563804084, + "grad_norm": 2.2361745834350586, + "learning_rate": 5e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7721307277679443, + "num_tokens": 541607859.0, + "step": 20921 + }, + { + "epoch": 2.297605974083022, + "grad_norm": 2.0796127319335938, + "learning_rate": 5e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.7505618333816528, + "num_tokens": 541636615.0, + "step": 20922 + }, + { + "epoch": 2.297715791785636, + "grad_norm": 1.9841097593307495, + "learning_rate": 5e-06, + "loss": 0.7803, + "mean_token_accuracy": 0.7529833316802979, + "num_tokens": 541668045.0, + "step": 20923 + }, + { + "epoch": 2.2978256094882497, + "grad_norm": 2.031287431716919, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7699450254440308, + "num_tokens": 541692916.0, + "step": 20924 + }, + { + "epoch": 2.297935427190863, + "grad_norm": 2.551945686340332, + "learning_rate": 5e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.7808637619018555, + "num_tokens": 541712525.0, + "step": 20925 + }, + { + "epoch": 2.2980452448934767, + "grad_norm": 2.022982358932495, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7635249495506287, + "num_tokens": 541740499.0, + "step": 20926 + }, + { + "epoch": 2.2981550625960905, + "grad_norm": 2.0907557010650635, + "learning_rate": 5e-06, + "loss": 0.6641, + "mean_token_accuracy": 0.7796810865402222, + "num_tokens": 541764010.0, + "step": 20927 + }, + { + "epoch": 2.2982648802987042, + "grad_norm": 2.2915306091308594, + "learning_rate": 5e-06, + "loss": 0.6408, + "mean_token_accuracy": 0.7846558094024658, + "num_tokens": 541784486.0, + "step": 20928 + }, + { + "epoch": 2.298374698001318, + "grad_norm": 2.2089314460754395, + "learning_rate": 5e-06, + "loss": 0.634, + "mean_token_accuracy": 0.7810736894607544, + "num_tokens": 541810225.0, + "step": 20929 + }, + { + "epoch": 2.2984845157039313, + "grad_norm": 2.40936279296875, + "learning_rate": 5e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.7872879505157471, + "num_tokens": 541831329.0, + "step": 20930 + }, + { + "epoch": 2.298594333406545, + "grad_norm": 2.0479280948638916, + "learning_rate": 5e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7606273293495178, + "num_tokens": 541859191.0, + "step": 20931 + }, + { + "epoch": 2.298704151109159, + "grad_norm": 2.206181287765503, + "learning_rate": 5e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.7937147617340088, + "num_tokens": 541881727.0, + "step": 20932 + }, + { + "epoch": 2.2988139688117726, + "grad_norm": 2.021960496902466, + "learning_rate": 5e-06, + "loss": 0.655, + "mean_token_accuracy": 0.7831754684448242, + "num_tokens": 541906826.0, + "step": 20933 + }, + { + "epoch": 2.2989237865143863, + "grad_norm": 2.178581476211548, + "learning_rate": 5e-06, + "loss": 0.692, + "mean_token_accuracy": 0.780258297920227, + "num_tokens": 541930841.0, + "step": 20934 + }, + { + "epoch": 2.2990336042169996, + "grad_norm": 1.7762906551361084, + "learning_rate": 5e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7398164868354797, + "num_tokens": 541961762.0, + "step": 20935 + }, + { + "epoch": 2.2991434219196134, + "grad_norm": 2.17846941947937, + "learning_rate": 5e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.7838470339775085, + "num_tokens": 541985285.0, + "step": 20936 + }, + { + "epoch": 2.299253239622227, + "grad_norm": 2.1924917697906494, + "learning_rate": 5e-06, + "loss": 0.6468, + "mean_token_accuracy": 0.7794633507728577, + "num_tokens": 542009125.0, + "step": 20937 + }, + { + "epoch": 2.299363057324841, + "grad_norm": 1.8417460918426514, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7514464855194092, + "num_tokens": 542040133.0, + "step": 20938 + }, + { + "epoch": 2.2994728750274542, + "grad_norm": 2.2603321075439453, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7681038975715637, + "num_tokens": 542064452.0, + "step": 20939 + }, + { + "epoch": 2.299582692730068, + "grad_norm": 2.005774974822998, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7661932706832886, + "num_tokens": 542092018.0, + "step": 20940 + }, + { + "epoch": 2.2996925104326817, + "grad_norm": 2.2186925411224365, + "learning_rate": 5e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.7731572985649109, + "num_tokens": 542116049.0, + "step": 20941 + }, + { + "epoch": 2.2998023281352955, + "grad_norm": 2.083456516265869, + "learning_rate": 5e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7769275903701782, + "num_tokens": 542139581.0, + "step": 20942 + }, + { + "epoch": 2.2999121458379093, + "grad_norm": 2.1569712162017822, + "learning_rate": 5e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.767235517501831, + "num_tokens": 542165050.0, + "step": 20943 + }, + { + "epoch": 2.3000219635405226, + "grad_norm": 1.98486328125, + "learning_rate": 5e-06, + "loss": 0.6329, + "mean_token_accuracy": 0.7921112775802612, + "num_tokens": 542191214.0, + "step": 20944 + }, + { + "epoch": 2.3001317812431363, + "grad_norm": 2.00042724609375, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7605539560317993, + "num_tokens": 542218059.0, + "step": 20945 + }, + { + "epoch": 2.30024159894575, + "grad_norm": 2.177910804748535, + "learning_rate": 5e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.7991673946380615, + "num_tokens": 542240395.0, + "step": 20946 + }, + { + "epoch": 2.300351416648364, + "grad_norm": 1.7731274366378784, + "learning_rate": 5e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7801148891448975, + "num_tokens": 542268403.0, + "step": 20947 + }, + { + "epoch": 2.300461234350977, + "grad_norm": 1.8921977281570435, + "learning_rate": 5e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7748386859893799, + "num_tokens": 542298175.0, + "step": 20948 + }, + { + "epoch": 2.300571052053591, + "grad_norm": 2.16029691696167, + "learning_rate": 5e-06, + "loss": 0.6583, + "mean_token_accuracy": 0.7820945382118225, + "num_tokens": 542321850.0, + "step": 20949 + }, + { + "epoch": 2.3006808697562047, + "grad_norm": 2.2397046089172363, + "learning_rate": 5e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7681746482849121, + "num_tokens": 542345081.0, + "step": 20950 + }, + { + "epoch": 2.3007906874588184, + "grad_norm": 2.418929100036621, + "learning_rate": 5e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7592266798019409, + "num_tokens": 542369106.0, + "step": 20951 + }, + { + "epoch": 2.300900505161432, + "grad_norm": 2.124093532562256, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7707106471061707, + "num_tokens": 542392809.0, + "step": 20952 + }, + { + "epoch": 2.3010103228640455, + "grad_norm": 2.075018882751465, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7762179374694824, + "num_tokens": 542420051.0, + "step": 20953 + }, + { + "epoch": 2.3011201405666593, + "grad_norm": 2.162972927093506, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7655084729194641, + "num_tokens": 542447315.0, + "step": 20954 + }, + { + "epoch": 2.301229958269273, + "grad_norm": 1.946674108505249, + "learning_rate": 5e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.756328821182251, + "num_tokens": 542478145.0, + "step": 20955 + }, + { + "epoch": 2.3013397759718868, + "grad_norm": 1.8958353996276855, + "learning_rate": 5e-06, + "loss": 0.6403, + "mean_token_accuracy": 0.7894288301467896, + "num_tokens": 542509848.0, + "step": 20956 + }, + { + "epoch": 2.3014495936745005, + "grad_norm": 2.276090621948242, + "learning_rate": 5e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7758691310882568, + "num_tokens": 542535187.0, + "step": 20957 + }, + { + "epoch": 2.301559411377114, + "grad_norm": 2.2596628665924072, + "learning_rate": 5e-06, + "loss": 0.674, + "mean_token_accuracy": 0.7806758880615234, + "num_tokens": 542557657.0, + "step": 20958 + }, + { + "epoch": 2.3016692290797276, + "grad_norm": 1.9780404567718506, + "learning_rate": 5e-06, + "loss": 0.6491, + "mean_token_accuracy": 0.7872311472892761, + "num_tokens": 542582631.0, + "step": 20959 + }, + { + "epoch": 2.3017790467823414, + "grad_norm": 2.0693514347076416, + "learning_rate": 5e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7665260434150696, + "num_tokens": 542606521.0, + "step": 20960 + }, + { + "epoch": 2.301888864484955, + "grad_norm": 1.973366618156433, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7610587477684021, + "num_tokens": 542636421.0, + "step": 20961 + }, + { + "epoch": 2.301998682187569, + "grad_norm": 2.1529765129089355, + "learning_rate": 5e-06, + "loss": 0.6553, + "mean_token_accuracy": 0.7837306261062622, + "num_tokens": 542661238.0, + "step": 20962 + }, + { + "epoch": 2.302108499890182, + "grad_norm": 1.936297059059143, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.732634425163269, + "num_tokens": 542691173.0, + "step": 20963 + }, + { + "epoch": 2.302218317592796, + "grad_norm": 1.90292489528656, + "learning_rate": 5e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7612086534500122, + "num_tokens": 542722400.0, + "step": 20964 + }, + { + "epoch": 2.3023281352954097, + "grad_norm": 2.1822805404663086, + "learning_rate": 5e-06, + "loss": 0.589, + "mean_token_accuracy": 0.8044555187225342, + "num_tokens": 542744181.0, + "step": 20965 + }, + { + "epoch": 2.3024379529980235, + "grad_norm": 2.2146430015563965, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7614319324493408, + "num_tokens": 542770584.0, + "step": 20966 + }, + { + "epoch": 2.3025477707006368, + "grad_norm": 2.185849905014038, + "learning_rate": 5e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.7887911796569824, + "num_tokens": 542794892.0, + "step": 20967 + }, + { + "epoch": 2.3026575884032505, + "grad_norm": 1.975767731666565, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.762880802154541, + "num_tokens": 542823070.0, + "step": 20968 + }, + { + "epoch": 2.3027674061058643, + "grad_norm": 2.472437620162964, + "learning_rate": 5e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.7907044887542725, + "num_tokens": 542843183.0, + "step": 20969 + }, + { + "epoch": 2.302877223808478, + "grad_norm": 1.9369655847549438, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7736762762069702, + "num_tokens": 542873887.0, + "step": 20970 + }, + { + "epoch": 2.3029870415110913, + "grad_norm": 2.133664131164551, + "learning_rate": 5e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7712205052375793, + "num_tokens": 542898345.0, + "step": 20971 + }, + { + "epoch": 2.303096859213705, + "grad_norm": 2.150371789932251, + "learning_rate": 5e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7593836188316345, + "num_tokens": 542924652.0, + "step": 20972 + }, + { + "epoch": 2.303206676916319, + "grad_norm": 2.1103572845458984, + "learning_rate": 5e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7603778839111328, + "num_tokens": 542951083.0, + "step": 20973 + }, + { + "epoch": 2.3033164946189326, + "grad_norm": 2.007382392883301, + "learning_rate": 5e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.7491141557693481, + "num_tokens": 542979549.0, + "step": 20974 + }, + { + "epoch": 2.3034263123215464, + "grad_norm": 2.037310838699341, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7749511003494263, + "num_tokens": 543006414.0, + "step": 20975 + }, + { + "epoch": 2.3035361300241597, + "grad_norm": 2.2442073822021484, + "learning_rate": 5e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7701131105422974, + "num_tokens": 543029379.0, + "step": 20976 + }, + { + "epoch": 2.3036459477267734, + "grad_norm": 2.120018482208252, + "learning_rate": 5e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7563790678977966, + "num_tokens": 543058173.0, + "step": 20977 + }, + { + "epoch": 2.303755765429387, + "grad_norm": 2.097930669784546, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7579695582389832, + "num_tokens": 543084299.0, + "step": 20978 + }, + { + "epoch": 2.303865583132001, + "grad_norm": 2.1393940448760986, + "learning_rate": 5e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.7921345829963684, + "num_tokens": 543109097.0, + "step": 20979 + }, + { + "epoch": 2.3039754008346147, + "grad_norm": 2.099161386489868, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7685361504554749, + "num_tokens": 543135421.0, + "step": 20980 + }, + { + "epoch": 2.304085218537228, + "grad_norm": 2.3252880573272705, + "learning_rate": 5e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.7833676338195801, + "num_tokens": 543157258.0, + "step": 20981 + }, + { + "epoch": 2.304195036239842, + "grad_norm": 2.123926877975464, + "learning_rate": 5e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7683966755867004, + "num_tokens": 543182051.0, + "step": 20982 + }, + { + "epoch": 2.3043048539424555, + "grad_norm": 2.211888313293457, + "learning_rate": 5e-06, + "loss": 0.6587, + "mean_token_accuracy": 0.779363214969635, + "num_tokens": 543204671.0, + "step": 20983 + }, + { + "epoch": 2.3044146716450693, + "grad_norm": 1.9023829698562622, + "learning_rate": 5e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.7835156917572021, + "num_tokens": 543235803.0, + "step": 20984 + }, + { + "epoch": 2.304524489347683, + "grad_norm": 1.9885798692703247, + "learning_rate": 5e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.7823303937911987, + "num_tokens": 543261845.0, + "step": 20985 + }, + { + "epoch": 2.3046343070502964, + "grad_norm": 2.090404748916626, + "learning_rate": 5e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7699369788169861, + "num_tokens": 543285682.0, + "step": 20986 + }, + { + "epoch": 2.30474412475291, + "grad_norm": 2.1543149948120117, + "learning_rate": 5e-06, + "loss": 0.6698, + "mean_token_accuracy": 0.7781899571418762, + "num_tokens": 543310147.0, + "step": 20987 + }, + { + "epoch": 2.304853942455524, + "grad_norm": 2.0302085876464844, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.757987380027771, + "num_tokens": 543337818.0, + "step": 20988 + }, + { + "epoch": 2.3049637601581376, + "grad_norm": 1.8317539691925049, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7651634216308594, + "num_tokens": 543368386.0, + "step": 20989 + }, + { + "epoch": 2.3050735778607514, + "grad_norm": 2.171490430831909, + "learning_rate": 5e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.7862567901611328, + "num_tokens": 543391324.0, + "step": 20990 + }, + { + "epoch": 2.3051833955633647, + "grad_norm": 2.2870993614196777, + "learning_rate": 5e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.7768885493278503, + "num_tokens": 543414267.0, + "step": 20991 + }, + { + "epoch": 2.3052932132659785, + "grad_norm": 2.153977394104004, + "learning_rate": 5e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.7569442987442017, + "num_tokens": 543438981.0, + "step": 20992 + }, + { + "epoch": 2.3054030309685922, + "grad_norm": 2.2457737922668457, + "learning_rate": 5e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.772615909576416, + "num_tokens": 543461643.0, + "step": 20993 + }, + { + "epoch": 2.305512848671206, + "grad_norm": 1.9120150804519653, + "learning_rate": 5e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7559396624565125, + "num_tokens": 543493156.0, + "step": 20994 + }, + { + "epoch": 2.3056226663738193, + "grad_norm": 1.9434657096862793, + "learning_rate": 5e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7535452246665955, + "num_tokens": 543521804.0, + "step": 20995 + }, + { + "epoch": 2.305732484076433, + "grad_norm": 2.003065824508667, + "learning_rate": 5e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7521357536315918, + "num_tokens": 543553387.0, + "step": 20996 + }, + { + "epoch": 2.305842301779047, + "grad_norm": 2.4735732078552246, + "learning_rate": 5e-06, + "loss": 0.6107, + "mean_token_accuracy": 0.7919331789016724, + "num_tokens": 543572338.0, + "step": 20997 + }, + { + "epoch": 2.3059521194816606, + "grad_norm": 2.071533203125, + "learning_rate": 5e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.760352373123169, + "num_tokens": 543599832.0, + "step": 20998 + }, + { + "epoch": 2.306061937184274, + "grad_norm": 1.9527504444122314, + "learning_rate": 5e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.7573184967041016, + "num_tokens": 543631098.0, + "step": 20999 + }, + { + "epoch": 2.3061717548868876, + "grad_norm": 2.200629472732544, + "learning_rate": 5e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7640329599380493, + "num_tokens": 543655593.0, + "step": 21000 + }, + { + "epoch": 2.3062815725895014, + "grad_norm": 2.2688846588134766, + "learning_rate": 5e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.774486243724823, + "num_tokens": 543677988.0, + "step": 21001 + }, + { + "epoch": 2.306391390292115, + "grad_norm": 2.1686112880706787, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7685678601264954, + "num_tokens": 543703958.0, + "step": 21002 + }, + { + "epoch": 2.306501207994729, + "grad_norm": 2.005770683288574, + "learning_rate": 5e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7707812786102295, + "num_tokens": 543730526.0, + "step": 21003 + }, + { + "epoch": 2.306611025697342, + "grad_norm": 2.21044921875, + "learning_rate": 5e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.785441517829895, + "num_tokens": 543752951.0, + "step": 21004 + }, + { + "epoch": 2.306720843399956, + "grad_norm": 2.078948974609375, + "learning_rate": 5e-06, + "loss": 0.6468, + "mean_token_accuracy": 0.786382794380188, + "num_tokens": 543777224.0, + "step": 21005 + }, + { + "epoch": 2.3068306611025697, + "grad_norm": 2.0750842094421387, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.756600022315979, + "num_tokens": 543808180.0, + "step": 21006 + }, + { + "epoch": 2.3069404788051835, + "grad_norm": 2.210153818130493, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7704287767410278, + "num_tokens": 543831521.0, + "step": 21007 + }, + { + "epoch": 2.3070502965077972, + "grad_norm": 1.9930514097213745, + "learning_rate": 5e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7567459940910339, + "num_tokens": 543859329.0, + "step": 21008 + }, + { + "epoch": 2.3071601142104106, + "grad_norm": 1.916213870048523, + "learning_rate": 5e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7642115950584412, + "num_tokens": 543888219.0, + "step": 21009 + }, + { + "epoch": 2.3072699319130243, + "grad_norm": 2.1436257362365723, + "learning_rate": 5e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7736555337905884, + "num_tokens": 543912386.0, + "step": 21010 + }, + { + "epoch": 2.307379749615638, + "grad_norm": 2.3053812980651855, + "learning_rate": 5e-06, + "loss": 0.6683, + "mean_token_accuracy": 0.7711355090141296, + "num_tokens": 543932681.0, + "step": 21011 + }, + { + "epoch": 2.307489567318252, + "grad_norm": 2.312927484512329, + "learning_rate": 5e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7714002132415771, + "num_tokens": 543953643.0, + "step": 21012 + }, + { + "epoch": 2.3075993850208656, + "grad_norm": 2.1845312118530273, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7537796497344971, + "num_tokens": 543978618.0, + "step": 21013 + }, + { + "epoch": 2.307709202723479, + "grad_norm": 2.1011993885040283, + "learning_rate": 5e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.7649464011192322, + "num_tokens": 544004903.0, + "step": 21014 + }, + { + "epoch": 2.3078190204260927, + "grad_norm": 2.117269992828369, + "learning_rate": 5e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7683664560317993, + "num_tokens": 544034951.0, + "step": 21015 + }, + { + "epoch": 2.3079288381287064, + "grad_norm": 2.357389450073242, + "learning_rate": 5e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7547144889831543, + "num_tokens": 544057598.0, + "step": 21016 + }, + { + "epoch": 2.30803865583132, + "grad_norm": 2.3017477989196777, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7580090165138245, + "num_tokens": 544079872.0, + "step": 21017 + }, + { + "epoch": 2.3081484735339335, + "grad_norm": 2.2603840827941895, + "learning_rate": 5e-06, + "loss": 0.676, + "mean_token_accuracy": 0.7775964140892029, + "num_tokens": 544101922.0, + "step": 21018 + }, + { + "epoch": 2.3082582912365472, + "grad_norm": 2.2708511352539062, + "learning_rate": 5e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7626718282699585, + "num_tokens": 544126538.0, + "step": 21019 + }, + { + "epoch": 2.308368108939161, + "grad_norm": 1.9733577966690063, + "learning_rate": 5e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.7935497760772705, + "num_tokens": 544153460.0, + "step": 21020 + }, + { + "epoch": 2.3084779266417748, + "grad_norm": 2.363163709640503, + "learning_rate": 5e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.7949408292770386, + "num_tokens": 544172942.0, + "step": 21021 + }, + { + "epoch": 2.308587744344388, + "grad_norm": 2.01916241645813, + "learning_rate": 5e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7580158114433289, + "num_tokens": 544200276.0, + "step": 21022 + }, + { + "epoch": 2.308697562047002, + "grad_norm": 1.9877363443374634, + "learning_rate": 5e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.753696858882904, + "num_tokens": 544228461.0, + "step": 21023 + }, + { + "epoch": 2.3088073797496156, + "grad_norm": 2.165665626525879, + "learning_rate": 5e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7453336715698242, + "num_tokens": 544250433.0, + "step": 21024 + }, + { + "epoch": 2.3089171974522293, + "grad_norm": 1.951877236366272, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7713277339935303, + "num_tokens": 544278255.0, + "step": 21025 + }, + { + "epoch": 2.309027015154843, + "grad_norm": 2.1859185695648193, + "learning_rate": 5e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7610242366790771, + "num_tokens": 544302758.0, + "step": 21026 + }, + { + "epoch": 2.3091368328574564, + "grad_norm": 2.06562876701355, + "learning_rate": 5e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.7836481332778931, + "num_tokens": 544328280.0, + "step": 21027 + }, + { + "epoch": 2.30924665056007, + "grad_norm": 2.156782865524292, + "learning_rate": 5e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7758318185806274, + "num_tokens": 544355350.0, + "step": 21028 + }, + { + "epoch": 2.309356468262684, + "grad_norm": 1.928249716758728, + "learning_rate": 5e-06, + "loss": 0.6292, + "mean_token_accuracy": 0.7994125485420227, + "num_tokens": 544382750.0, + "step": 21029 + }, + { + "epoch": 2.3094662859652977, + "grad_norm": 2.0887930393218994, + "learning_rate": 5e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7609381079673767, + "num_tokens": 544408494.0, + "step": 21030 + }, + { + "epoch": 2.3095761036679114, + "grad_norm": 2.242823600769043, + "learning_rate": 5e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7648993134498596, + "num_tokens": 544430941.0, + "step": 21031 + }, + { + "epoch": 2.3096859213705248, + "grad_norm": 2.23374342918396, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7738602757453918, + "num_tokens": 544454549.0, + "step": 21032 + }, + { + "epoch": 2.3097957390731385, + "grad_norm": 2.2935001850128174, + "learning_rate": 5e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7608931660652161, + "num_tokens": 544478748.0, + "step": 21033 + }, + { + "epoch": 2.3099055567757523, + "grad_norm": 2.0803143978118896, + "learning_rate": 5e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.7835034132003784, + "num_tokens": 544504409.0, + "step": 21034 + }, + { + "epoch": 2.310015374478366, + "grad_norm": 1.872573733329773, + "learning_rate": 5e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7597028017044067, + "num_tokens": 544535801.0, + "step": 21035 + }, + { + "epoch": 2.31012519218098, + "grad_norm": 2.4141156673431396, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7704471349716187, + "num_tokens": 544557941.0, + "step": 21036 + }, + { + "epoch": 2.310235009883593, + "grad_norm": 2.266244411468506, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.77125483751297, + "num_tokens": 544582398.0, + "step": 21037 + }, + { + "epoch": 2.310344827586207, + "grad_norm": 2.1163077354431152, + "learning_rate": 5e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.7791965007781982, + "num_tokens": 544607429.0, + "step": 21038 + }, + { + "epoch": 2.3104546452888206, + "grad_norm": 2.3025074005126953, + "learning_rate": 5e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.7887285947799683, + "num_tokens": 544628820.0, + "step": 21039 + }, + { + "epoch": 2.3105644629914344, + "grad_norm": 2.334810256958008, + "learning_rate": 5e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7576841115951538, + "num_tokens": 544652276.0, + "step": 21040 + }, + { + "epoch": 2.310674280694048, + "grad_norm": 2.0073511600494385, + "learning_rate": 5e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7471539378166199, + "num_tokens": 544680818.0, + "step": 21041 + }, + { + "epoch": 2.3107840983966614, + "grad_norm": 2.252946138381958, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7559700608253479, + "num_tokens": 544702222.0, + "step": 21042 + }, + { + "epoch": 2.310893916099275, + "grad_norm": 2.1616876125335693, + "learning_rate": 5e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7613638043403625, + "num_tokens": 544728768.0, + "step": 21043 + }, + { + "epoch": 2.311003733801889, + "grad_norm": 2.6690170764923096, + "learning_rate": 5e-06, + "loss": 0.6435, + "mean_token_accuracy": 0.7865623831748962, + "num_tokens": 544746755.0, + "step": 21044 + }, + { + "epoch": 2.3111135515045027, + "grad_norm": 2.0392565727233887, + "learning_rate": 5e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.746356189250946, + "num_tokens": 544774986.0, + "step": 21045 + }, + { + "epoch": 2.311223369207116, + "grad_norm": 2.163588285446167, + "learning_rate": 5e-06, + "loss": 0.6035, + "mean_token_accuracy": 0.7964087724685669, + "num_tokens": 544796522.0, + "step": 21046 + }, + { + "epoch": 2.3113331869097298, + "grad_norm": 1.942211627960205, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7718169689178467, + "num_tokens": 544824182.0, + "step": 21047 + }, + { + "epoch": 2.3114430046123435, + "grad_norm": 1.7745414972305298, + "learning_rate": 5e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7502604722976685, + "num_tokens": 544858906.0, + "step": 21048 + }, + { + "epoch": 2.3115528223149573, + "grad_norm": 2.2559945583343506, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7600022554397583, + "num_tokens": 544883189.0, + "step": 21049 + }, + { + "epoch": 2.3116626400175706, + "grad_norm": 2.3583905696868896, + "learning_rate": 5e-06, + "loss": 0.6701, + "mean_token_accuracy": 0.7805514931678772, + "num_tokens": 544904770.0, + "step": 21050 + }, + { + "epoch": 2.3117724577201844, + "grad_norm": 2.009355068206787, + "learning_rate": 5e-06, + "loss": 0.6809, + "mean_token_accuracy": 0.773992657661438, + "num_tokens": 544931449.0, + "step": 21051 + }, + { + "epoch": 2.311882275422798, + "grad_norm": 2.0142812728881836, + "learning_rate": 5e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.7960233092308044, + "num_tokens": 544956121.0, + "step": 21052 + }, + { + "epoch": 2.311992093125412, + "grad_norm": 2.1702687740325928, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.77618408203125, + "num_tokens": 544980120.0, + "step": 21053 + }, + { + "epoch": 2.3121019108280256, + "grad_norm": 1.9544994831085205, + "learning_rate": 5e-06, + "loss": 0.6116, + "mean_token_accuracy": 0.7972805500030518, + "num_tokens": 545007664.0, + "step": 21054 + }, + { + "epoch": 2.312211728530639, + "grad_norm": 1.991566777229309, + "learning_rate": 5e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7623074054718018, + "num_tokens": 545040107.0, + "step": 21055 + }, + { + "epoch": 2.3123215462332527, + "grad_norm": 2.059380292892456, + "learning_rate": 5e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.773903489112854, + "num_tokens": 545066561.0, + "step": 21056 + }, + { + "epoch": 2.3124313639358665, + "grad_norm": 2.527540922164917, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7674268484115601, + "num_tokens": 545087613.0, + "step": 21057 + }, + { + "epoch": 2.31254118163848, + "grad_norm": 1.8518691062927246, + "learning_rate": 5e-06, + "loss": 0.7156, + "mean_token_accuracy": 0.7677294611930847, + "num_tokens": 545115409.0, + "step": 21058 + }, + { + "epoch": 2.312650999341094, + "grad_norm": 2.1458139419555664, + "learning_rate": 5e-06, + "loss": 0.6959, + "mean_token_accuracy": 0.7760559916496277, + "num_tokens": 545141448.0, + "step": 21059 + }, + { + "epoch": 2.3127608170437073, + "grad_norm": 1.9688754081726074, + "learning_rate": 5e-06, + "loss": 0.705, + "mean_token_accuracy": 0.772350549697876, + "num_tokens": 545168177.0, + "step": 21060 + }, + { + "epoch": 2.312870634746321, + "grad_norm": 1.9197782278060913, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7648021578788757, + "num_tokens": 545197419.0, + "step": 21061 + }, + { + "epoch": 2.312980452448935, + "grad_norm": 2.1207692623138428, + "learning_rate": 5e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7620246410369873, + "num_tokens": 545223540.0, + "step": 21062 + }, + { + "epoch": 2.3130902701515486, + "grad_norm": 2.1516964435577393, + "learning_rate": 5e-06, + "loss": 0.654, + "mean_token_accuracy": 0.7794179916381836, + "num_tokens": 545248271.0, + "step": 21063 + }, + { + "epoch": 2.3132000878541623, + "grad_norm": 2.1111953258514404, + "learning_rate": 5e-06, + "loss": 0.626, + "mean_token_accuracy": 0.7930728197097778, + "num_tokens": 545272088.0, + "step": 21064 + }, + { + "epoch": 2.3133099055567756, + "grad_norm": 2.494244337081909, + "learning_rate": 5e-06, + "loss": 0.5808, + "mean_token_accuracy": 0.8092182278633118, + "num_tokens": 545289267.0, + "step": 21065 + }, + { + "epoch": 2.3134197232593894, + "grad_norm": 2.0808603763580322, + "learning_rate": 5e-06, + "loss": 0.661, + "mean_token_accuracy": 0.7781691551208496, + "num_tokens": 545312488.0, + "step": 21066 + }, + { + "epoch": 2.313529540962003, + "grad_norm": 2.0207056999206543, + "learning_rate": 5e-06, + "loss": 0.6449, + "mean_token_accuracy": 0.787448525428772, + "num_tokens": 545337518.0, + "step": 21067 + }, + { + "epoch": 2.313639358664617, + "grad_norm": 2.2654144763946533, + "learning_rate": 5e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7590101957321167, + "num_tokens": 545364629.0, + "step": 21068 + }, + { + "epoch": 2.31374917636723, + "grad_norm": 1.9449496269226074, + "learning_rate": 5e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.747911274433136, + "num_tokens": 545395069.0, + "step": 21069 + }, + { + "epoch": 2.313858994069844, + "grad_norm": 2.1088716983795166, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7558957934379578, + "num_tokens": 545421829.0, + "step": 21070 + }, + { + "epoch": 2.3139688117724577, + "grad_norm": 2.3491337299346924, + "learning_rate": 5e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7595090866088867, + "num_tokens": 545444734.0, + "step": 21071 + }, + { + "epoch": 2.3140786294750715, + "grad_norm": 1.9008172750473022, + "learning_rate": 5e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7775961756706238, + "num_tokens": 545472763.0, + "step": 21072 + }, + { + "epoch": 2.314188447177685, + "grad_norm": 2.359951972961426, + "learning_rate": 5e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7622785568237305, + "num_tokens": 545492773.0, + "step": 21073 + }, + { + "epoch": 2.3142982648802986, + "grad_norm": 1.9600331783294678, + "learning_rate": 5e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7727705240249634, + "num_tokens": 545520931.0, + "step": 21074 + }, + { + "epoch": 2.3144080825829123, + "grad_norm": 2.159043073654175, + "learning_rate": 5e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7707216739654541, + "num_tokens": 545546054.0, + "step": 21075 + }, + { + "epoch": 2.314517900285526, + "grad_norm": 2.5256400108337402, + "learning_rate": 5e-06, + "loss": 0.642, + "mean_token_accuracy": 0.7813995480537415, + "num_tokens": 545564518.0, + "step": 21076 + }, + { + "epoch": 2.31462771798814, + "grad_norm": 1.9510644674301147, + "learning_rate": 5e-06, + "loss": 0.6767, + "mean_token_accuracy": 0.7752355933189392, + "num_tokens": 545591946.0, + "step": 21077 + }, + { + "epoch": 2.314737535690753, + "grad_norm": 1.7798638343811035, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7623462677001953, + "num_tokens": 545627043.0, + "step": 21078 + }, + { + "epoch": 2.314847353393367, + "grad_norm": 2.0397934913635254, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.7725523710250854, + "num_tokens": 545652435.0, + "step": 21079 + }, + { + "epoch": 2.3149571710959806, + "grad_norm": 2.2066445350646973, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7593704462051392, + "num_tokens": 545677942.0, + "step": 21080 + }, + { + "epoch": 2.3150669887985944, + "grad_norm": 2.5119638442993164, + "learning_rate": 5e-06, + "loss": 0.6228, + "mean_token_accuracy": 0.7972996234893799, + "num_tokens": 545696574.0, + "step": 21081 + }, + { + "epoch": 2.315176806501208, + "grad_norm": 2.2423839569091797, + "learning_rate": 5e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7587882876396179, + "num_tokens": 545723146.0, + "step": 21082 + }, + { + "epoch": 2.3152866242038215, + "grad_norm": 2.1768596172332764, + "learning_rate": 5e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.7789808511734009, + "num_tokens": 545749608.0, + "step": 21083 + }, + { + "epoch": 2.3153964419064352, + "grad_norm": 2.1896626949310303, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7631015181541443, + "num_tokens": 545774689.0, + "step": 21084 + }, + { + "epoch": 2.315506259609049, + "grad_norm": 2.18489933013916, + "learning_rate": 5e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.7852402925491333, + "num_tokens": 545799225.0, + "step": 21085 + }, + { + "epoch": 2.3156160773116627, + "grad_norm": 2.217237710952759, + "learning_rate": 5e-06, + "loss": 0.5905, + "mean_token_accuracy": 0.8074597716331482, + "num_tokens": 545819748.0, + "step": 21086 + }, + { + "epoch": 2.3157258950142765, + "grad_norm": 2.1730618476867676, + "learning_rate": 5e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7612521052360535, + "num_tokens": 545845124.0, + "step": 21087 + }, + { + "epoch": 2.31583571271689, + "grad_norm": 1.9401636123657227, + "learning_rate": 5e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7655669450759888, + "num_tokens": 545873295.0, + "step": 21088 + }, + { + "epoch": 2.3159455304195036, + "grad_norm": 2.0144264698028564, + "learning_rate": 5e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.7614892721176147, + "num_tokens": 545902093.0, + "step": 21089 + }, + { + "epoch": 2.3160553481221173, + "grad_norm": 2.245734930038452, + "learning_rate": 5e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.7779544591903687, + "num_tokens": 545925395.0, + "step": 21090 + }, + { + "epoch": 2.316165165824731, + "grad_norm": 2.3917696475982666, + "learning_rate": 5e-06, + "loss": 0.6287, + "mean_token_accuracy": 0.7886514663696289, + "num_tokens": 545945258.0, + "step": 21091 + }, + { + "epoch": 2.316274983527345, + "grad_norm": 1.9313671588897705, + "learning_rate": 5e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7287009954452515, + "num_tokens": 545978115.0, + "step": 21092 + }, + { + "epoch": 2.316384801229958, + "grad_norm": 2.4241347312927246, + "learning_rate": 5e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7730230093002319, + "num_tokens": 545999307.0, + "step": 21093 + }, + { + "epoch": 2.316494618932572, + "grad_norm": 1.9699994325637817, + "learning_rate": 5e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7645944952964783, + "num_tokens": 546031059.0, + "step": 21094 + }, + { + "epoch": 2.3166044366351857, + "grad_norm": 2.121321439743042, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7585073113441467, + "num_tokens": 546059210.0, + "step": 21095 + }, + { + "epoch": 2.3167142543377994, + "grad_norm": 1.9982366561889648, + "learning_rate": 5e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7498974800109863, + "num_tokens": 546089632.0, + "step": 21096 + }, + { + "epoch": 2.3168240720404127, + "grad_norm": 2.200098752975464, + "learning_rate": 5e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7629421949386597, + "num_tokens": 546114862.0, + "step": 21097 + }, + { + "epoch": 2.3169338897430265, + "grad_norm": 2.0315608978271484, + "learning_rate": 5e-06, + "loss": 0.6395, + "mean_token_accuracy": 0.7769259810447693, + "num_tokens": 546140413.0, + "step": 21098 + }, + { + "epoch": 2.3170437074456403, + "grad_norm": 2.5004003047943115, + "learning_rate": 5e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.7760955095291138, + "num_tokens": 546159345.0, + "step": 21099 + }, + { + "epoch": 2.317153525148254, + "grad_norm": 1.9001412391662598, + "learning_rate": 5e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7473441362380981, + "num_tokens": 546188677.0, + "step": 21100 + }, + { + "epoch": 2.3172633428508673, + "grad_norm": 2.342620611190796, + "learning_rate": 5e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.7677426338195801, + "num_tokens": 546210600.0, + "step": 21101 + }, + { + "epoch": 2.317373160553481, + "grad_norm": 2.031836986541748, + "learning_rate": 5e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.766209602355957, + "num_tokens": 546239449.0, + "step": 21102 + }, + { + "epoch": 2.317482978256095, + "grad_norm": 2.0458903312683105, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7490601539611816, + "num_tokens": 546265798.0, + "step": 21103 + }, + { + "epoch": 2.3175927959587086, + "grad_norm": 1.927565574645996, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7485121488571167, + "num_tokens": 546296302.0, + "step": 21104 + }, + { + "epoch": 2.3177026136613224, + "grad_norm": 2.029555320739746, + "learning_rate": 5e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7568193674087524, + "num_tokens": 546323889.0, + "step": 21105 + }, + { + "epoch": 2.3178124313639357, + "grad_norm": 2.031621217727661, + "learning_rate": 5e-06, + "loss": 0.736, + "mean_token_accuracy": 0.758161723613739, + "num_tokens": 546349452.0, + "step": 21106 + }, + { + "epoch": 2.3179222490665494, + "grad_norm": 1.8147170543670654, + "learning_rate": 5e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.7654318809509277, + "num_tokens": 546384737.0, + "step": 21107 + }, + { + "epoch": 2.318032066769163, + "grad_norm": 2.106558322906494, + "learning_rate": 5e-06, + "loss": 0.5851, + "mean_token_accuracy": 0.8022366166114807, + "num_tokens": 546407695.0, + "step": 21108 + }, + { + "epoch": 2.318141884471777, + "grad_norm": 2.009309768676758, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7743273973464966, + "num_tokens": 546434612.0, + "step": 21109 + }, + { + "epoch": 2.3182517021743907, + "grad_norm": 2.022710084915161, + "learning_rate": 5e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7777962684631348, + "num_tokens": 546458962.0, + "step": 21110 + }, + { + "epoch": 2.318361519877004, + "grad_norm": 2.423604726791382, + "learning_rate": 5e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7614889740943909, + "num_tokens": 546483119.0, + "step": 21111 + }, + { + "epoch": 2.3184713375796178, + "grad_norm": 2.101229667663574, + "learning_rate": 5e-06, + "loss": 0.6597, + "mean_token_accuracy": 0.781369686126709, + "num_tokens": 546509187.0, + "step": 21112 + }, + { + "epoch": 2.3185811552822315, + "grad_norm": 1.9564591646194458, + "learning_rate": 5e-06, + "loss": 0.6712, + "mean_token_accuracy": 0.7802810072898865, + "num_tokens": 546539530.0, + "step": 21113 + }, + { + "epoch": 2.3186909729848453, + "grad_norm": 2.1905593872070312, + "learning_rate": 5e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7693551778793335, + "num_tokens": 546565285.0, + "step": 21114 + }, + { + "epoch": 2.318800790687459, + "grad_norm": 1.9203957319259644, + "learning_rate": 5e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7604738473892212, + "num_tokens": 546595331.0, + "step": 21115 + }, + { + "epoch": 2.3189106083900723, + "grad_norm": 2.2924468517303467, + "learning_rate": 5e-06, + "loss": 0.6757, + "mean_token_accuracy": 0.7773340344429016, + "num_tokens": 546618125.0, + "step": 21116 + }, + { + "epoch": 2.319020426092686, + "grad_norm": 2.6481003761291504, + "learning_rate": 5e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.778395414352417, + "num_tokens": 546636032.0, + "step": 21117 + }, + { + "epoch": 2.3191302437953, + "grad_norm": 2.1829333305358887, + "learning_rate": 5e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7520203590393066, + "num_tokens": 546663017.0, + "step": 21118 + }, + { + "epoch": 2.3192400614979136, + "grad_norm": 2.0598480701446533, + "learning_rate": 5e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7695357203483582, + "num_tokens": 546690601.0, + "step": 21119 + }, + { + "epoch": 2.319349879200527, + "grad_norm": 1.9031622409820557, + "learning_rate": 5e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7589784860610962, + "num_tokens": 546718047.0, + "step": 21120 + }, + { + "epoch": 2.3194596969031407, + "grad_norm": 2.089475393295288, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7742711305618286, + "num_tokens": 546744116.0, + "step": 21121 + }, + { + "epoch": 2.3195695146057544, + "grad_norm": 2.5499157905578613, + "learning_rate": 5e-06, + "loss": 0.6663, + "mean_token_accuracy": 0.7786394953727722, + "num_tokens": 546762231.0, + "step": 21122 + }, + { + "epoch": 2.319679332308368, + "grad_norm": 2.434384346008301, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.763889491558075, + "num_tokens": 546784408.0, + "step": 21123 + }, + { + "epoch": 2.319789150010982, + "grad_norm": 1.9253766536712646, + "learning_rate": 5e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.7701089978218079, + "num_tokens": 546813790.0, + "step": 21124 + }, + { + "epoch": 2.3198989677135953, + "grad_norm": 1.8820233345031738, + "learning_rate": 5e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.766960859298706, + "num_tokens": 546842114.0, + "step": 21125 + }, + { + "epoch": 2.320008785416209, + "grad_norm": 2.0872223377227783, + "learning_rate": 5e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7661902904510498, + "num_tokens": 546866666.0, + "step": 21126 + }, + { + "epoch": 2.320118603118823, + "grad_norm": 1.997962236404419, + "learning_rate": 5e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7593804001808167, + "num_tokens": 546895288.0, + "step": 21127 + }, + { + "epoch": 2.3202284208214365, + "grad_norm": 1.8995215892791748, + "learning_rate": 5e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.759349524974823, + "num_tokens": 546923870.0, + "step": 21128 + }, + { + "epoch": 2.32033823852405, + "grad_norm": 2.0786139965057373, + "learning_rate": 5e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7681005001068115, + "num_tokens": 546949173.0, + "step": 21129 + }, + { + "epoch": 2.3204480562266636, + "grad_norm": 1.8637678623199463, + "learning_rate": 5e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7574523687362671, + "num_tokens": 546981445.0, + "step": 21130 + }, + { + "epoch": 2.3205578739292774, + "grad_norm": 1.9808666706085205, + "learning_rate": 5e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7387362718582153, + "num_tokens": 547008156.0, + "step": 21131 + }, + { + "epoch": 2.320667691631891, + "grad_norm": 2.203101396560669, + "learning_rate": 5e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7582616806030273, + "num_tokens": 547033225.0, + "step": 21132 + }, + { + "epoch": 2.320777509334505, + "grad_norm": 2.293307304382324, + "learning_rate": 5e-06, + "loss": 0.5802, + "mean_token_accuracy": 0.8019089698791504, + "num_tokens": 547051522.0, + "step": 21133 + }, + { + "epoch": 2.320887327037118, + "grad_norm": 2.2611641883850098, + "learning_rate": 5e-06, + "loss": 0.665, + "mean_token_accuracy": 0.7770237922668457, + "num_tokens": 547074612.0, + "step": 21134 + }, + { + "epoch": 2.320997144739732, + "grad_norm": 2.4275007247924805, + "learning_rate": 5e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7719078063964844, + "num_tokens": 547094102.0, + "step": 21135 + }, + { + "epoch": 2.3211069624423457, + "grad_norm": 2.1687772274017334, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.757784903049469, + "num_tokens": 547121807.0, + "step": 21136 + }, + { + "epoch": 2.3212167801449595, + "grad_norm": 2.0438082218170166, + "learning_rate": 5e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.766209602355957, + "num_tokens": 547149576.0, + "step": 21137 + }, + { + "epoch": 2.3213265978475732, + "grad_norm": 1.891972303390503, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7564826011657715, + "num_tokens": 547179622.0, + "step": 21138 + }, + { + "epoch": 2.3214364155501865, + "grad_norm": 2.3859517574310303, + "learning_rate": 5e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.7848129272460938, + "num_tokens": 547200185.0, + "step": 21139 + }, + { + "epoch": 2.3215462332528003, + "grad_norm": 2.396094560623169, + "learning_rate": 5e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.7794064283370972, + "num_tokens": 547221015.0, + "step": 21140 + }, + { + "epoch": 2.321656050955414, + "grad_norm": 2.019052267074585, + "learning_rate": 5e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7663589715957642, + "num_tokens": 547247724.0, + "step": 21141 + }, + { + "epoch": 2.321765868658028, + "grad_norm": 2.0275919437408447, + "learning_rate": 5e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7784517407417297, + "num_tokens": 547273950.0, + "step": 21142 + }, + { + "epoch": 2.3218756863606416, + "grad_norm": 2.116832971572876, + "learning_rate": 5e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7584060430526733, + "num_tokens": 547300379.0, + "step": 21143 + }, + { + "epoch": 2.321985504063255, + "grad_norm": 2.0869643688201904, + "learning_rate": 5e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7533857822418213, + "num_tokens": 547324823.0, + "step": 21144 + }, + { + "epoch": 2.3220953217658686, + "grad_norm": 1.9850059747695923, + "learning_rate": 5e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7591354846954346, + "num_tokens": 547352430.0, + "step": 21145 + }, + { + "epoch": 2.3222051394684824, + "grad_norm": 1.9955894947052002, + "learning_rate": 5e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7764682173728943, + "num_tokens": 547380585.0, + "step": 21146 + }, + { + "epoch": 2.322314957171096, + "grad_norm": 2.035871744155884, + "learning_rate": 5e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.7728561162948608, + "num_tokens": 547410070.0, + "step": 21147 + }, + { + "epoch": 2.3224247748737095, + "grad_norm": 2.021688938140869, + "learning_rate": 5e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7712182998657227, + "num_tokens": 547438288.0, + "step": 21148 + }, + { + "epoch": 2.322534592576323, + "grad_norm": 2.117966413497925, + "learning_rate": 5e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7736897468566895, + "num_tokens": 547461434.0, + "step": 21149 + }, + { + "epoch": 2.322644410278937, + "grad_norm": 2.4887499809265137, + "learning_rate": 5e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.7777928113937378, + "num_tokens": 547480469.0, + "step": 21150 + }, + { + "epoch": 2.3227542279815507, + "grad_norm": 1.7766759395599365, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7412625551223755, + "num_tokens": 547516895.0, + "step": 21151 + }, + { + "epoch": 2.322864045684164, + "grad_norm": 2.225968360900879, + "learning_rate": 5e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7663283348083496, + "num_tokens": 547540470.0, + "step": 21152 + }, + { + "epoch": 2.322973863386778, + "grad_norm": 2.240574598312378, + "learning_rate": 5e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7575850486755371, + "num_tokens": 547565929.0, + "step": 21153 + }, + { + "epoch": 2.3230836810893916, + "grad_norm": 2.5157310962677, + "learning_rate": 5e-06, + "loss": 0.5958, + "mean_token_accuracy": 0.7982110977172852, + "num_tokens": 547584418.0, + "step": 21154 + }, + { + "epoch": 2.3231934987920053, + "grad_norm": 2.1681203842163086, + "learning_rate": 5e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7511569261550903, + "num_tokens": 547608148.0, + "step": 21155 + }, + { + "epoch": 2.323303316494619, + "grad_norm": 2.2281460762023926, + "learning_rate": 5e-06, + "loss": 0.6922, + "mean_token_accuracy": 0.7764801979064941, + "num_tokens": 547631529.0, + "step": 21156 + }, + { + "epoch": 2.3234131341972324, + "grad_norm": 2.0964550971984863, + "learning_rate": 5e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7634936571121216, + "num_tokens": 547657984.0, + "step": 21157 + }, + { + "epoch": 2.323522951899846, + "grad_norm": 2.1167843341827393, + "learning_rate": 5e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.750468909740448, + "num_tokens": 547684213.0, + "step": 21158 + }, + { + "epoch": 2.32363276960246, + "grad_norm": 2.4444122314453125, + "learning_rate": 5e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.763137698173523, + "num_tokens": 547705993.0, + "step": 21159 + }, + { + "epoch": 2.3237425873050737, + "grad_norm": 2.1441502571105957, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7468326091766357, + "num_tokens": 547730911.0, + "step": 21160 + }, + { + "epoch": 2.3238524050076874, + "grad_norm": 2.247471570968628, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7614549398422241, + "num_tokens": 547754374.0, + "step": 21161 + }, + { + "epoch": 2.3239622227103007, + "grad_norm": 1.8578661680221558, + "learning_rate": 5e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7630652189254761, + "num_tokens": 547785435.0, + "step": 21162 + }, + { + "epoch": 2.3240720404129145, + "grad_norm": 1.9565855264663696, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7760587930679321, + "num_tokens": 547816508.0, + "step": 21163 + }, + { + "epoch": 2.3241818581155282, + "grad_norm": 2.0810134410858154, + "learning_rate": 5e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7495090961456299, + "num_tokens": 547844366.0, + "step": 21164 + }, + { + "epoch": 2.324291675818142, + "grad_norm": 1.9604315757751465, + "learning_rate": 5e-06, + "loss": 0.6923, + "mean_token_accuracy": 0.7650519609451294, + "num_tokens": 547873492.0, + "step": 21165 + }, + { + "epoch": 2.3244014935207558, + "grad_norm": 2.301177740097046, + "learning_rate": 5e-06, + "loss": 0.6548, + "mean_token_accuracy": 0.7804791927337646, + "num_tokens": 547892624.0, + "step": 21166 + }, + { + "epoch": 2.324511311223369, + "grad_norm": 2.2470850944519043, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7637141942977905, + "num_tokens": 547913141.0, + "step": 21167 + }, + { + "epoch": 2.324621128925983, + "grad_norm": 1.9056380987167358, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7730562090873718, + "num_tokens": 547939333.0, + "step": 21168 + }, + { + "epoch": 2.3247309466285966, + "grad_norm": 1.8841161727905273, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7599679231643677, + "num_tokens": 547968290.0, + "step": 21169 + }, + { + "epoch": 2.3248407643312103, + "grad_norm": 2.1408419609069824, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7725398540496826, + "num_tokens": 547993318.0, + "step": 21170 + }, + { + "epoch": 2.324950582033824, + "grad_norm": 2.031369209289551, + "learning_rate": 5e-06, + "loss": 0.6788, + "mean_token_accuracy": 0.7814099788665771, + "num_tokens": 548021671.0, + "step": 21171 + }, + { + "epoch": 2.3250603997364374, + "grad_norm": 2.2475945949554443, + "learning_rate": 5e-06, + "loss": 0.5942, + "mean_token_accuracy": 0.7970970273017883, + "num_tokens": 548044293.0, + "step": 21172 + }, + { + "epoch": 2.325170217439051, + "grad_norm": 2.0881118774414062, + "learning_rate": 5e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.7796458601951599, + "num_tokens": 548067290.0, + "step": 21173 + }, + { + "epoch": 2.325280035141665, + "grad_norm": 1.9230616092681885, + "learning_rate": 5e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.750512957572937, + "num_tokens": 548096980.0, + "step": 21174 + }, + { + "epoch": 2.3253898528442787, + "grad_norm": 1.9922575950622559, + "learning_rate": 5e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7520564198493958, + "num_tokens": 548127445.0, + "step": 21175 + }, + { + "epoch": 2.325499670546892, + "grad_norm": 2.110013246536255, + "learning_rate": 5e-06, + "loss": 0.6639, + "mean_token_accuracy": 0.7842921018600464, + "num_tokens": 548152289.0, + "step": 21176 + }, + { + "epoch": 2.3256094882495058, + "grad_norm": 2.597410202026367, + "learning_rate": 5e-06, + "loss": 0.6239, + "mean_token_accuracy": 0.7967101335525513, + "num_tokens": 548169349.0, + "step": 21177 + }, + { + "epoch": 2.3257193059521195, + "grad_norm": 2.0799567699432373, + "learning_rate": 5e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7518720626831055, + "num_tokens": 548194503.0, + "step": 21178 + }, + { + "epoch": 2.3258291236547333, + "grad_norm": 2.3147873878479004, + "learning_rate": 5e-06, + "loss": 0.6167, + "mean_token_accuracy": 0.7888023853302002, + "num_tokens": 548215167.0, + "step": 21179 + }, + { + "epoch": 2.3259389413573466, + "grad_norm": 1.9001675844192505, + "learning_rate": 5e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7777668237686157, + "num_tokens": 548239738.0, + "step": 21180 + }, + { + "epoch": 2.3260487590599603, + "grad_norm": 2.0864882469177246, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7767971158027649, + "num_tokens": 548264183.0, + "step": 21181 + }, + { + "epoch": 2.326158576762574, + "grad_norm": 2.0485129356384277, + "learning_rate": 5e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.758819580078125, + "num_tokens": 548292154.0, + "step": 21182 + }, + { + "epoch": 2.326268394465188, + "grad_norm": 2.3324055671691895, + "learning_rate": 5e-06, + "loss": 0.6383, + "mean_token_accuracy": 0.7866355180740356, + "num_tokens": 548312683.0, + "step": 21183 + }, + { + "epoch": 2.3263782121678016, + "grad_norm": 2.458522319793701, + "learning_rate": 5e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7786638736724854, + "num_tokens": 548333268.0, + "step": 21184 + }, + { + "epoch": 2.326488029870415, + "grad_norm": 2.165113687515259, + "learning_rate": 5e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.7797978520393372, + "num_tokens": 548356243.0, + "step": 21185 + }, + { + "epoch": 2.3265978475730287, + "grad_norm": 2.059908628463745, + "learning_rate": 5e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.7762203216552734, + "num_tokens": 548380770.0, + "step": 21186 + }, + { + "epoch": 2.3267076652756424, + "grad_norm": 1.939110279083252, + "learning_rate": 5e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7467957735061646, + "num_tokens": 548411697.0, + "step": 21187 + }, + { + "epoch": 2.326817482978256, + "grad_norm": 2.009643793106079, + "learning_rate": 5e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.7839934825897217, + "num_tokens": 548437992.0, + "step": 21188 + }, + { + "epoch": 2.32692730068087, + "grad_norm": 2.1390504837036133, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.777474582195282, + "num_tokens": 548460742.0, + "step": 21189 + }, + { + "epoch": 2.3270371183834833, + "grad_norm": 2.154597759246826, + "learning_rate": 5e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7468691468238831, + "num_tokens": 548487727.0, + "step": 21190 + }, + { + "epoch": 2.327146936086097, + "grad_norm": 1.9040488004684448, + "learning_rate": 5e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7527753114700317, + "num_tokens": 548519016.0, + "step": 21191 + }, + { + "epoch": 2.3272567537887108, + "grad_norm": 2.3047521114349365, + "learning_rate": 5e-06, + "loss": 0.6494, + "mean_token_accuracy": 0.7839242219924927, + "num_tokens": 548539212.0, + "step": 21192 + }, + { + "epoch": 2.3273665714913245, + "grad_norm": 2.13440203666687, + "learning_rate": 5e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7604137659072876, + "num_tokens": 548565203.0, + "step": 21193 + }, + { + "epoch": 2.3274763891939383, + "grad_norm": 1.9428220987319946, + "learning_rate": 5e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7571572065353394, + "num_tokens": 548593904.0, + "step": 21194 + }, + { + "epoch": 2.3275862068965516, + "grad_norm": 2.1231484413146973, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7750519514083862, + "num_tokens": 548618773.0, + "step": 21195 + }, + { + "epoch": 2.3276960245991654, + "grad_norm": 2.259493350982666, + "learning_rate": 5e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.7603784799575806, + "num_tokens": 548643091.0, + "step": 21196 + }, + { + "epoch": 2.327805842301779, + "grad_norm": 2.0260937213897705, + "learning_rate": 5e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7689890265464783, + "num_tokens": 548669354.0, + "step": 21197 + }, + { + "epoch": 2.327915660004393, + "grad_norm": 2.1598854064941406, + "learning_rate": 5e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.758870542049408, + "num_tokens": 548697133.0, + "step": 21198 + }, + { + "epoch": 2.328025477707006, + "grad_norm": 2.091489315032959, + "learning_rate": 5e-06, + "loss": 0.6222, + "mean_token_accuracy": 0.7893053889274597, + "num_tokens": 548721369.0, + "step": 21199 + }, + { + "epoch": 2.32813529540962, + "grad_norm": 1.9393994808197021, + "learning_rate": 5e-06, + "loss": 0.6757, + "mean_token_accuracy": 0.7719453573226929, + "num_tokens": 548750613.0, + "step": 21200 + }, + { + "epoch": 2.3282451131122337, + "grad_norm": 2.0073564052581787, + "learning_rate": 5e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.7519917488098145, + "num_tokens": 548781341.0, + "step": 21201 + }, + { + "epoch": 2.3283549308148475, + "grad_norm": 2.0435831546783447, + "learning_rate": 5e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7610898017883301, + "num_tokens": 548808545.0, + "step": 21202 + }, + { + "epoch": 2.3284647485174608, + "grad_norm": 2.4452998638153076, + "learning_rate": 5e-06, + "loss": 0.697, + "mean_token_accuracy": 0.7819191813468933, + "num_tokens": 548828697.0, + "step": 21203 + }, + { + "epoch": 2.3285745662200745, + "grad_norm": 2.233289957046509, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.755364716053009, + "num_tokens": 548851934.0, + "step": 21204 + }, + { + "epoch": 2.3286843839226883, + "grad_norm": 1.9912528991699219, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7656550407409668, + "num_tokens": 548880260.0, + "step": 21205 + }, + { + "epoch": 2.328794201625302, + "grad_norm": 2.0443825721740723, + "learning_rate": 5e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7589408755302429, + "num_tokens": 548911397.0, + "step": 21206 + }, + { + "epoch": 2.328904019327916, + "grad_norm": 2.1391220092773438, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7700165510177612, + "num_tokens": 548939803.0, + "step": 21207 + }, + { + "epoch": 2.329013837030529, + "grad_norm": 1.877787470817566, + "learning_rate": 5e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7634349465370178, + "num_tokens": 548971910.0, + "step": 21208 + }, + { + "epoch": 2.329123654733143, + "grad_norm": 2.1659178733825684, + "learning_rate": 5e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7340604066848755, + "num_tokens": 548999989.0, + "step": 21209 + }, + { + "epoch": 2.3292334724357566, + "grad_norm": 2.1053993701934814, + "learning_rate": 5e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7546626329421997, + "num_tokens": 549027165.0, + "step": 21210 + }, + { + "epoch": 2.3293432901383704, + "grad_norm": 2.5000884532928467, + "learning_rate": 5e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7593995332717896, + "num_tokens": 549053158.0, + "step": 21211 + }, + { + "epoch": 2.329453107840984, + "grad_norm": 2.0668883323669434, + "learning_rate": 5e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7641845941543579, + "num_tokens": 549080555.0, + "step": 21212 + }, + { + "epoch": 2.3295629255435975, + "grad_norm": 1.913892149925232, + "learning_rate": 5e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7661916613578796, + "num_tokens": 549110681.0, + "step": 21213 + }, + { + "epoch": 2.329672743246211, + "grad_norm": 2.0335967540740967, + "learning_rate": 5e-06, + "loss": 0.6627, + "mean_token_accuracy": 0.7854578495025635, + "num_tokens": 549136734.0, + "step": 21214 + }, + { + "epoch": 2.329782560948825, + "grad_norm": 2.1696083545684814, + "learning_rate": 5e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7487965822219849, + "num_tokens": 549163447.0, + "step": 21215 + }, + { + "epoch": 2.3298923786514387, + "grad_norm": 1.9915190935134888, + "learning_rate": 5e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7311734557151794, + "num_tokens": 549194662.0, + "step": 21216 + }, + { + "epoch": 2.3300021963540525, + "grad_norm": 2.147258996963501, + "learning_rate": 5e-06, + "loss": 0.68, + "mean_token_accuracy": 0.7795281410217285, + "num_tokens": 549217656.0, + "step": 21217 + }, + { + "epoch": 2.330112014056666, + "grad_norm": 2.1075122356414795, + "learning_rate": 5e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7561299800872803, + "num_tokens": 549242673.0, + "step": 21218 + }, + { + "epoch": 2.3302218317592795, + "grad_norm": 1.6679764986038208, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7592630386352539, + "num_tokens": 549279897.0, + "step": 21219 + }, + { + "epoch": 2.3303316494618933, + "grad_norm": 2.0928378105163574, + "learning_rate": 5e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.7437790632247925, + "num_tokens": 549306976.0, + "step": 21220 + }, + { + "epoch": 2.330441467164507, + "grad_norm": 2.3302595615386963, + "learning_rate": 5e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7664698362350464, + "num_tokens": 549328343.0, + "step": 21221 + }, + { + "epoch": 2.330551284867121, + "grad_norm": 1.8900529146194458, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7560580968856812, + "num_tokens": 549357570.0, + "step": 21222 + }, + { + "epoch": 2.330661102569734, + "grad_norm": 2.242387294769287, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7621167898178101, + "num_tokens": 549381094.0, + "step": 21223 + }, + { + "epoch": 2.330770920272348, + "grad_norm": 2.25834059715271, + "learning_rate": 5e-06, + "loss": 0.6592, + "mean_token_accuracy": 0.7812700271606445, + "num_tokens": 549403357.0, + "step": 21224 + }, + { + "epoch": 2.3308807379749616, + "grad_norm": 1.9287309646606445, + "learning_rate": 5e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7444311380386353, + "num_tokens": 549430656.0, + "step": 21225 + }, + { + "epoch": 2.3309905556775754, + "grad_norm": 2.050752639770508, + "learning_rate": 5e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7572650909423828, + "num_tokens": 549455036.0, + "step": 21226 + }, + { + "epoch": 2.3311003733801887, + "grad_norm": 2.1101036071777344, + "learning_rate": 5e-06, + "loss": 0.618, + "mean_token_accuracy": 0.7931089997291565, + "num_tokens": 549478768.0, + "step": 21227 + }, + { + "epoch": 2.3312101910828025, + "grad_norm": 2.195797920227051, + "learning_rate": 5e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7606368660926819, + "num_tokens": 549503086.0, + "step": 21228 + }, + { + "epoch": 2.3313200087854162, + "grad_norm": 2.056044340133667, + "learning_rate": 5e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7681972980499268, + "num_tokens": 549526210.0, + "step": 21229 + }, + { + "epoch": 2.33142982648803, + "grad_norm": 1.8036746978759766, + "learning_rate": 5e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7552573084831238, + "num_tokens": 549558790.0, + "step": 21230 + }, + { + "epoch": 2.3315396441906433, + "grad_norm": 2.074118137359619, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7637172937393188, + "num_tokens": 549584194.0, + "step": 21231 + }, + { + "epoch": 2.331649461893257, + "grad_norm": 1.959550380706787, + "learning_rate": 5e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.7814505100250244, + "num_tokens": 549611015.0, + "step": 21232 + }, + { + "epoch": 2.331759279595871, + "grad_norm": 2.1452839374542236, + "learning_rate": 5e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7550215721130371, + "num_tokens": 549637163.0, + "step": 21233 + }, + { + "epoch": 2.3318690972984846, + "grad_norm": 2.181487798690796, + "learning_rate": 5e-06, + "loss": 0.6523, + "mean_token_accuracy": 0.7828810214996338, + "num_tokens": 549658518.0, + "step": 21234 + }, + { + "epoch": 2.3319789150010983, + "grad_norm": 1.9289120435714722, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7613494396209717, + "num_tokens": 549686687.0, + "step": 21235 + }, + { + "epoch": 2.3320887327037116, + "grad_norm": 2.169768810272217, + "learning_rate": 5e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.764066755771637, + "num_tokens": 549712174.0, + "step": 21236 + }, + { + "epoch": 2.3321985504063254, + "grad_norm": 2.1037752628326416, + "learning_rate": 5e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.7711876034736633, + "num_tokens": 549737492.0, + "step": 21237 + }, + { + "epoch": 2.332308368108939, + "grad_norm": 2.1469104290008545, + "learning_rate": 5e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7628828883171082, + "num_tokens": 549763151.0, + "step": 21238 + }, + { + "epoch": 2.332418185811553, + "grad_norm": 1.8138318061828613, + "learning_rate": 5e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.7677913308143616, + "num_tokens": 549792677.0, + "step": 21239 + }, + { + "epoch": 2.3325280035141667, + "grad_norm": 1.9631112813949585, + "learning_rate": 5e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7512212991714478, + "num_tokens": 549819975.0, + "step": 21240 + }, + { + "epoch": 2.33263782121678, + "grad_norm": 2.4463469982147217, + "learning_rate": 5e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7705718278884888, + "num_tokens": 549838830.0, + "step": 21241 + }, + { + "epoch": 2.3327476389193937, + "grad_norm": 1.8781932592391968, + "learning_rate": 5e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7429871559143066, + "num_tokens": 549871183.0, + "step": 21242 + }, + { + "epoch": 2.3328574566220075, + "grad_norm": 2.0623741149902344, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7759512662887573, + "num_tokens": 549896502.0, + "step": 21243 + }, + { + "epoch": 2.3329672743246213, + "grad_norm": 2.2675058841705322, + "learning_rate": 5e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.7792618870735168, + "num_tokens": 549917849.0, + "step": 21244 + }, + { + "epoch": 2.333077092027235, + "grad_norm": 2.334576368331909, + "learning_rate": 5e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.7804909944534302, + "num_tokens": 549939883.0, + "step": 21245 + }, + { + "epoch": 2.3331869097298483, + "grad_norm": 2.00555682182312, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7464646697044373, + "num_tokens": 549970060.0, + "step": 21246 + }, + { + "epoch": 2.333296727432462, + "grad_norm": 1.9838473796844482, + "learning_rate": 5e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7505276203155518, + "num_tokens": 549999105.0, + "step": 21247 + }, + { + "epoch": 2.333406545135076, + "grad_norm": 2.2217373847961426, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7806861996650696, + "num_tokens": 550023811.0, + "step": 21248 + }, + { + "epoch": 2.3335163628376896, + "grad_norm": 2.0757181644439697, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7625186443328857, + "num_tokens": 550049917.0, + "step": 21249 + }, + { + "epoch": 2.333626180540303, + "grad_norm": 2.180508852005005, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7684875726699829, + "num_tokens": 550074326.0, + "step": 21250 + }, + { + "epoch": 2.3337359982429167, + "grad_norm": 2.091538667678833, + "learning_rate": 5e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.7824735641479492, + "num_tokens": 550098685.0, + "step": 21251 + }, + { + "epoch": 2.3338458159455304, + "grad_norm": 2.089569330215454, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7587305307388306, + "num_tokens": 550124509.0, + "step": 21252 + }, + { + "epoch": 2.333955633648144, + "grad_norm": 1.8685545921325684, + "learning_rate": 5e-06, + "loss": 0.6531, + "mean_token_accuracy": 0.7818987369537354, + "num_tokens": 550153094.0, + "step": 21253 + }, + { + "epoch": 2.334065451350758, + "grad_norm": 2.0870158672332764, + "learning_rate": 5e-06, + "loss": 0.6626, + "mean_token_accuracy": 0.7801481485366821, + "num_tokens": 550176604.0, + "step": 21254 + }, + { + "epoch": 2.3341752690533712, + "grad_norm": 2.0331380367279053, + "learning_rate": 5e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.7815696001052856, + "num_tokens": 550201526.0, + "step": 21255 + }, + { + "epoch": 2.334285086755985, + "grad_norm": 2.1020772457122803, + "learning_rate": 5e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7622897624969482, + "num_tokens": 550228432.0, + "step": 21256 + }, + { + "epoch": 2.3343949044585988, + "grad_norm": 2.1870827674865723, + "learning_rate": 5e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.7807292342185974, + "num_tokens": 550251551.0, + "step": 21257 + }, + { + "epoch": 2.3345047221612125, + "grad_norm": 2.2172815799713135, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7561349868774414, + "num_tokens": 550273327.0, + "step": 21258 + }, + { + "epoch": 2.334614539863826, + "grad_norm": 1.9271849393844604, + "learning_rate": 5e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7472772598266602, + "num_tokens": 550302312.0, + "step": 21259 + }, + { + "epoch": 2.3347243575664396, + "grad_norm": 2.246288299560547, + "learning_rate": 5e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7590548396110535, + "num_tokens": 550326007.0, + "step": 21260 + }, + { + "epoch": 2.3348341752690533, + "grad_norm": 1.8371565341949463, + "learning_rate": 5e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7382564544677734, + "num_tokens": 550360830.0, + "step": 21261 + }, + { + "epoch": 2.334943992971667, + "grad_norm": 2.089348554611206, + "learning_rate": 5e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7694368958473206, + "num_tokens": 550385815.0, + "step": 21262 + }, + { + "epoch": 2.335053810674281, + "grad_norm": 2.004138469696045, + "learning_rate": 5e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7579556703567505, + "num_tokens": 550413970.0, + "step": 21263 + }, + { + "epoch": 2.335163628376894, + "grad_norm": 2.016073703765869, + "learning_rate": 5e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.761954665184021, + "num_tokens": 550439458.0, + "step": 21264 + }, + { + "epoch": 2.335273446079508, + "grad_norm": 2.2021961212158203, + "learning_rate": 5e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8244404792785645, + "num_tokens": 550458810.0, + "step": 21265 + }, + { + "epoch": 2.3353832637821217, + "grad_norm": 1.9340384006500244, + "learning_rate": 5e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7494493722915649, + "num_tokens": 550490934.0, + "step": 21266 + }, + { + "epoch": 2.3354930814847354, + "grad_norm": 2.020578145980835, + "learning_rate": 5e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7629873156547546, + "num_tokens": 550519261.0, + "step": 21267 + }, + { + "epoch": 2.335602899187349, + "grad_norm": 2.027076244354248, + "learning_rate": 5e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.7763668298721313, + "num_tokens": 550545847.0, + "step": 21268 + }, + { + "epoch": 2.3357127168899625, + "grad_norm": 2.401848554611206, + "learning_rate": 5e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7569879293441772, + "num_tokens": 550566751.0, + "step": 21269 + }, + { + "epoch": 2.3358225345925763, + "grad_norm": 2.3156638145446777, + "learning_rate": 5e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.7812923192977905, + "num_tokens": 550587399.0, + "step": 21270 + }, + { + "epoch": 2.33593235229519, + "grad_norm": 2.0868470668792725, + "learning_rate": 5e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7687919735908508, + "num_tokens": 550615008.0, + "step": 21271 + }, + { + "epoch": 2.336042169997804, + "grad_norm": 2.428124189376831, + "learning_rate": 5e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7626317143440247, + "num_tokens": 550637698.0, + "step": 21272 + }, + { + "epoch": 2.3361519877004175, + "grad_norm": 2.5941505432128906, + "learning_rate": 5e-06, + "loss": 0.5938, + "mean_token_accuracy": 0.7968889474868774, + "num_tokens": 550654115.0, + "step": 21273 + }, + { + "epoch": 2.336261805403031, + "grad_norm": 2.0710289478302, + "learning_rate": 5e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7578073740005493, + "num_tokens": 550680165.0, + "step": 21274 + }, + { + "epoch": 2.3363716231056446, + "grad_norm": 2.0076658725738525, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7401247024536133, + "num_tokens": 550707835.0, + "step": 21275 + }, + { + "epoch": 2.3364814408082584, + "grad_norm": 2.273953437805176, + "learning_rate": 5e-06, + "loss": 0.6395, + "mean_token_accuracy": 0.7867634296417236, + "num_tokens": 550729478.0, + "step": 21276 + }, + { + "epoch": 2.336591258510872, + "grad_norm": 2.398094892501831, + "learning_rate": 5e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.7761597633361816, + "num_tokens": 550751079.0, + "step": 21277 + }, + { + "epoch": 2.3367010762134854, + "grad_norm": 2.4747841358184814, + "learning_rate": 5e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.77397620677948, + "num_tokens": 550774407.0, + "step": 21278 + }, + { + "epoch": 2.336810893916099, + "grad_norm": 2.154815673828125, + "learning_rate": 5e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7736793756484985, + "num_tokens": 550800753.0, + "step": 21279 + }, + { + "epoch": 2.336920711618713, + "grad_norm": 2.1132137775421143, + "learning_rate": 5e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.7776287198066711, + "num_tokens": 550828241.0, + "step": 21280 + }, + { + "epoch": 2.3370305293213267, + "grad_norm": 2.423346519470215, + "learning_rate": 5e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7518045902252197, + "num_tokens": 550850410.0, + "step": 21281 + }, + { + "epoch": 2.33714034702394, + "grad_norm": 1.9316716194152832, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7574125528335571, + "num_tokens": 550879189.0, + "step": 21282 + }, + { + "epoch": 2.337250164726554, + "grad_norm": 2.2505741119384766, + "learning_rate": 5e-06, + "loss": 0.752, + "mean_token_accuracy": 0.752496600151062, + "num_tokens": 550904049.0, + "step": 21283 + }, + { + "epoch": 2.3373599824291675, + "grad_norm": 2.191636085510254, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.759064793586731, + "num_tokens": 550930927.0, + "step": 21284 + }, + { + "epoch": 2.3374698001317813, + "grad_norm": 1.974857211112976, + "learning_rate": 5e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7676461935043335, + "num_tokens": 550956726.0, + "step": 21285 + }, + { + "epoch": 2.337579617834395, + "grad_norm": 2.1656200885772705, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7567745447158813, + "num_tokens": 550982875.0, + "step": 21286 + }, + { + "epoch": 2.3376894355370084, + "grad_norm": 2.0455377101898193, + "learning_rate": 5e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7524845004081726, + "num_tokens": 551012143.0, + "step": 21287 + }, + { + "epoch": 2.337799253239622, + "grad_norm": 1.8270463943481445, + "learning_rate": 5e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7756792306900024, + "num_tokens": 551044908.0, + "step": 21288 + }, + { + "epoch": 2.337909070942236, + "grad_norm": 2.166259288787842, + "learning_rate": 5e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7744786143302917, + "num_tokens": 551069939.0, + "step": 21289 + }, + { + "epoch": 2.3380188886448496, + "grad_norm": 2.117274761199951, + "learning_rate": 5e-06, + "loss": 0.6768, + "mean_token_accuracy": 0.7719364762306213, + "num_tokens": 551092314.0, + "step": 21290 + }, + { + "epoch": 2.3381287063474634, + "grad_norm": 1.9637985229492188, + "learning_rate": 5e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.770980954170227, + "num_tokens": 551119780.0, + "step": 21291 + }, + { + "epoch": 2.3382385240500767, + "grad_norm": 2.1176648139953613, + "learning_rate": 5e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.7875544428825378, + "num_tokens": 551144805.0, + "step": 21292 + }, + { + "epoch": 2.3383483417526905, + "grad_norm": 2.1708199977874756, + "learning_rate": 5e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.7755695581436157, + "num_tokens": 551167714.0, + "step": 21293 + }, + { + "epoch": 2.338458159455304, + "grad_norm": 2.1075680255889893, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7677320837974548, + "num_tokens": 551193504.0, + "step": 21294 + }, + { + "epoch": 2.338567977157918, + "grad_norm": 2.2053043842315674, + "learning_rate": 5e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7651596665382385, + "num_tokens": 551217749.0, + "step": 21295 + }, + { + "epoch": 2.3386777948605317, + "grad_norm": 1.8126369714736938, + "learning_rate": 5e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7569653987884521, + "num_tokens": 551248522.0, + "step": 21296 + }, + { + "epoch": 2.338787612563145, + "grad_norm": 2.0620710849761963, + "learning_rate": 5e-06, + "loss": 0.7725, + "mean_token_accuracy": 0.748456597328186, + "num_tokens": 551278159.0, + "step": 21297 + }, + { + "epoch": 2.338897430265759, + "grad_norm": 2.365454912185669, + "learning_rate": 5e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.7787624597549438, + "num_tokens": 551298139.0, + "step": 21298 + }, + { + "epoch": 2.3390072479683726, + "grad_norm": 1.9346933364868164, + "learning_rate": 5e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.755515456199646, + "num_tokens": 551331100.0, + "step": 21299 + }, + { + "epoch": 2.3391170656709863, + "grad_norm": 2.260007619857788, + "learning_rate": 5e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7698798179626465, + "num_tokens": 551354354.0, + "step": 21300 + }, + { + "epoch": 2.3392268833735996, + "grad_norm": 2.2823660373687744, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7559532523155212, + "num_tokens": 551377116.0, + "step": 21301 + }, + { + "epoch": 2.3393367010762134, + "grad_norm": 1.8547236919403076, + "learning_rate": 5e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.7889443635940552, + "num_tokens": 551403833.0, + "step": 21302 + }, + { + "epoch": 2.339446518778827, + "grad_norm": 1.8810046911239624, + "learning_rate": 5e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7629146575927734, + "num_tokens": 551436759.0, + "step": 21303 + }, + { + "epoch": 2.339556336481441, + "grad_norm": 2.016467332839966, + "learning_rate": 5e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7537816762924194, + "num_tokens": 551466014.0, + "step": 21304 + }, + { + "epoch": 2.3396661541840547, + "grad_norm": 2.1830670833587646, + "learning_rate": 5e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7713529467582703, + "num_tokens": 551490879.0, + "step": 21305 + }, + { + "epoch": 2.339775971886668, + "grad_norm": 2.155285596847534, + "learning_rate": 5e-06, + "loss": 0.6634, + "mean_token_accuracy": 0.7827277779579163, + "num_tokens": 551513937.0, + "step": 21306 + }, + { + "epoch": 2.3398857895892817, + "grad_norm": 1.9294310808181763, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7319843173027039, + "num_tokens": 551546526.0, + "step": 21307 + }, + { + "epoch": 2.3399956072918955, + "grad_norm": 2.2103044986724854, + "learning_rate": 5e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7617533206939697, + "num_tokens": 551569746.0, + "step": 21308 + }, + { + "epoch": 2.3401054249945092, + "grad_norm": 1.9714999198913574, + "learning_rate": 5e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.7818822860717773, + "num_tokens": 551597707.0, + "step": 21309 + }, + { + "epoch": 2.3402152426971226, + "grad_norm": 2.0392162799835205, + "learning_rate": 5e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7589171528816223, + "num_tokens": 551622703.0, + "step": 21310 + }, + { + "epoch": 2.3403250603997363, + "grad_norm": 1.7668547630310059, + "learning_rate": 5e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7506716847419739, + "num_tokens": 551655661.0, + "step": 21311 + }, + { + "epoch": 2.34043487810235, + "grad_norm": 2.357801914215088, + "learning_rate": 5e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7677450180053711, + "num_tokens": 551675291.0, + "step": 21312 + }, + { + "epoch": 2.340544695804964, + "grad_norm": 2.1907081604003906, + "learning_rate": 5e-06, + "loss": 0.683, + "mean_token_accuracy": 0.773918867111206, + "num_tokens": 551698542.0, + "step": 21313 + }, + { + "epoch": 2.3406545135075776, + "grad_norm": 2.529679775238037, + "learning_rate": 5e-06, + "loss": 0.5934, + "mean_token_accuracy": 0.793753981590271, + "num_tokens": 551717206.0, + "step": 21314 + }, + { + "epoch": 2.340764331210191, + "grad_norm": 1.994227647781372, + "learning_rate": 5e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7652010321617126, + "num_tokens": 551744796.0, + "step": 21315 + }, + { + "epoch": 2.3408741489128047, + "grad_norm": 2.275909423828125, + "learning_rate": 5e-06, + "loss": 0.6568, + "mean_token_accuracy": 0.7801326513290405, + "num_tokens": 551767685.0, + "step": 21316 + }, + { + "epoch": 2.3409839666154184, + "grad_norm": 2.0569067001342773, + "learning_rate": 5e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7440637350082397, + "num_tokens": 551797553.0, + "step": 21317 + }, + { + "epoch": 2.341093784318032, + "grad_norm": 2.204315185546875, + "learning_rate": 5e-06, + "loss": 0.6179, + "mean_token_accuracy": 0.798015832901001, + "num_tokens": 551820051.0, + "step": 21318 + }, + { + "epoch": 2.341203602020646, + "grad_norm": 2.1262359619140625, + "learning_rate": 5e-06, + "loss": 0.665, + "mean_token_accuracy": 0.7835360169410706, + "num_tokens": 551845382.0, + "step": 21319 + }, + { + "epoch": 2.3413134197232592, + "grad_norm": 2.151733636856079, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.773666501045227, + "num_tokens": 551872293.0, + "step": 21320 + }, + { + "epoch": 2.341423237425873, + "grad_norm": 1.884841799736023, + "learning_rate": 5e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7624562978744507, + "num_tokens": 551901656.0, + "step": 21321 + }, + { + "epoch": 2.3415330551284868, + "grad_norm": 1.8779339790344238, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7519034147262573, + "num_tokens": 551931860.0, + "step": 21322 + }, + { + "epoch": 2.3416428728311005, + "grad_norm": 2.3525848388671875, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7567138075828552, + "num_tokens": 551955214.0, + "step": 21323 + }, + { + "epoch": 2.3417526905337143, + "grad_norm": 2.0447750091552734, + "learning_rate": 5e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7732632756233215, + "num_tokens": 551978837.0, + "step": 21324 + }, + { + "epoch": 2.3418625082363276, + "grad_norm": 2.415268898010254, + "learning_rate": 5e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.7773864269256592, + "num_tokens": 552000509.0, + "step": 21325 + }, + { + "epoch": 2.3419723259389413, + "grad_norm": 2.401986598968506, + "learning_rate": 5e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7742795348167419, + "num_tokens": 552021064.0, + "step": 21326 + }, + { + "epoch": 2.342082143641555, + "grad_norm": 2.3068783283233643, + "learning_rate": 5e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.7746555805206299, + "num_tokens": 552044125.0, + "step": 21327 + }, + { + "epoch": 2.342191961344169, + "grad_norm": 1.9796990156173706, + "learning_rate": 5e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.752027690410614, + "num_tokens": 552071414.0, + "step": 21328 + }, + { + "epoch": 2.342301779046782, + "grad_norm": 2.204925298690796, + "learning_rate": 5e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7534224987030029, + "num_tokens": 552093278.0, + "step": 21329 + }, + { + "epoch": 2.342411596749396, + "grad_norm": 2.429753065109253, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7543256282806396, + "num_tokens": 552114877.0, + "step": 21330 + }, + { + "epoch": 2.3425214144520097, + "grad_norm": 2.2056190967559814, + "learning_rate": 5e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7631981372833252, + "num_tokens": 552140216.0, + "step": 21331 + }, + { + "epoch": 2.3426312321546234, + "grad_norm": 2.1996660232543945, + "learning_rate": 5e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.7812105417251587, + "num_tokens": 552162915.0, + "step": 21332 + }, + { + "epoch": 2.3427410498572367, + "grad_norm": 2.009455442428589, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7554641962051392, + "num_tokens": 552192545.0, + "step": 21333 + }, + { + "epoch": 2.3428508675598505, + "grad_norm": 2.0910468101501465, + "learning_rate": 5e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.76772540807724, + "num_tokens": 552216914.0, + "step": 21334 + }, + { + "epoch": 2.3429606852624643, + "grad_norm": 2.1668293476104736, + "learning_rate": 5e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.7736436128616333, + "num_tokens": 552239450.0, + "step": 21335 + }, + { + "epoch": 2.343070502965078, + "grad_norm": 2.0485479831695557, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7547237873077393, + "num_tokens": 552267713.0, + "step": 21336 + }, + { + "epoch": 2.3431803206676918, + "grad_norm": 2.165041208267212, + "learning_rate": 5e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7542750835418701, + "num_tokens": 552296643.0, + "step": 21337 + }, + { + "epoch": 2.343290138370305, + "grad_norm": 1.9140839576721191, + "learning_rate": 5e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.76666259765625, + "num_tokens": 552326668.0, + "step": 21338 + }, + { + "epoch": 2.343399956072919, + "grad_norm": 2.0142698287963867, + "learning_rate": 5e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7762318849563599, + "num_tokens": 552353866.0, + "step": 21339 + }, + { + "epoch": 2.3435097737755326, + "grad_norm": 2.3176400661468506, + "learning_rate": 5e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.7697454690933228, + "num_tokens": 552377106.0, + "step": 21340 + }, + { + "epoch": 2.3436195914781464, + "grad_norm": 2.082123279571533, + "learning_rate": 5e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.7881320118904114, + "num_tokens": 552402820.0, + "step": 21341 + }, + { + "epoch": 2.34372940918076, + "grad_norm": 2.053966522216797, + "learning_rate": 5e-06, + "loss": 0.5927, + "mean_token_accuracy": 0.8048524856567383, + "num_tokens": 552427730.0, + "step": 21342 + }, + { + "epoch": 2.3438392268833734, + "grad_norm": 2.0919673442840576, + "learning_rate": 5e-06, + "loss": 0.654, + "mean_token_accuracy": 0.7864202260971069, + "num_tokens": 552453567.0, + "step": 21343 + }, + { + "epoch": 2.343949044585987, + "grad_norm": 1.9113662242889404, + "learning_rate": 5e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.7676176428794861, + "num_tokens": 552486720.0, + "step": 21344 + }, + { + "epoch": 2.344058862288601, + "grad_norm": 2.2432701587677, + "learning_rate": 5e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7690021991729736, + "num_tokens": 552513136.0, + "step": 21345 + }, + { + "epoch": 2.3441686799912147, + "grad_norm": 2.25614070892334, + "learning_rate": 5e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7653815746307373, + "num_tokens": 552539519.0, + "step": 21346 + }, + { + "epoch": 2.3442784976938285, + "grad_norm": 2.1263856887817383, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7698513269424438, + "num_tokens": 552564447.0, + "step": 21347 + }, + { + "epoch": 2.3443883153964418, + "grad_norm": 1.8659467697143555, + "learning_rate": 5e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.76276695728302, + "num_tokens": 552595727.0, + "step": 21348 + }, + { + "epoch": 2.3444981330990555, + "grad_norm": 1.9607542753219604, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7701754570007324, + "num_tokens": 552624874.0, + "step": 21349 + }, + { + "epoch": 2.3446079508016693, + "grad_norm": 2.041084051132202, + "learning_rate": 5e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.7843471765518188, + "num_tokens": 552650431.0, + "step": 21350 + }, + { + "epoch": 2.344717768504283, + "grad_norm": 2.046980619430542, + "learning_rate": 5e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7651423215866089, + "num_tokens": 552676585.0, + "step": 21351 + }, + { + "epoch": 2.344827586206897, + "grad_norm": 2.2345852851867676, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7639102339744568, + "num_tokens": 552701738.0, + "step": 21352 + }, + { + "epoch": 2.34493740390951, + "grad_norm": 2.177734851837158, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.756159782409668, + "num_tokens": 552725784.0, + "step": 21353 + }, + { + "epoch": 2.345047221612124, + "grad_norm": 2.145601272583008, + "learning_rate": 5e-06, + "loss": 0.6232, + "mean_token_accuracy": 0.7904354333877563, + "num_tokens": 552749695.0, + "step": 21354 + }, + { + "epoch": 2.3451570393147376, + "grad_norm": 2.125609874725342, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7698343396186829, + "num_tokens": 552773470.0, + "step": 21355 + }, + { + "epoch": 2.3452668570173514, + "grad_norm": 2.3153154850006104, + "learning_rate": 5e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7749472856521606, + "num_tokens": 552794851.0, + "step": 21356 + }, + { + "epoch": 2.3453766747199647, + "grad_norm": 2.1102399826049805, + "learning_rate": 5e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.7697756290435791, + "num_tokens": 552820769.0, + "step": 21357 + }, + { + "epoch": 2.3454864924225785, + "grad_norm": 2.104511022567749, + "learning_rate": 5e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7508480548858643, + "num_tokens": 552850879.0, + "step": 21358 + }, + { + "epoch": 2.345596310125192, + "grad_norm": 2.138155937194824, + "learning_rate": 5e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7616573572158813, + "num_tokens": 552875914.0, + "step": 21359 + }, + { + "epoch": 2.345706127827806, + "grad_norm": 1.8716641664505005, + "learning_rate": 5e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.748464047908783, + "num_tokens": 552907555.0, + "step": 21360 + }, + { + "epoch": 2.3458159455304193, + "grad_norm": 2.194887161254883, + "learning_rate": 5e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7510466575622559, + "num_tokens": 552934209.0, + "step": 21361 + }, + { + "epoch": 2.345925763233033, + "grad_norm": 2.0522117614746094, + "learning_rate": 5e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7780136466026306, + "num_tokens": 552961267.0, + "step": 21362 + }, + { + "epoch": 2.346035580935647, + "grad_norm": 2.272956609725952, + "learning_rate": 5e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7678072452545166, + "num_tokens": 552981988.0, + "step": 21363 + }, + { + "epoch": 2.3461453986382605, + "grad_norm": 1.9997873306274414, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7758521437644958, + "num_tokens": 553010534.0, + "step": 21364 + }, + { + "epoch": 2.3462552163408743, + "grad_norm": 2.052412509918213, + "learning_rate": 5e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7466771602630615, + "num_tokens": 553037485.0, + "step": 21365 + }, + { + "epoch": 2.3463650340434876, + "grad_norm": 1.9646121263504028, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7532965540885925, + "num_tokens": 553066428.0, + "step": 21366 + }, + { + "epoch": 2.3464748517461014, + "grad_norm": 2.1701443195343018, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7746834754943848, + "num_tokens": 553092684.0, + "step": 21367 + }, + { + "epoch": 2.346584669448715, + "grad_norm": 2.1937053203582764, + "learning_rate": 5e-06, + "loss": 0.645, + "mean_token_accuracy": 0.7817159295082092, + "num_tokens": 553114829.0, + "step": 21368 + }, + { + "epoch": 2.346694487151329, + "grad_norm": 2.1895177364349365, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7738660573959351, + "num_tokens": 553139757.0, + "step": 21369 + }, + { + "epoch": 2.3468043048539426, + "grad_norm": 2.0382883548736572, + "learning_rate": 5e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7457234859466553, + "num_tokens": 553170891.0, + "step": 21370 + }, + { + "epoch": 2.346914122556556, + "grad_norm": 2.358591318130493, + "learning_rate": 5e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7675371170043945, + "num_tokens": 553193769.0, + "step": 21371 + }, + { + "epoch": 2.3470239402591697, + "grad_norm": 1.9499213695526123, + "learning_rate": 5e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7769555449485779, + "num_tokens": 553222260.0, + "step": 21372 + }, + { + "epoch": 2.3471337579617835, + "grad_norm": 2.157623052597046, + "learning_rate": 5e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7352918386459351, + "num_tokens": 553250090.0, + "step": 21373 + }, + { + "epoch": 2.3472435756643972, + "grad_norm": 1.9793349504470825, + "learning_rate": 5e-06, + "loss": 0.7065, + "mean_token_accuracy": 0.7661150097846985, + "num_tokens": 553276692.0, + "step": 21374 + }, + { + "epoch": 2.347353393367011, + "grad_norm": 2.3448495864868164, + "learning_rate": 5e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7594537138938904, + "num_tokens": 553297800.0, + "step": 21375 + }, + { + "epoch": 2.3474632110696243, + "grad_norm": 2.127715826034546, + "learning_rate": 5e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7775940895080566, + "num_tokens": 553322306.0, + "step": 21376 + }, + { + "epoch": 2.347573028772238, + "grad_norm": 2.357844591140747, + "learning_rate": 5e-06, + "loss": 0.6762, + "mean_token_accuracy": 0.7867414951324463, + "num_tokens": 553345492.0, + "step": 21377 + }, + { + "epoch": 2.347682846474852, + "grad_norm": 2.3256595134735107, + "learning_rate": 5e-06, + "loss": 0.6492, + "mean_token_accuracy": 0.7792876958847046, + "num_tokens": 553365590.0, + "step": 21378 + }, + { + "epoch": 2.3477926641774656, + "grad_norm": 2.0249195098876953, + "learning_rate": 5e-06, + "loss": 0.6787, + "mean_token_accuracy": 0.7701758146286011, + "num_tokens": 553391621.0, + "step": 21379 + }, + { + "epoch": 2.347902481880079, + "grad_norm": 2.162522792816162, + "learning_rate": 5e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7653371095657349, + "num_tokens": 553416387.0, + "step": 21380 + }, + { + "epoch": 2.3480122995826926, + "grad_norm": 2.1329071521759033, + "learning_rate": 5e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.7832679748535156, + "num_tokens": 553441525.0, + "step": 21381 + }, + { + "epoch": 2.3481221172853064, + "grad_norm": 2.1403307914733887, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7589249014854431, + "num_tokens": 553465038.0, + "step": 21382 + }, + { + "epoch": 2.34823193498792, + "grad_norm": 2.401883125305176, + "learning_rate": 5e-06, + "loss": 0.587, + "mean_token_accuracy": 0.8099817037582397, + "num_tokens": 553482834.0, + "step": 21383 + }, + { + "epoch": 2.3483417526905335, + "grad_norm": 1.9446548223495483, + "learning_rate": 5e-06, + "loss": 0.659, + "mean_token_accuracy": 0.7781250476837158, + "num_tokens": 553512226.0, + "step": 21384 + }, + { + "epoch": 2.3484515703931472, + "grad_norm": 2.5214908123016357, + "learning_rate": 5e-06, + "loss": 0.6548, + "mean_token_accuracy": 0.7836163640022278, + "num_tokens": 553530050.0, + "step": 21385 + }, + { + "epoch": 2.348561388095761, + "grad_norm": 2.166961431503296, + "learning_rate": 5e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7678216695785522, + "num_tokens": 553554382.0, + "step": 21386 + }, + { + "epoch": 2.3486712057983747, + "grad_norm": 2.177293062210083, + "learning_rate": 5e-06, + "loss": 0.6981, + "mean_token_accuracy": 0.7678281664848328, + "num_tokens": 553579892.0, + "step": 21387 + }, + { + "epoch": 2.3487810235009885, + "grad_norm": 1.9994348287582397, + "learning_rate": 5e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7569872140884399, + "num_tokens": 553606950.0, + "step": 21388 + }, + { + "epoch": 2.348890841203602, + "grad_norm": 2.160224676132202, + "learning_rate": 5e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7681427001953125, + "num_tokens": 553632532.0, + "step": 21389 + }, + { + "epoch": 2.3490006589062156, + "grad_norm": 2.029574155807495, + "learning_rate": 5e-06, + "loss": 0.699, + "mean_token_accuracy": 0.7736471891403198, + "num_tokens": 553660951.0, + "step": 21390 + }, + { + "epoch": 2.3491104766088293, + "grad_norm": 1.9135401248931885, + "learning_rate": 5e-06, + "loss": 0.8198, + "mean_token_accuracy": 0.7386685609817505, + "num_tokens": 553689440.0, + "step": 21391 + }, + { + "epoch": 2.349220294311443, + "grad_norm": 2.0638954639434814, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.762662947177887, + "num_tokens": 553715678.0, + "step": 21392 + }, + { + "epoch": 2.349330112014057, + "grad_norm": 2.0004823207855225, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7689576148986816, + "num_tokens": 553741742.0, + "step": 21393 + }, + { + "epoch": 2.34943992971667, + "grad_norm": 1.8304591178894043, + "learning_rate": 5e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7347406148910522, + "num_tokens": 553775567.0, + "step": 21394 + }, + { + "epoch": 2.349549747419284, + "grad_norm": 2.28017258644104, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.781259298324585, + "num_tokens": 553798037.0, + "step": 21395 + }, + { + "epoch": 2.3496595651218977, + "grad_norm": 2.277353286743164, + "learning_rate": 5e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7550524473190308, + "num_tokens": 553824433.0, + "step": 21396 + }, + { + "epoch": 2.3497693828245114, + "grad_norm": 2.1617305278778076, + "learning_rate": 5e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7643451690673828, + "num_tokens": 553846881.0, + "step": 21397 + }, + { + "epoch": 2.349879200527125, + "grad_norm": 2.0309786796569824, + "learning_rate": 5e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7619264125823975, + "num_tokens": 553874576.0, + "step": 21398 + }, + { + "epoch": 2.3499890182297385, + "grad_norm": 1.959702491760254, + "learning_rate": 5e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7480752468109131, + "num_tokens": 553902319.0, + "step": 21399 + }, + { + "epoch": 2.3500988359323522, + "grad_norm": 2.0155882835388184, + "learning_rate": 5e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7506923079490662, + "num_tokens": 553930807.0, + "step": 21400 + }, + { + "epoch": 2.350208653634966, + "grad_norm": 2.0256385803222656, + "learning_rate": 5e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7481207847595215, + "num_tokens": 553960957.0, + "step": 21401 + }, + { + "epoch": 2.3503184713375798, + "grad_norm": 2.2032077312469482, + "learning_rate": 5e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.7768706679344177, + "num_tokens": 553985831.0, + "step": 21402 + }, + { + "epoch": 2.3504282890401935, + "grad_norm": 1.8388645648956299, + "learning_rate": 5e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7711943984031677, + "num_tokens": 554017615.0, + "step": 21403 + }, + { + "epoch": 2.350538106742807, + "grad_norm": 2.0872550010681152, + "learning_rate": 5e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7546994090080261, + "num_tokens": 554043594.0, + "step": 21404 + }, + { + "epoch": 2.3506479244454206, + "grad_norm": 2.081300735473633, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7602658271789551, + "num_tokens": 554068849.0, + "step": 21405 + }, + { + "epoch": 2.3507577421480343, + "grad_norm": 2.1966183185577393, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7622557878494263, + "num_tokens": 554093096.0, + "step": 21406 + }, + { + "epoch": 2.350867559850648, + "grad_norm": 2.249596118927002, + "learning_rate": 5e-06, + "loss": 0.6709, + "mean_token_accuracy": 0.7756602764129639, + "num_tokens": 554115721.0, + "step": 21407 + }, + { + "epoch": 2.3509773775532614, + "grad_norm": 2.181847333908081, + "learning_rate": 5e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7669394612312317, + "num_tokens": 554139555.0, + "step": 21408 + }, + { + "epoch": 2.351087195255875, + "grad_norm": 1.9088550806045532, + "learning_rate": 5e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7659062147140503, + "num_tokens": 554171059.0, + "step": 21409 + }, + { + "epoch": 2.351197012958489, + "grad_norm": 2.076586961746216, + "learning_rate": 5e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7502330541610718, + "num_tokens": 554198351.0, + "step": 21410 + }, + { + "epoch": 2.3513068306611027, + "grad_norm": 1.9347035884857178, + "learning_rate": 5e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.743558406829834, + "num_tokens": 554228979.0, + "step": 21411 + }, + { + "epoch": 2.351416648363716, + "grad_norm": 2.173776388168335, + "learning_rate": 5e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.7842228412628174, + "num_tokens": 554252498.0, + "step": 21412 + }, + { + "epoch": 2.3515264660663298, + "grad_norm": 2.1545331478118896, + "learning_rate": 5e-06, + "loss": 0.6021, + "mean_token_accuracy": 0.7934312224388123, + "num_tokens": 554274248.0, + "step": 21413 + }, + { + "epoch": 2.3516362837689435, + "grad_norm": 2.0589725971221924, + "learning_rate": 5e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.7716103196144104, + "num_tokens": 554300321.0, + "step": 21414 + }, + { + "epoch": 2.3517461014715573, + "grad_norm": 2.097102403640747, + "learning_rate": 5e-06, + "loss": 0.6383, + "mean_token_accuracy": 0.7834337949752808, + "num_tokens": 554327228.0, + "step": 21415 + }, + { + "epoch": 2.351855919174171, + "grad_norm": 2.249936819076538, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7816572189331055, + "num_tokens": 554348902.0, + "step": 21416 + }, + { + "epoch": 2.3519657368767843, + "grad_norm": 2.111497163772583, + "learning_rate": 5e-06, + "loss": 0.6235, + "mean_token_accuracy": 0.7910320162773132, + "num_tokens": 554372056.0, + "step": 21417 + }, + { + "epoch": 2.352075554579398, + "grad_norm": 2.173835277557373, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7722928524017334, + "num_tokens": 554396159.0, + "step": 21418 + }, + { + "epoch": 2.352185372282012, + "grad_norm": 2.451566696166992, + "learning_rate": 5e-06, + "loss": 0.573, + "mean_token_accuracy": 0.8013342022895813, + "num_tokens": 554413263.0, + "step": 21419 + }, + { + "epoch": 2.3522951899846256, + "grad_norm": 2.1719601154327393, + "learning_rate": 5e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.772895336151123, + "num_tokens": 554436322.0, + "step": 21420 + }, + { + "epoch": 2.3524050076872394, + "grad_norm": 2.212022542953491, + "learning_rate": 5e-06, + "loss": 0.7045, + "mean_token_accuracy": 0.7775290608406067, + "num_tokens": 554458972.0, + "step": 21421 + }, + { + "epoch": 2.3525148253898527, + "grad_norm": 1.9019393920898438, + "learning_rate": 5e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7448080778121948, + "num_tokens": 554488949.0, + "step": 21422 + }, + { + "epoch": 2.3526246430924664, + "grad_norm": 2.038255453109741, + "learning_rate": 5e-06, + "loss": 0.6875, + "mean_token_accuracy": 0.7779786586761475, + "num_tokens": 554514309.0, + "step": 21423 + }, + { + "epoch": 2.35273446079508, + "grad_norm": 2.2161126136779785, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7687386274337769, + "num_tokens": 554539962.0, + "step": 21424 + }, + { + "epoch": 2.352844278497694, + "grad_norm": 2.0587782859802246, + "learning_rate": 5e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7660175561904907, + "num_tokens": 554568780.0, + "step": 21425 + }, + { + "epoch": 2.3529540962003077, + "grad_norm": 1.802535057067871, + "learning_rate": 5e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7615344524383545, + "num_tokens": 554603620.0, + "step": 21426 + }, + { + "epoch": 2.353063913902921, + "grad_norm": 2.083561658859253, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7535557746887207, + "num_tokens": 554634549.0, + "step": 21427 + }, + { + "epoch": 2.353173731605535, + "grad_norm": 2.2428085803985596, + "learning_rate": 5e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7675108909606934, + "num_tokens": 554659605.0, + "step": 21428 + }, + { + "epoch": 2.3532835493081485, + "grad_norm": 2.2037031650543213, + "learning_rate": 5e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.7660205364227295, + "num_tokens": 554685262.0, + "step": 21429 + }, + { + "epoch": 2.3533933670107623, + "grad_norm": 2.1003763675689697, + "learning_rate": 5e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.7934175729751587, + "num_tokens": 554710716.0, + "step": 21430 + }, + { + "epoch": 2.3535031847133756, + "grad_norm": 1.9236007928848267, + "learning_rate": 5e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.7762775421142578, + "num_tokens": 554739015.0, + "step": 21431 + }, + { + "epoch": 2.3536130024159894, + "grad_norm": 2.056489944458008, + "learning_rate": 5e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7456655502319336, + "num_tokens": 554769769.0, + "step": 21432 + }, + { + "epoch": 2.353722820118603, + "grad_norm": 2.334721803665161, + "learning_rate": 5e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7666575908660889, + "num_tokens": 554793049.0, + "step": 21433 + }, + { + "epoch": 2.353832637821217, + "grad_norm": 1.9351375102996826, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7575451135635376, + "num_tokens": 554821321.0, + "step": 21434 + }, + { + "epoch": 2.3539424555238306, + "grad_norm": 1.8764855861663818, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7477099895477295, + "num_tokens": 554854902.0, + "step": 21435 + }, + { + "epoch": 2.354052273226444, + "grad_norm": 1.9701443910598755, + "learning_rate": 5e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.737845242023468, + "num_tokens": 554885154.0, + "step": 21436 + }, + { + "epoch": 2.3541620909290577, + "grad_norm": 2.020578384399414, + "learning_rate": 5e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.7751060724258423, + "num_tokens": 554911889.0, + "step": 21437 + }, + { + "epoch": 2.3542719086316715, + "grad_norm": 1.9264771938323975, + "learning_rate": 5e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7512784600257874, + "num_tokens": 554942889.0, + "step": 21438 + }, + { + "epoch": 2.354381726334285, + "grad_norm": 1.9690768718719482, + "learning_rate": 5e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7449227571487427, + "num_tokens": 554972778.0, + "step": 21439 + }, + { + "epoch": 2.3544915440368985, + "grad_norm": 2.0665557384490967, + "learning_rate": 5e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7523421049118042, + "num_tokens": 554997592.0, + "step": 21440 + }, + { + "epoch": 2.3546013617395123, + "grad_norm": 2.3089263439178467, + "learning_rate": 5e-06, + "loss": 0.683, + "mean_token_accuracy": 0.7706454992294312, + "num_tokens": 555020100.0, + "step": 21441 + }, + { + "epoch": 2.354711179442126, + "grad_norm": 2.0989856719970703, + "learning_rate": 5e-06, + "loss": 0.6712, + "mean_token_accuracy": 0.7744897603988647, + "num_tokens": 555046316.0, + "step": 21442 + }, + { + "epoch": 2.35482099714474, + "grad_norm": 1.9781173467636108, + "learning_rate": 5e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7383525371551514, + "num_tokens": 555075190.0, + "step": 21443 + }, + { + "epoch": 2.3549308148473536, + "grad_norm": 2.1192402839660645, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7651252746582031, + "num_tokens": 555102439.0, + "step": 21444 + }, + { + "epoch": 2.355040632549967, + "grad_norm": 2.2899186611175537, + "learning_rate": 5e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7645139098167419, + "num_tokens": 555127176.0, + "step": 21445 + }, + { + "epoch": 2.3551504502525806, + "grad_norm": 1.8698995113372803, + "learning_rate": 5e-06, + "loss": 0.7306, + "mean_token_accuracy": 0.7575889825820923, + "num_tokens": 555159220.0, + "step": 21446 + }, + { + "epoch": 2.3552602679551944, + "grad_norm": 1.9754493236541748, + "learning_rate": 5e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.7813554406166077, + "num_tokens": 555185638.0, + "step": 21447 + }, + { + "epoch": 2.355370085657808, + "grad_norm": 2.0896313190460205, + "learning_rate": 5e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7520735263824463, + "num_tokens": 555213153.0, + "step": 21448 + }, + { + "epoch": 2.355479903360422, + "grad_norm": 2.1382791996002197, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7707732915878296, + "num_tokens": 555236370.0, + "step": 21449 + }, + { + "epoch": 2.355589721063035, + "grad_norm": 2.252692699432373, + "learning_rate": 5e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.7784075736999512, + "num_tokens": 555258341.0, + "step": 21450 + }, + { + "epoch": 2.355699538765649, + "grad_norm": 1.9287010431289673, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7640340924263, + "num_tokens": 555285893.0, + "step": 21451 + }, + { + "epoch": 2.3558093564682627, + "grad_norm": 1.9586628675460815, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.755834698677063, + "num_tokens": 555315404.0, + "step": 21452 + }, + { + "epoch": 2.3559191741708765, + "grad_norm": 2.03652286529541, + "learning_rate": 5e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7636733055114746, + "num_tokens": 555343899.0, + "step": 21453 + }, + { + "epoch": 2.3560289918734902, + "grad_norm": 2.351783514022827, + "learning_rate": 5e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7746495008468628, + "num_tokens": 555366480.0, + "step": 21454 + }, + { + "epoch": 2.3561388095761036, + "grad_norm": 2.0043671131134033, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7633646130561829, + "num_tokens": 555393725.0, + "step": 21455 + }, + { + "epoch": 2.3562486272787173, + "grad_norm": 2.1147451400756836, + "learning_rate": 5e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7760401964187622, + "num_tokens": 555417121.0, + "step": 21456 + }, + { + "epoch": 2.356358444981331, + "grad_norm": 2.0435333251953125, + "learning_rate": 5e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7599508762359619, + "num_tokens": 555445531.0, + "step": 21457 + }, + { + "epoch": 2.356468262683945, + "grad_norm": 2.174492597579956, + "learning_rate": 5e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.7794378399848938, + "num_tokens": 555469644.0, + "step": 21458 + }, + { + "epoch": 2.356578080386558, + "grad_norm": 2.284870147705078, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7699379920959473, + "num_tokens": 555493214.0, + "step": 21459 + }, + { + "epoch": 2.356687898089172, + "grad_norm": 2.502178192138672, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7638506889343262, + "num_tokens": 555516026.0, + "step": 21460 + }, + { + "epoch": 2.3567977157917857, + "grad_norm": 1.9129753112792969, + "learning_rate": 5e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7638811469078064, + "num_tokens": 555548315.0, + "step": 21461 + }, + { + "epoch": 2.3569075334943994, + "grad_norm": 2.2486422061920166, + "learning_rate": 5e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.7788740396499634, + "num_tokens": 555570836.0, + "step": 21462 + }, + { + "epoch": 2.3570173511970127, + "grad_norm": 2.01798415184021, + "learning_rate": 5e-06, + "loss": 0.7248, + "mean_token_accuracy": 0.7578676342964172, + "num_tokens": 555599264.0, + "step": 21463 + }, + { + "epoch": 2.3571271688996265, + "grad_norm": 2.118182420730591, + "learning_rate": 5e-06, + "loss": 0.6477, + "mean_token_accuracy": 0.7875785827636719, + "num_tokens": 555624312.0, + "step": 21464 + }, + { + "epoch": 2.3572369866022402, + "grad_norm": 1.9088953733444214, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7510532140731812, + "num_tokens": 555656143.0, + "step": 21465 + }, + { + "epoch": 2.357346804304854, + "grad_norm": 2.3661367893218994, + "learning_rate": 5e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7750202417373657, + "num_tokens": 555679879.0, + "step": 21466 + }, + { + "epoch": 2.3574566220074678, + "grad_norm": 1.8633673191070557, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.770454466342926, + "num_tokens": 555712302.0, + "step": 21467 + }, + { + "epoch": 2.357566439710081, + "grad_norm": 2.294095277786255, + "learning_rate": 5e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.7866300344467163, + "num_tokens": 555734885.0, + "step": 21468 + }, + { + "epoch": 2.357676257412695, + "grad_norm": 2.229167938232422, + "learning_rate": 5e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7636212110519409, + "num_tokens": 555760826.0, + "step": 21469 + }, + { + "epoch": 2.3577860751153086, + "grad_norm": 2.108640670776367, + "learning_rate": 5e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7584356665611267, + "num_tokens": 555787356.0, + "step": 21470 + }, + { + "epoch": 2.3578958928179223, + "grad_norm": 2.124870538711548, + "learning_rate": 5e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.7909737825393677, + "num_tokens": 555814765.0, + "step": 21471 + }, + { + "epoch": 2.358005710520536, + "grad_norm": 2.2923810482025146, + "learning_rate": 5e-06, + "loss": 0.6463, + "mean_token_accuracy": 0.7858202457427979, + "num_tokens": 555838186.0, + "step": 21472 + }, + { + "epoch": 2.3581155282231494, + "grad_norm": 1.954318642616272, + "learning_rate": 5e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.757422685623169, + "num_tokens": 555866141.0, + "step": 21473 + }, + { + "epoch": 2.358225345925763, + "grad_norm": 2.029634714126587, + "learning_rate": 5e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7626979351043701, + "num_tokens": 555891135.0, + "step": 21474 + }, + { + "epoch": 2.358335163628377, + "grad_norm": 2.064927816390991, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.76099693775177, + "num_tokens": 555918670.0, + "step": 21475 + }, + { + "epoch": 2.3584449813309907, + "grad_norm": 2.3216452598571777, + "learning_rate": 5e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7520062923431396, + "num_tokens": 555941283.0, + "step": 21476 + }, + { + "epoch": 2.3585547990336044, + "grad_norm": 1.796541690826416, + "learning_rate": 5e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.7368535399436951, + "num_tokens": 555976081.0, + "step": 21477 + }, + { + "epoch": 2.3586646167362177, + "grad_norm": 2.347541332244873, + "learning_rate": 5e-06, + "loss": 0.5845, + "mean_token_accuracy": 0.8009601831436157, + "num_tokens": 555994839.0, + "step": 21478 + }, + { + "epoch": 2.3587744344388315, + "grad_norm": 2.0399978160858154, + "learning_rate": 5e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.763005793094635, + "num_tokens": 556024261.0, + "step": 21479 + }, + { + "epoch": 2.3588842521414453, + "grad_norm": 1.8951491117477417, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7720156311988831, + "num_tokens": 556051857.0, + "step": 21480 + }, + { + "epoch": 2.358994069844059, + "grad_norm": 2.3226876258850098, + "learning_rate": 5e-06, + "loss": 0.662, + "mean_token_accuracy": 0.7773391604423523, + "num_tokens": 556072447.0, + "step": 21481 + }, + { + "epoch": 2.3591038875466728, + "grad_norm": 2.0441882610321045, + "learning_rate": 5e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.7513176202774048, + "num_tokens": 556100522.0, + "step": 21482 + }, + { + "epoch": 2.359213705249286, + "grad_norm": 2.2688417434692383, + "learning_rate": 5e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7547279596328735, + "num_tokens": 556124776.0, + "step": 21483 + }, + { + "epoch": 2.3593235229519, + "grad_norm": 2.029690742492676, + "learning_rate": 5e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7615057826042175, + "num_tokens": 556152630.0, + "step": 21484 + }, + { + "epoch": 2.3594333406545136, + "grad_norm": 2.412548780441284, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7776082754135132, + "num_tokens": 556173782.0, + "step": 21485 + }, + { + "epoch": 2.3595431583571274, + "grad_norm": 2.110030174255371, + "learning_rate": 5e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7395546436309814, + "num_tokens": 556202279.0, + "step": 21486 + }, + { + "epoch": 2.3596529760597407, + "grad_norm": 2.03033709526062, + "learning_rate": 5e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.788642168045044, + "num_tokens": 556227684.0, + "step": 21487 + }, + { + "epoch": 2.3597627937623544, + "grad_norm": 2.34702467918396, + "learning_rate": 5e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7673903107643127, + "num_tokens": 556250265.0, + "step": 21488 + }, + { + "epoch": 2.359872611464968, + "grad_norm": 2.007774591445923, + "learning_rate": 5e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.7579483985900879, + "num_tokens": 556277986.0, + "step": 21489 + }, + { + "epoch": 2.359982429167582, + "grad_norm": 2.0457286834716797, + "learning_rate": 5e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7669966220855713, + "num_tokens": 556302264.0, + "step": 21490 + }, + { + "epoch": 2.3600922468701953, + "grad_norm": 1.9594645500183105, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7571250796318054, + "num_tokens": 556331786.0, + "step": 21491 + }, + { + "epoch": 2.360202064572809, + "grad_norm": 1.9466445446014404, + "learning_rate": 5e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7626397609710693, + "num_tokens": 556361633.0, + "step": 21492 + }, + { + "epoch": 2.3603118822754228, + "grad_norm": 1.9490705728530884, + "learning_rate": 5e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7366145253181458, + "num_tokens": 556392333.0, + "step": 21493 + }, + { + "epoch": 2.3604216999780365, + "grad_norm": 2.230111598968506, + "learning_rate": 5e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.779291033744812, + "num_tokens": 556415488.0, + "step": 21494 + }, + { + "epoch": 2.3605315176806503, + "grad_norm": 2.007457733154297, + "learning_rate": 5e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.7838736772537231, + "num_tokens": 556440776.0, + "step": 21495 + }, + { + "epoch": 2.3606413353832636, + "grad_norm": 2.027038812637329, + "learning_rate": 5e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7700790166854858, + "num_tokens": 556465686.0, + "step": 21496 + }, + { + "epoch": 2.3607511530858774, + "grad_norm": 2.244807243347168, + "learning_rate": 5e-06, + "loss": 0.6606, + "mean_token_accuracy": 0.7877991199493408, + "num_tokens": 556489146.0, + "step": 21497 + }, + { + "epoch": 2.360860970788491, + "grad_norm": 2.3037943840026855, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7629861235618591, + "num_tokens": 556510061.0, + "step": 21498 + }, + { + "epoch": 2.360970788491105, + "grad_norm": 2.225965976715088, + "learning_rate": 5e-06, + "loss": 0.6207, + "mean_token_accuracy": 0.7953165769577026, + "num_tokens": 556532315.0, + "step": 21499 + }, + { + "epoch": 2.3610806061937186, + "grad_norm": 2.1123569011688232, + "learning_rate": 5e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7727562189102173, + "num_tokens": 556556413.0, + "step": 21500 + }, + { + "epoch": 2.361190423896332, + "grad_norm": 1.9304046630859375, + "learning_rate": 5e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7565209269523621, + "num_tokens": 556585331.0, + "step": 21501 + }, + { + "epoch": 2.3613002415989457, + "grad_norm": 2.256845474243164, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7708048820495605, + "num_tokens": 556608673.0, + "step": 21502 + }, + { + "epoch": 2.3614100593015594, + "grad_norm": 2.278228282928467, + "learning_rate": 5e-06, + "loss": 0.6671, + "mean_token_accuracy": 0.7759610414505005, + "num_tokens": 556629047.0, + "step": 21503 + }, + { + "epoch": 2.361519877004173, + "grad_norm": 2.143555164337158, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7671312689781189, + "num_tokens": 556653052.0, + "step": 21504 + }, + { + "epoch": 2.361629694706787, + "grad_norm": 2.284791946411133, + "learning_rate": 5e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8015826940536499, + "num_tokens": 556673393.0, + "step": 21505 + }, + { + "epoch": 2.3617395124094003, + "grad_norm": 2.328895092010498, + "learning_rate": 5e-06, + "loss": 0.664, + "mean_token_accuracy": 0.7788982391357422, + "num_tokens": 556695368.0, + "step": 21506 + }, + { + "epoch": 2.361849330112014, + "grad_norm": 2.180791139602661, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7635828256607056, + "num_tokens": 556719806.0, + "step": 21507 + }, + { + "epoch": 2.361959147814628, + "grad_norm": 1.9115723371505737, + "learning_rate": 5e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7518764734268188, + "num_tokens": 556749371.0, + "step": 21508 + }, + { + "epoch": 2.3620689655172415, + "grad_norm": 2.0427136421203613, + "learning_rate": 5e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7473857402801514, + "num_tokens": 556778236.0, + "step": 21509 + }, + { + "epoch": 2.362178783219855, + "grad_norm": 2.0599007606506348, + "learning_rate": 5e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7492824196815491, + "num_tokens": 556807131.0, + "step": 21510 + }, + { + "epoch": 2.3622886009224686, + "grad_norm": 2.2065482139587402, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7696712017059326, + "num_tokens": 556830014.0, + "step": 21511 + }, + { + "epoch": 2.3623984186250824, + "grad_norm": 2.1436123847961426, + "learning_rate": 5e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7496102452278137, + "num_tokens": 556857424.0, + "step": 21512 + }, + { + "epoch": 2.362508236327696, + "grad_norm": 2.0328524112701416, + "learning_rate": 5e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7440091371536255, + "num_tokens": 556886638.0, + "step": 21513 + }, + { + "epoch": 2.3626180540303094, + "grad_norm": 2.0693931579589844, + "learning_rate": 5e-06, + "loss": 0.7697, + "mean_token_accuracy": 0.7526867389678955, + "num_tokens": 556913365.0, + "step": 21514 + }, + { + "epoch": 2.362727871732923, + "grad_norm": 2.019815683364868, + "learning_rate": 5e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7726083993911743, + "num_tokens": 556939688.0, + "step": 21515 + }, + { + "epoch": 2.362837689435537, + "grad_norm": 2.0407893657684326, + "learning_rate": 5e-06, + "loss": 0.6461, + "mean_token_accuracy": 0.7894498109817505, + "num_tokens": 556966067.0, + "step": 21516 + }, + { + "epoch": 2.3629475071381507, + "grad_norm": 1.8962100744247437, + "learning_rate": 5e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7667019963264465, + "num_tokens": 556992339.0, + "step": 21517 + }, + { + "epoch": 2.3630573248407645, + "grad_norm": 2.0625436305999756, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7721602916717529, + "num_tokens": 557018153.0, + "step": 21518 + }, + { + "epoch": 2.363167142543378, + "grad_norm": 1.962854266166687, + "learning_rate": 5e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7538344264030457, + "num_tokens": 557046394.0, + "step": 21519 + }, + { + "epoch": 2.3632769602459915, + "grad_norm": 1.7794698476791382, + "learning_rate": 5e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7354492545127869, + "num_tokens": 557079186.0, + "step": 21520 + }, + { + "epoch": 2.3633867779486053, + "grad_norm": 2.2309436798095703, + "learning_rate": 5e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.771282434463501, + "num_tokens": 557101674.0, + "step": 21521 + }, + { + "epoch": 2.363496595651219, + "grad_norm": 2.145493745803833, + "learning_rate": 5e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7486999034881592, + "num_tokens": 557125220.0, + "step": 21522 + }, + { + "epoch": 2.363606413353833, + "grad_norm": 2.24404239654541, + "learning_rate": 5e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7660708427429199, + "num_tokens": 557147646.0, + "step": 21523 + }, + { + "epoch": 2.363716231056446, + "grad_norm": 2.2313952445983887, + "learning_rate": 5e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.7672994136810303, + "num_tokens": 557170177.0, + "step": 21524 + }, + { + "epoch": 2.36382604875906, + "grad_norm": 2.1994621753692627, + "learning_rate": 5e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.7823044061660767, + "num_tokens": 557192086.0, + "step": 21525 + }, + { + "epoch": 2.3639358664616736, + "grad_norm": 2.1490132808685303, + "learning_rate": 5e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7606685161590576, + "num_tokens": 557220073.0, + "step": 21526 + }, + { + "epoch": 2.3640456841642874, + "grad_norm": 2.149073362350464, + "learning_rate": 5e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7785643339157104, + "num_tokens": 557243662.0, + "step": 21527 + }, + { + "epoch": 2.364155501866901, + "grad_norm": 2.068748712539673, + "learning_rate": 5e-06, + "loss": 0.7428, + "mean_token_accuracy": 0.7556337714195251, + "num_tokens": 557272288.0, + "step": 21528 + }, + { + "epoch": 2.3642653195695145, + "grad_norm": 2.1460320949554443, + "learning_rate": 5e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.7459622025489807, + "num_tokens": 557298300.0, + "step": 21529 + }, + { + "epoch": 2.3643751372721282, + "grad_norm": 2.3834116458892822, + "learning_rate": 5e-06, + "loss": 0.5421, + "mean_token_accuracy": 0.816615104675293, + "num_tokens": 557315482.0, + "step": 21530 + }, + { + "epoch": 2.364484954974742, + "grad_norm": 2.1747446060180664, + "learning_rate": 5e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.757849931716919, + "num_tokens": 557338710.0, + "step": 21531 + }, + { + "epoch": 2.3645947726773557, + "grad_norm": 1.9882245063781738, + "learning_rate": 5e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7622223496437073, + "num_tokens": 557368093.0, + "step": 21532 + }, + { + "epoch": 2.3647045903799695, + "grad_norm": 2.329191207885742, + "learning_rate": 5e-06, + "loss": 0.6404, + "mean_token_accuracy": 0.7863574028015137, + "num_tokens": 557387902.0, + "step": 21533 + }, + { + "epoch": 2.364814408082583, + "grad_norm": 1.9713283777236938, + "learning_rate": 5e-06, + "loss": 0.677, + "mean_token_accuracy": 0.7833849191665649, + "num_tokens": 557415083.0, + "step": 21534 + }, + { + "epoch": 2.3649242257851966, + "grad_norm": 2.201821804046631, + "learning_rate": 5e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7639075517654419, + "num_tokens": 557439226.0, + "step": 21535 + }, + { + "epoch": 2.3650340434878103, + "grad_norm": 2.320291042327881, + "learning_rate": 5e-06, + "loss": 0.5926, + "mean_token_accuracy": 0.806921660900116, + "num_tokens": 557458270.0, + "step": 21536 + }, + { + "epoch": 2.365143861190424, + "grad_norm": 2.45328688621521, + "learning_rate": 5e-06, + "loss": 0.6603, + "mean_token_accuracy": 0.7859600782394409, + "num_tokens": 557477469.0, + "step": 21537 + }, + { + "epoch": 2.3652536788930374, + "grad_norm": 2.139435291290283, + "learning_rate": 5e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7792595624923706, + "num_tokens": 557501265.0, + "step": 21538 + }, + { + "epoch": 2.365363496595651, + "grad_norm": 2.160623073577881, + "learning_rate": 5e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.770037829875946, + "num_tokens": 557524935.0, + "step": 21539 + }, + { + "epoch": 2.365473314298265, + "grad_norm": 2.180692434310913, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7758339643478394, + "num_tokens": 557548948.0, + "step": 21540 + }, + { + "epoch": 2.3655831320008787, + "grad_norm": 2.047628164291382, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7493072152137756, + "num_tokens": 557576022.0, + "step": 21541 + }, + { + "epoch": 2.365692949703492, + "grad_norm": 2.244922161102295, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7716971635818481, + "num_tokens": 557598886.0, + "step": 21542 + }, + { + "epoch": 2.3658027674061057, + "grad_norm": 1.850931167602539, + "learning_rate": 5e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7375171184539795, + "num_tokens": 557633579.0, + "step": 21543 + }, + { + "epoch": 2.3659125851087195, + "grad_norm": 2.064465284347534, + "learning_rate": 5e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7601438760757446, + "num_tokens": 557660021.0, + "step": 21544 + }, + { + "epoch": 2.3660224028113332, + "grad_norm": 2.18298077583313, + "learning_rate": 5e-06, + "loss": 0.6224, + "mean_token_accuracy": 0.7928330898284912, + "num_tokens": 557683304.0, + "step": 21545 + }, + { + "epoch": 2.366132220513947, + "grad_norm": 2.015532970428467, + "learning_rate": 5e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.789441704750061, + "num_tokens": 557708479.0, + "step": 21546 + }, + { + "epoch": 2.3662420382165603, + "grad_norm": 2.148864507675171, + "learning_rate": 5e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7681647539138794, + "num_tokens": 557732801.0, + "step": 21547 + }, + { + "epoch": 2.366351855919174, + "grad_norm": 2.2580695152282715, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7722315192222595, + "num_tokens": 557755566.0, + "step": 21548 + }, + { + "epoch": 2.366461673621788, + "grad_norm": 2.3173561096191406, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7643170952796936, + "num_tokens": 557780360.0, + "step": 21549 + }, + { + "epoch": 2.3665714913244016, + "grad_norm": 1.9074567556381226, + "learning_rate": 5e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.7681224346160889, + "num_tokens": 557812631.0, + "step": 21550 + }, + { + "epoch": 2.3666813090270153, + "grad_norm": 1.8294405937194824, + "learning_rate": 5e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7764150500297546, + "num_tokens": 557843667.0, + "step": 21551 + }, + { + "epoch": 2.3667911267296287, + "grad_norm": 2.2958221435546875, + "learning_rate": 5e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7749654054641724, + "num_tokens": 557866062.0, + "step": 21552 + }, + { + "epoch": 2.3669009444322424, + "grad_norm": 1.9671427011489868, + "learning_rate": 5e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7442772388458252, + "num_tokens": 557894269.0, + "step": 21553 + }, + { + "epoch": 2.367010762134856, + "grad_norm": 2.0162415504455566, + "learning_rate": 5e-06, + "loss": 0.6946, + "mean_token_accuracy": 0.769281268119812, + "num_tokens": 557919565.0, + "step": 21554 + }, + { + "epoch": 2.36712057983747, + "grad_norm": 2.360168218612671, + "learning_rate": 5e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.7718652486801147, + "num_tokens": 557940178.0, + "step": 21555 + }, + { + "epoch": 2.3672303975400837, + "grad_norm": 2.1057281494140625, + "learning_rate": 5e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7652748823165894, + "num_tokens": 557966620.0, + "step": 21556 + }, + { + "epoch": 2.367340215242697, + "grad_norm": 2.066502094268799, + "learning_rate": 5e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7638506889343262, + "num_tokens": 557994439.0, + "step": 21557 + }, + { + "epoch": 2.3674500329453108, + "grad_norm": 2.2633368968963623, + "learning_rate": 5e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7562715411186218, + "num_tokens": 558017907.0, + "step": 21558 + }, + { + "epoch": 2.3675598506479245, + "grad_norm": 2.346679210662842, + "learning_rate": 5e-06, + "loss": 0.6345, + "mean_token_accuracy": 0.7841818928718567, + "num_tokens": 558037633.0, + "step": 21559 + }, + { + "epoch": 2.3676696683505383, + "grad_norm": 1.8385629653930664, + "learning_rate": 5e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7650378942489624, + "num_tokens": 558070007.0, + "step": 21560 + }, + { + "epoch": 2.3677794860531516, + "grad_norm": 2.1182384490966797, + "learning_rate": 5e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.7771711349487305, + "num_tokens": 558095965.0, + "step": 21561 + }, + { + "epoch": 2.3678893037557653, + "grad_norm": 1.9370890855789185, + "learning_rate": 5e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.7656456828117371, + "num_tokens": 558126212.0, + "step": 21562 + }, + { + "epoch": 2.367999121458379, + "grad_norm": 2.2901203632354736, + "learning_rate": 5e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.7890769243240356, + "num_tokens": 558146885.0, + "step": 21563 + }, + { + "epoch": 2.368108939160993, + "grad_norm": 2.2444803714752197, + "learning_rate": 5e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.7803040146827698, + "num_tokens": 558170332.0, + "step": 21564 + }, + { + "epoch": 2.368218756863606, + "grad_norm": 2.1186795234680176, + "learning_rate": 5e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7478046417236328, + "num_tokens": 558198491.0, + "step": 21565 + }, + { + "epoch": 2.36832857456622, + "grad_norm": 2.599177837371826, + "learning_rate": 5e-06, + "loss": 0.683, + "mean_token_accuracy": 0.7783447504043579, + "num_tokens": 558217158.0, + "step": 21566 + }, + { + "epoch": 2.3684383922688337, + "grad_norm": 2.115173101425171, + "learning_rate": 5e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.750542402267456, + "num_tokens": 558244219.0, + "step": 21567 + }, + { + "epoch": 2.3685482099714474, + "grad_norm": 2.320361375808716, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7784218788146973, + "num_tokens": 558267065.0, + "step": 21568 + }, + { + "epoch": 2.368658027674061, + "grad_norm": 1.9974454641342163, + "learning_rate": 5e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.753834068775177, + "num_tokens": 558296780.0, + "step": 21569 + }, + { + "epoch": 2.3687678453766745, + "grad_norm": 1.9063016176223755, + "learning_rate": 5e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7750139832496643, + "num_tokens": 558324399.0, + "step": 21570 + }, + { + "epoch": 2.3688776630792883, + "grad_norm": 2.1822588443756104, + "learning_rate": 5e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7521682977676392, + "num_tokens": 558349624.0, + "step": 21571 + }, + { + "epoch": 2.368987480781902, + "grad_norm": 2.1005239486694336, + "learning_rate": 5e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7564476728439331, + "num_tokens": 558374761.0, + "step": 21572 + }, + { + "epoch": 2.369097298484516, + "grad_norm": 1.780257225036621, + "learning_rate": 5e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7635490894317627, + "num_tokens": 558405927.0, + "step": 21573 + }, + { + "epoch": 2.3692071161871295, + "grad_norm": 2.0991241931915283, + "learning_rate": 5e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7464506030082703, + "num_tokens": 558433246.0, + "step": 21574 + }, + { + "epoch": 2.369316933889743, + "grad_norm": 2.125349521636963, + "learning_rate": 5e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7809121608734131, + "num_tokens": 558455870.0, + "step": 21575 + }, + { + "epoch": 2.3694267515923566, + "grad_norm": 2.0530741214752197, + "learning_rate": 5e-06, + "loss": 0.6628, + "mean_token_accuracy": 0.7770251035690308, + "num_tokens": 558480655.0, + "step": 21576 + }, + { + "epoch": 2.3695365692949704, + "grad_norm": 2.143059015274048, + "learning_rate": 5e-06, + "loss": 0.6433, + "mean_token_accuracy": 0.7836799621582031, + "num_tokens": 558505204.0, + "step": 21577 + }, + { + "epoch": 2.369646386997584, + "grad_norm": 1.988214373588562, + "learning_rate": 5e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7629898190498352, + "num_tokens": 558534150.0, + "step": 21578 + }, + { + "epoch": 2.369756204700198, + "grad_norm": 2.11448335647583, + "learning_rate": 5e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8014937043190002, + "num_tokens": 558557068.0, + "step": 21579 + }, + { + "epoch": 2.369866022402811, + "grad_norm": 1.9343217611312866, + "learning_rate": 5e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7628047466278076, + "num_tokens": 558586136.0, + "step": 21580 + }, + { + "epoch": 2.369975840105425, + "grad_norm": 2.06735897064209, + "learning_rate": 5e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7600512504577637, + "num_tokens": 558618211.0, + "step": 21581 + }, + { + "epoch": 2.3700856578080387, + "grad_norm": 2.4090871810913086, + "learning_rate": 5e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.7669382691383362, + "num_tokens": 558640399.0, + "step": 21582 + }, + { + "epoch": 2.3701954755106525, + "grad_norm": 2.1046926975250244, + "learning_rate": 5e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7545480132102966, + "num_tokens": 558669315.0, + "step": 21583 + }, + { + "epoch": 2.370305293213266, + "grad_norm": 2.2210781574249268, + "learning_rate": 5e-06, + "loss": 0.6431, + "mean_token_accuracy": 0.7822416424751282, + "num_tokens": 558691614.0, + "step": 21584 + }, + { + "epoch": 2.3704151109158795, + "grad_norm": 2.1375300884246826, + "learning_rate": 5e-06, + "loss": 0.6921, + "mean_token_accuracy": 0.7671107053756714, + "num_tokens": 558716657.0, + "step": 21585 + }, + { + "epoch": 2.3705249286184933, + "grad_norm": 2.0046169757843018, + "learning_rate": 5e-06, + "loss": 0.668, + "mean_token_accuracy": 0.7809596061706543, + "num_tokens": 558740857.0, + "step": 21586 + }, + { + "epoch": 2.370634746321107, + "grad_norm": 1.958886981010437, + "learning_rate": 5e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.7600691318511963, + "num_tokens": 558770805.0, + "step": 21587 + }, + { + "epoch": 2.370744564023721, + "grad_norm": 1.9260505437850952, + "learning_rate": 5e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7514159679412842, + "num_tokens": 558802883.0, + "step": 21588 + }, + { + "epoch": 2.370854381726334, + "grad_norm": 2.005131721496582, + "learning_rate": 5e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.7748661041259766, + "num_tokens": 558827446.0, + "step": 21589 + }, + { + "epoch": 2.370964199428948, + "grad_norm": 2.1344900131225586, + "learning_rate": 5e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7693890333175659, + "num_tokens": 558852085.0, + "step": 21590 + }, + { + "epoch": 2.3710740171315616, + "grad_norm": 2.2604546546936035, + "learning_rate": 5e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.7773191928863525, + "num_tokens": 558874873.0, + "step": 21591 + }, + { + "epoch": 2.3711838348341754, + "grad_norm": 1.9332609176635742, + "learning_rate": 5e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.752049446105957, + "num_tokens": 558904747.0, + "step": 21592 + }, + { + "epoch": 2.3712936525367887, + "grad_norm": 2.1991331577301025, + "learning_rate": 5e-06, + "loss": 0.5959, + "mean_token_accuracy": 0.7983123064041138, + "num_tokens": 558928285.0, + "step": 21593 + }, + { + "epoch": 2.3714034702394025, + "grad_norm": 2.1983697414398193, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7460346221923828, + "num_tokens": 558953940.0, + "step": 21594 + }, + { + "epoch": 2.371513287942016, + "grad_norm": 1.7856028079986572, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.779843807220459, + "num_tokens": 558985735.0, + "step": 21595 + }, + { + "epoch": 2.37162310564463, + "grad_norm": 1.9069514274597168, + "learning_rate": 5e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7550482749938965, + "num_tokens": 559014392.0, + "step": 21596 + }, + { + "epoch": 2.3717329233472437, + "grad_norm": 2.2220702171325684, + "learning_rate": 5e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.7783902287483215, + "num_tokens": 559036774.0, + "step": 21597 + }, + { + "epoch": 2.371842741049857, + "grad_norm": 2.272533655166626, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.773989737033844, + "num_tokens": 559058387.0, + "step": 21598 + }, + { + "epoch": 2.371952558752471, + "grad_norm": 2.215210199356079, + "learning_rate": 5e-06, + "loss": 0.6654, + "mean_token_accuracy": 0.7800191640853882, + "num_tokens": 559081490.0, + "step": 21599 + }, + { + "epoch": 2.3720623764550846, + "grad_norm": 2.5643887519836426, + "learning_rate": 5e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.7810264229774475, + "num_tokens": 559100401.0, + "step": 21600 + }, + { + "epoch": 2.3721721941576983, + "grad_norm": 1.8814213275909424, + "learning_rate": 5e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7664950489997864, + "num_tokens": 559131741.0, + "step": 21601 + }, + { + "epoch": 2.372282011860312, + "grad_norm": 2.0068867206573486, + "learning_rate": 5e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7554084062576294, + "num_tokens": 559160883.0, + "step": 21602 + }, + { + "epoch": 2.3723918295629254, + "grad_norm": 2.00801157951355, + "learning_rate": 5e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7574310898780823, + "num_tokens": 559187805.0, + "step": 21603 + }, + { + "epoch": 2.372501647265539, + "grad_norm": 2.3078629970550537, + "learning_rate": 5e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.779995322227478, + "num_tokens": 559209091.0, + "step": 21604 + }, + { + "epoch": 2.372611464968153, + "grad_norm": 2.154186487197876, + "learning_rate": 5e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7508611679077148, + "num_tokens": 559236306.0, + "step": 21605 + }, + { + "epoch": 2.3727212826707667, + "grad_norm": 2.187495231628418, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7533244490623474, + "num_tokens": 559260544.0, + "step": 21606 + }, + { + "epoch": 2.3728311003733804, + "grad_norm": 2.100440263748169, + "learning_rate": 5e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7446694374084473, + "num_tokens": 559288862.0, + "step": 21607 + }, + { + "epoch": 2.3729409180759937, + "grad_norm": 2.0747852325439453, + "learning_rate": 5e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7772118449211121, + "num_tokens": 559314209.0, + "step": 21608 + }, + { + "epoch": 2.3730507357786075, + "grad_norm": 2.2431440353393555, + "learning_rate": 5e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7782893180847168, + "num_tokens": 559337824.0, + "step": 21609 + }, + { + "epoch": 2.3731605534812212, + "grad_norm": 2.194167375564575, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7471156120300293, + "num_tokens": 559363861.0, + "step": 21610 + }, + { + "epoch": 2.373270371183835, + "grad_norm": 2.2942206859588623, + "learning_rate": 5e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.7796722650527954, + "num_tokens": 559387345.0, + "step": 21611 + }, + { + "epoch": 2.3733801888864483, + "grad_norm": 2.1625759601593018, + "learning_rate": 5e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7545881271362305, + "num_tokens": 559416849.0, + "step": 21612 + }, + { + "epoch": 2.373490006589062, + "grad_norm": 1.9876127243041992, + "learning_rate": 5e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7700133323669434, + "num_tokens": 559441477.0, + "step": 21613 + }, + { + "epoch": 2.373599824291676, + "grad_norm": 1.8890712261199951, + "learning_rate": 5e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7663140296936035, + "num_tokens": 559473335.0, + "step": 21614 + }, + { + "epoch": 2.3737096419942896, + "grad_norm": 1.9270145893096924, + "learning_rate": 5e-06, + "loss": 0.67, + "mean_token_accuracy": 0.782227635383606, + "num_tokens": 559498385.0, + "step": 21615 + }, + { + "epoch": 2.3738194596969033, + "grad_norm": 2.1491587162017822, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7708147764205933, + "num_tokens": 559522422.0, + "step": 21616 + }, + { + "epoch": 2.3739292773995166, + "grad_norm": 2.167750358581543, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7716736793518066, + "num_tokens": 559546757.0, + "step": 21617 + }, + { + "epoch": 2.3740390951021304, + "grad_norm": 2.1763484477996826, + "learning_rate": 5e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.766904890537262, + "num_tokens": 559570977.0, + "step": 21618 + }, + { + "epoch": 2.374148912804744, + "grad_norm": 2.335292100906372, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7570062279701233, + "num_tokens": 559594331.0, + "step": 21619 + }, + { + "epoch": 2.374258730507358, + "grad_norm": 2.050516366958618, + "learning_rate": 5e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7565167546272278, + "num_tokens": 559621715.0, + "step": 21620 + }, + { + "epoch": 2.3743685482099712, + "grad_norm": 2.0983119010925293, + "learning_rate": 5e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.7763654589653015, + "num_tokens": 559644543.0, + "step": 21621 + }, + { + "epoch": 2.374478365912585, + "grad_norm": 2.4988293647766113, + "learning_rate": 5e-06, + "loss": 0.6456, + "mean_token_accuracy": 0.7799079418182373, + "num_tokens": 559662900.0, + "step": 21622 + }, + { + "epoch": 2.3745881836151987, + "grad_norm": 1.963911771774292, + "learning_rate": 5e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7583732008934021, + "num_tokens": 559694012.0, + "step": 21623 + }, + { + "epoch": 2.3746980013178125, + "grad_norm": 1.8993147611618042, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7778750658035278, + "num_tokens": 559722928.0, + "step": 21624 + }, + { + "epoch": 2.3748078190204263, + "grad_norm": 1.996958613395691, + "learning_rate": 5e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.750851035118103, + "num_tokens": 559752173.0, + "step": 21625 + }, + { + "epoch": 2.3749176367230396, + "grad_norm": 2.1578311920166016, + "learning_rate": 5e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.7627783417701721, + "num_tokens": 559777367.0, + "step": 21626 + }, + { + "epoch": 2.3750274544256533, + "grad_norm": 2.034407138824463, + "learning_rate": 5e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.7926148176193237, + "num_tokens": 559801520.0, + "step": 21627 + }, + { + "epoch": 2.375137272128267, + "grad_norm": 2.0105721950531006, + "learning_rate": 5e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7590712904930115, + "num_tokens": 559826449.0, + "step": 21628 + }, + { + "epoch": 2.375247089830881, + "grad_norm": 1.777419924736023, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7729241847991943, + "num_tokens": 559859954.0, + "step": 21629 + }, + { + "epoch": 2.3753569075334946, + "grad_norm": 2.1474852561950684, + "learning_rate": 5e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7618057727813721, + "num_tokens": 559886295.0, + "step": 21630 + }, + { + "epoch": 2.375466725236108, + "grad_norm": 1.7360187768936157, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7465000152587891, + "num_tokens": 559923859.0, + "step": 21631 + }, + { + "epoch": 2.3755765429387217, + "grad_norm": 2.1626195907592773, + "learning_rate": 5e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7559394836425781, + "num_tokens": 559947915.0, + "step": 21632 + }, + { + "epoch": 2.3756863606413354, + "grad_norm": 1.9281562566757202, + "learning_rate": 5e-06, + "loss": 0.79, + "mean_token_accuracy": 0.7414189577102661, + "num_tokens": 559980964.0, + "step": 21633 + }, + { + "epoch": 2.375796178343949, + "grad_norm": 2.4134085178375244, + "learning_rate": 5e-06, + "loss": 0.6073, + "mean_token_accuracy": 0.7955219745635986, + "num_tokens": 560000356.0, + "step": 21634 + }, + { + "epoch": 2.375905996046563, + "grad_norm": 1.756316065788269, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7632761001586914, + "num_tokens": 560033745.0, + "step": 21635 + }, + { + "epoch": 2.3760158137491763, + "grad_norm": 2.19203782081604, + "learning_rate": 5e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7638953924179077, + "num_tokens": 560057226.0, + "step": 21636 + }, + { + "epoch": 2.37612563145179, + "grad_norm": 2.226083278656006, + "learning_rate": 5e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.758380651473999, + "num_tokens": 560081004.0, + "step": 21637 + }, + { + "epoch": 2.3762354491544038, + "grad_norm": 2.2226202487945557, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7659903764724731, + "num_tokens": 560103143.0, + "step": 21638 + }, + { + "epoch": 2.3763452668570175, + "grad_norm": 2.2629165649414062, + "learning_rate": 5e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7679077386856079, + "num_tokens": 560127620.0, + "step": 21639 + }, + { + "epoch": 2.376455084559631, + "grad_norm": 1.7596133947372437, + "learning_rate": 5e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7332358360290527, + "num_tokens": 560163012.0, + "step": 21640 + }, + { + "epoch": 2.3765649022622446, + "grad_norm": 1.7688039541244507, + "learning_rate": 5e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7283544540405273, + "num_tokens": 560199191.0, + "step": 21641 + }, + { + "epoch": 2.3766747199648584, + "grad_norm": 2.0024709701538086, + "learning_rate": 5e-06, + "loss": 0.5841, + "mean_token_accuracy": 0.8035893440246582, + "num_tokens": 560222175.0, + "step": 21642 + }, + { + "epoch": 2.376784537667472, + "grad_norm": 2.1415112018585205, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7666626572608948, + "num_tokens": 560248109.0, + "step": 21643 + }, + { + "epoch": 2.3768943553700854, + "grad_norm": 2.1846721172332764, + "learning_rate": 5e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7604878544807434, + "num_tokens": 560273199.0, + "step": 21644 + }, + { + "epoch": 2.377004173072699, + "grad_norm": 2.156680107116699, + "learning_rate": 5e-06, + "loss": 0.6297, + "mean_token_accuracy": 0.7919464111328125, + "num_tokens": 560295419.0, + "step": 21645 + }, + { + "epoch": 2.377113990775313, + "grad_norm": 2.1511335372924805, + "learning_rate": 5e-06, + "loss": 0.7974, + "mean_token_accuracy": 0.7433870434761047, + "num_tokens": 560323820.0, + "step": 21646 + }, + { + "epoch": 2.3772238084779267, + "grad_norm": 2.1173038482666016, + "learning_rate": 5e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7500824928283691, + "num_tokens": 560349026.0, + "step": 21647 + }, + { + "epoch": 2.3773336261805404, + "grad_norm": 2.101492404937744, + "learning_rate": 5e-06, + "loss": 0.6193, + "mean_token_accuracy": 0.7917597889900208, + "num_tokens": 560374233.0, + "step": 21648 + }, + { + "epoch": 2.3774434438831538, + "grad_norm": 2.4868364334106445, + "learning_rate": 5e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.7932413816452026, + "num_tokens": 560393444.0, + "step": 21649 + }, + { + "epoch": 2.3775532615857675, + "grad_norm": 2.2521164417266846, + "learning_rate": 5e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7540934681892395, + "num_tokens": 560417134.0, + "step": 21650 + }, + { + "epoch": 2.3776630792883813, + "grad_norm": 2.030662775039673, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7749979496002197, + "num_tokens": 560444037.0, + "step": 21651 + }, + { + "epoch": 2.377772896990995, + "grad_norm": 2.5780222415924072, + "learning_rate": 5e-06, + "loss": 0.66, + "mean_token_accuracy": 0.776962161064148, + "num_tokens": 560462614.0, + "step": 21652 + }, + { + "epoch": 2.377882714693609, + "grad_norm": 2.117718458175659, + "learning_rate": 5e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7573114633560181, + "num_tokens": 560491005.0, + "step": 21653 + }, + { + "epoch": 2.377992532396222, + "grad_norm": 2.0066773891448975, + "learning_rate": 5e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.7746880054473877, + "num_tokens": 560519707.0, + "step": 21654 + }, + { + "epoch": 2.378102350098836, + "grad_norm": 1.922224521636963, + "learning_rate": 5e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7538138031959534, + "num_tokens": 560547476.0, + "step": 21655 + }, + { + "epoch": 2.3782121678014496, + "grad_norm": 1.976409912109375, + "learning_rate": 5e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7692393064498901, + "num_tokens": 560574820.0, + "step": 21656 + }, + { + "epoch": 2.3783219855040634, + "grad_norm": 1.9203860759735107, + "learning_rate": 5e-06, + "loss": 0.6275, + "mean_token_accuracy": 0.794014573097229, + "num_tokens": 560602086.0, + "step": 21657 + }, + { + "epoch": 2.378431803206677, + "grad_norm": 2.36177396774292, + "learning_rate": 5e-06, + "loss": 0.6824, + "mean_token_accuracy": 0.7758828401565552, + "num_tokens": 560621944.0, + "step": 21658 + }, + { + "epoch": 2.3785416209092904, + "grad_norm": 2.193016290664673, + "learning_rate": 5e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7599895596504211, + "num_tokens": 560645674.0, + "step": 21659 + }, + { + "epoch": 2.378651438611904, + "grad_norm": 2.2534823417663574, + "learning_rate": 5e-06, + "loss": 0.6159, + "mean_token_accuracy": 0.7981444597244263, + "num_tokens": 560665912.0, + "step": 21660 + }, + { + "epoch": 2.378761256314518, + "grad_norm": 1.9874836206436157, + "learning_rate": 5e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.763146698474884, + "num_tokens": 560693875.0, + "step": 21661 + }, + { + "epoch": 2.3788710740171317, + "grad_norm": 2.237668037414551, + "learning_rate": 5e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7783557772636414, + "num_tokens": 560716903.0, + "step": 21662 + }, + { + "epoch": 2.3789808917197455, + "grad_norm": 1.9235949516296387, + "learning_rate": 5e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7689944505691528, + "num_tokens": 560744169.0, + "step": 21663 + }, + { + "epoch": 2.379090709422359, + "grad_norm": 2.3361656665802, + "learning_rate": 5e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7597019672393799, + "num_tokens": 560764915.0, + "step": 21664 + }, + { + "epoch": 2.3792005271249725, + "grad_norm": 2.2134969234466553, + "learning_rate": 5e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7644811272621155, + "num_tokens": 560790690.0, + "step": 21665 + }, + { + "epoch": 2.3793103448275863, + "grad_norm": 2.2033188343048096, + "learning_rate": 5e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7555006742477417, + "num_tokens": 560815210.0, + "step": 21666 + }, + { + "epoch": 2.3794201625302, + "grad_norm": 2.3258347511291504, + "learning_rate": 5e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7687168121337891, + "num_tokens": 560837572.0, + "step": 21667 + }, + { + "epoch": 2.3795299802328134, + "grad_norm": 2.07311749458313, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7628121376037598, + "num_tokens": 560865510.0, + "step": 21668 + }, + { + "epoch": 2.379639797935427, + "grad_norm": 1.9407877922058105, + "learning_rate": 5e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7377432584762573, + "num_tokens": 560895737.0, + "step": 21669 + }, + { + "epoch": 2.379749615638041, + "grad_norm": 2.0990614891052246, + "learning_rate": 5e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7621293067932129, + "num_tokens": 560920060.0, + "step": 21670 + }, + { + "epoch": 2.3798594333406546, + "grad_norm": 2.3123698234558105, + "learning_rate": 5e-06, + "loss": 0.6763, + "mean_token_accuracy": 0.7763063311576843, + "num_tokens": 560940551.0, + "step": 21671 + }, + { + "epoch": 2.379969251043268, + "grad_norm": 2.1843721866607666, + "learning_rate": 5e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7698200941085815, + "num_tokens": 560964392.0, + "step": 21672 + }, + { + "epoch": 2.3800790687458817, + "grad_norm": 1.8950941562652588, + "learning_rate": 5e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7472563982009888, + "num_tokens": 560995530.0, + "step": 21673 + }, + { + "epoch": 2.3801888864484955, + "grad_norm": 2.4879205226898193, + "learning_rate": 5e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7824453115463257, + "num_tokens": 561014218.0, + "step": 21674 + }, + { + "epoch": 2.3802987041511092, + "grad_norm": 2.315889358520508, + "learning_rate": 5e-06, + "loss": 0.657, + "mean_token_accuracy": 0.7852051258087158, + "num_tokens": 561033872.0, + "step": 21675 + }, + { + "epoch": 2.380408521853723, + "grad_norm": 2.1794207096099854, + "learning_rate": 5e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7721058130264282, + "num_tokens": 561058858.0, + "step": 21676 + }, + { + "epoch": 2.3805183395563363, + "grad_norm": 2.2324085235595703, + "learning_rate": 5e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7609863877296448, + "num_tokens": 561082748.0, + "step": 21677 + }, + { + "epoch": 2.38062815725895, + "grad_norm": 2.493014335632324, + "learning_rate": 5e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.7745830416679382, + "num_tokens": 561102775.0, + "step": 21678 + }, + { + "epoch": 2.380737974961564, + "grad_norm": 1.860640048980713, + "learning_rate": 5e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7564805746078491, + "num_tokens": 561134774.0, + "step": 21679 + }, + { + "epoch": 2.3808477926641776, + "grad_norm": 2.256399393081665, + "learning_rate": 5e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.759424090385437, + "num_tokens": 561159477.0, + "step": 21680 + }, + { + "epoch": 2.3809576103667913, + "grad_norm": 2.2392125129699707, + "learning_rate": 5e-06, + "loss": 0.6643, + "mean_token_accuracy": 0.7842584848403931, + "num_tokens": 561182696.0, + "step": 21681 + }, + { + "epoch": 2.3810674280694046, + "grad_norm": 2.337888717651367, + "learning_rate": 5e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.7919817566871643, + "num_tokens": 561203294.0, + "step": 21682 + }, + { + "epoch": 2.3811772457720184, + "grad_norm": 2.288938522338867, + "learning_rate": 5e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7761975526809692, + "num_tokens": 561224753.0, + "step": 21683 + }, + { + "epoch": 2.381287063474632, + "grad_norm": 2.044666051864624, + "learning_rate": 5e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7432920932769775, + "num_tokens": 561251731.0, + "step": 21684 + }, + { + "epoch": 2.381396881177246, + "grad_norm": 2.315534830093384, + "learning_rate": 5e-06, + "loss": 0.6689, + "mean_token_accuracy": 0.7791510820388794, + "num_tokens": 561273763.0, + "step": 21685 + }, + { + "epoch": 2.3815066988798597, + "grad_norm": 2.232667922973633, + "learning_rate": 5e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.7758103609085083, + "num_tokens": 561295197.0, + "step": 21686 + }, + { + "epoch": 2.381616516582473, + "grad_norm": 2.1164188385009766, + "learning_rate": 5e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7481194138526917, + "num_tokens": 561322110.0, + "step": 21687 + }, + { + "epoch": 2.3817263342850867, + "grad_norm": 2.3849148750305176, + "learning_rate": 5e-06, + "loss": 0.6516, + "mean_token_accuracy": 0.7824170589447021, + "num_tokens": 561342803.0, + "step": 21688 + }, + { + "epoch": 2.3818361519877005, + "grad_norm": 2.164625644683838, + "learning_rate": 5e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7751279473304749, + "num_tokens": 561364086.0, + "step": 21689 + }, + { + "epoch": 2.3819459696903142, + "grad_norm": 2.1492512226104736, + "learning_rate": 5e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.7801808714866638, + "num_tokens": 561388066.0, + "step": 21690 + }, + { + "epoch": 2.3820557873929276, + "grad_norm": 1.9348742961883545, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7452439069747925, + "num_tokens": 561420041.0, + "step": 21691 + }, + { + "epoch": 2.3821656050955413, + "grad_norm": 2.323740243911743, + "learning_rate": 5e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.7848174571990967, + "num_tokens": 561442197.0, + "step": 21692 + }, + { + "epoch": 2.382275422798155, + "grad_norm": 2.2337069511413574, + "learning_rate": 5e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.760467529296875, + "num_tokens": 561466900.0, + "step": 21693 + }, + { + "epoch": 2.382385240500769, + "grad_norm": 2.0365889072418213, + "learning_rate": 5e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.7686830163002014, + "num_tokens": 561492641.0, + "step": 21694 + }, + { + "epoch": 2.382495058203382, + "grad_norm": 2.5284054279327393, + "learning_rate": 5e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.7766517400741577, + "num_tokens": 561510452.0, + "step": 21695 + }, + { + "epoch": 2.382604875905996, + "grad_norm": 2.2614874839782715, + "learning_rate": 5e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7656229138374329, + "num_tokens": 561532771.0, + "step": 21696 + }, + { + "epoch": 2.3827146936086097, + "grad_norm": 1.9047491550445557, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.763531506061554, + "num_tokens": 561560654.0, + "step": 21697 + }, + { + "epoch": 2.3828245113112234, + "grad_norm": 2.124189615249634, + "learning_rate": 5e-06, + "loss": 0.6733, + "mean_token_accuracy": 0.7777608633041382, + "num_tokens": 561584491.0, + "step": 21698 + }, + { + "epoch": 2.382934329013837, + "grad_norm": 2.019294500350952, + "learning_rate": 5e-06, + "loss": 0.6842, + "mean_token_accuracy": 0.7790136337280273, + "num_tokens": 561610415.0, + "step": 21699 + }, + { + "epoch": 2.3830441467164505, + "grad_norm": 2.179865837097168, + "learning_rate": 5e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.7723904252052307, + "num_tokens": 561635465.0, + "step": 21700 + }, + { + "epoch": 2.3831539644190642, + "grad_norm": 1.8582180738449097, + "learning_rate": 5e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.757483720779419, + "num_tokens": 561666991.0, + "step": 21701 + }, + { + "epoch": 2.383263782121678, + "grad_norm": 2.0931618213653564, + "learning_rate": 5e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.7795625329017639, + "num_tokens": 561692130.0, + "step": 21702 + }, + { + "epoch": 2.3833735998242918, + "grad_norm": 2.2209646701812744, + "learning_rate": 5e-06, + "loss": 0.6349, + "mean_token_accuracy": 0.7835559844970703, + "num_tokens": 561713705.0, + "step": 21703 + }, + { + "epoch": 2.3834834175269055, + "grad_norm": 2.4941208362579346, + "learning_rate": 5e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7719801068305969, + "num_tokens": 561731142.0, + "step": 21704 + }, + { + "epoch": 2.383593235229519, + "grad_norm": 2.1817989349365234, + "learning_rate": 5e-06, + "loss": 0.637, + "mean_token_accuracy": 0.7861006259918213, + "num_tokens": 561754196.0, + "step": 21705 + }, + { + "epoch": 2.3837030529321326, + "grad_norm": 2.3212504386901855, + "learning_rate": 5e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.7713476419448853, + "num_tokens": 561773823.0, + "step": 21706 + }, + { + "epoch": 2.3838128706347463, + "grad_norm": 2.5247247219085693, + "learning_rate": 5e-06, + "loss": 0.6373, + "mean_token_accuracy": 0.7868419885635376, + "num_tokens": 561793247.0, + "step": 21707 + }, + { + "epoch": 2.38392268833736, + "grad_norm": 2.225809097290039, + "learning_rate": 5e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7612298727035522, + "num_tokens": 561819006.0, + "step": 21708 + }, + { + "epoch": 2.384032506039974, + "grad_norm": 1.978047490119934, + "learning_rate": 5e-06, + "loss": 0.615, + "mean_token_accuracy": 0.8044622540473938, + "num_tokens": 561846021.0, + "step": 21709 + }, + { + "epoch": 2.384142323742587, + "grad_norm": 1.9125276803970337, + "learning_rate": 5e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7726292610168457, + "num_tokens": 561874616.0, + "step": 21710 + }, + { + "epoch": 2.384252141445201, + "grad_norm": 1.8980515003204346, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.7720513343811035, + "num_tokens": 561905567.0, + "step": 21711 + }, + { + "epoch": 2.3843619591478147, + "grad_norm": 2.14216947555542, + "learning_rate": 5e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7560818195343018, + "num_tokens": 561932754.0, + "step": 21712 + }, + { + "epoch": 2.3844717768504284, + "grad_norm": 2.0771751403808594, + "learning_rate": 5e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7693188190460205, + "num_tokens": 561958510.0, + "step": 21713 + }, + { + "epoch": 2.384581594553042, + "grad_norm": 2.4065141677856445, + "learning_rate": 5e-06, + "loss": 0.5974, + "mean_token_accuracy": 0.7959513664245605, + "num_tokens": 561977392.0, + "step": 21714 + }, + { + "epoch": 2.3846914122556555, + "grad_norm": 2.020263910293579, + "learning_rate": 5e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7566784620285034, + "num_tokens": 562005392.0, + "step": 21715 + }, + { + "epoch": 2.3848012299582693, + "grad_norm": 2.130896806716919, + "learning_rate": 5e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.781424880027771, + "num_tokens": 562030875.0, + "step": 21716 + }, + { + "epoch": 2.384911047660883, + "grad_norm": 2.0154638290405273, + "learning_rate": 5e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.7401077151298523, + "num_tokens": 562062619.0, + "step": 21717 + }, + { + "epoch": 2.385020865363497, + "grad_norm": 1.9593364000320435, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.762282133102417, + "num_tokens": 562089415.0, + "step": 21718 + }, + { + "epoch": 2.38513068306611, + "grad_norm": 1.9726991653442383, + "learning_rate": 5e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7654390931129456, + "num_tokens": 562117438.0, + "step": 21719 + }, + { + "epoch": 2.385240500768724, + "grad_norm": 2.2176930904388428, + "learning_rate": 5e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.774084210395813, + "num_tokens": 562139095.0, + "step": 21720 + }, + { + "epoch": 2.3853503184713376, + "grad_norm": 2.163273334503174, + "learning_rate": 5e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7674828171730042, + "num_tokens": 562162043.0, + "step": 21721 + }, + { + "epoch": 2.3854601361739514, + "grad_norm": 2.114086389541626, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7629321813583374, + "num_tokens": 562188731.0, + "step": 21722 + }, + { + "epoch": 2.3855699538765647, + "grad_norm": 2.1852176189422607, + "learning_rate": 5e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.738297700881958, + "num_tokens": 562213714.0, + "step": 21723 + }, + { + "epoch": 2.3856797715791784, + "grad_norm": 2.0505175590515137, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.766094982624054, + "num_tokens": 562241656.0, + "step": 21724 + }, + { + "epoch": 2.385789589281792, + "grad_norm": 2.0878849029541016, + "learning_rate": 5e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7651206254959106, + "num_tokens": 562267404.0, + "step": 21725 + }, + { + "epoch": 2.385899406984406, + "grad_norm": 1.914405107498169, + "learning_rate": 5e-06, + "loss": 0.784, + "mean_token_accuracy": 0.7413249015808105, + "num_tokens": 562298856.0, + "step": 21726 + }, + { + "epoch": 2.3860092246870197, + "grad_norm": 1.9543578624725342, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7630562782287598, + "num_tokens": 562327517.0, + "step": 21727 + }, + { + "epoch": 2.386119042389633, + "grad_norm": 2.3583133220672607, + "learning_rate": 5e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7703443765640259, + "num_tokens": 562348738.0, + "step": 21728 + }, + { + "epoch": 2.3862288600922468, + "grad_norm": 2.0377609729766846, + "learning_rate": 5e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7683590054512024, + "num_tokens": 562375116.0, + "step": 21729 + }, + { + "epoch": 2.3863386777948605, + "grad_norm": 2.404707193374634, + "learning_rate": 5e-06, + "loss": 0.6322, + "mean_token_accuracy": 0.7842501401901245, + "num_tokens": 562394684.0, + "step": 21730 + }, + { + "epoch": 2.3864484954974743, + "grad_norm": 1.9972048997879028, + "learning_rate": 5e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7562734484672546, + "num_tokens": 562422835.0, + "step": 21731 + }, + { + "epoch": 2.386558313200088, + "grad_norm": 2.055471420288086, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7696627378463745, + "num_tokens": 562449399.0, + "step": 21732 + }, + { + "epoch": 2.3866681309027014, + "grad_norm": 2.4025657176971436, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7715637683868408, + "num_tokens": 562472027.0, + "step": 21733 + }, + { + "epoch": 2.386777948605315, + "grad_norm": 1.9196783304214478, + "learning_rate": 5e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7560194134712219, + "num_tokens": 562500877.0, + "step": 21734 + }, + { + "epoch": 2.386887766307929, + "grad_norm": 2.0998454093933105, + "learning_rate": 5e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7514381408691406, + "num_tokens": 562527148.0, + "step": 21735 + }, + { + "epoch": 2.3869975840105426, + "grad_norm": 2.1596150398254395, + "learning_rate": 5e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7543504238128662, + "num_tokens": 562552620.0, + "step": 21736 + }, + { + "epoch": 2.3871074017131564, + "grad_norm": 1.742638349533081, + "learning_rate": 5e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7389112710952759, + "num_tokens": 562589643.0, + "step": 21737 + }, + { + "epoch": 2.3872172194157697, + "grad_norm": 2.17978835105896, + "learning_rate": 5e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7750290036201477, + "num_tokens": 562613032.0, + "step": 21738 + }, + { + "epoch": 2.3873270371183835, + "grad_norm": 1.90329909324646, + "learning_rate": 5e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7793823480606079, + "num_tokens": 562643667.0, + "step": 21739 + }, + { + "epoch": 2.387436854820997, + "grad_norm": 2.019223213195801, + "learning_rate": 5e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.8045640587806702, + "num_tokens": 562670008.0, + "step": 21740 + }, + { + "epoch": 2.387546672523611, + "grad_norm": 1.9384431838989258, + "learning_rate": 5e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7481654286384583, + "num_tokens": 562702195.0, + "step": 21741 + }, + { + "epoch": 2.3876564902262243, + "grad_norm": 1.931687593460083, + "learning_rate": 5e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7613897323608398, + "num_tokens": 562731996.0, + "step": 21742 + }, + { + "epoch": 2.387766307928838, + "grad_norm": 2.113654375076294, + "learning_rate": 5e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7605215311050415, + "num_tokens": 562758488.0, + "step": 21743 + }, + { + "epoch": 2.387876125631452, + "grad_norm": 2.214884042739868, + "learning_rate": 5e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.7898061275482178, + "num_tokens": 562781127.0, + "step": 21744 + }, + { + "epoch": 2.3879859433340656, + "grad_norm": 2.19093656539917, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7499586343765259, + "num_tokens": 562805982.0, + "step": 21745 + }, + { + "epoch": 2.388095761036679, + "grad_norm": 2.0814387798309326, + "learning_rate": 5e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.760898232460022, + "num_tokens": 562834529.0, + "step": 21746 + }, + { + "epoch": 2.3882055787392926, + "grad_norm": 2.0507187843322754, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7757315039634705, + "num_tokens": 562860137.0, + "step": 21747 + }, + { + "epoch": 2.3883153964419064, + "grad_norm": 2.2544307708740234, + "learning_rate": 5e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7727100849151611, + "num_tokens": 562884038.0, + "step": 21748 + }, + { + "epoch": 2.38842521414452, + "grad_norm": 2.1843340396881104, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7543119192123413, + "num_tokens": 562908011.0, + "step": 21749 + }, + { + "epoch": 2.388535031847134, + "grad_norm": 1.9398391246795654, + "learning_rate": 5e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7564243078231812, + "num_tokens": 562934948.0, + "step": 21750 + }, + { + "epoch": 2.388644849549747, + "grad_norm": 2.2321808338165283, + "learning_rate": 5e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.7796143889427185, + "num_tokens": 562956671.0, + "step": 21751 + }, + { + "epoch": 2.388754667252361, + "grad_norm": 2.013394832611084, + "learning_rate": 5e-06, + "loss": 0.6413, + "mean_token_accuracy": 0.7854595184326172, + "num_tokens": 562983001.0, + "step": 21752 + }, + { + "epoch": 2.3888644849549747, + "grad_norm": 2.218069076538086, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7687588334083557, + "num_tokens": 563005972.0, + "step": 21753 + }, + { + "epoch": 2.3889743026575885, + "grad_norm": 2.0025222301483154, + "learning_rate": 5e-06, + "loss": 0.5812, + "mean_token_accuracy": 0.8041099309921265, + "num_tokens": 563029684.0, + "step": 21754 + }, + { + "epoch": 2.3890841203602022, + "grad_norm": 2.449540138244629, + "learning_rate": 5e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8016359806060791, + "num_tokens": 563047453.0, + "step": 21755 + }, + { + "epoch": 2.3891939380628155, + "grad_norm": 2.1007497310638428, + "learning_rate": 5e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7604786157608032, + "num_tokens": 563074574.0, + "step": 21756 + }, + { + "epoch": 2.3893037557654293, + "grad_norm": 1.920784831047058, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7698013782501221, + "num_tokens": 563104557.0, + "step": 21757 + }, + { + "epoch": 2.389413573468043, + "grad_norm": 2.5219364166259766, + "learning_rate": 5e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7570421099662781, + "num_tokens": 563127264.0, + "step": 21758 + }, + { + "epoch": 2.389523391170657, + "grad_norm": 2.1386330127716064, + "learning_rate": 5e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.778903067111969, + "num_tokens": 563150906.0, + "step": 21759 + }, + { + "epoch": 2.3896332088732706, + "grad_norm": 2.1546380519866943, + "learning_rate": 5e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7853209972381592, + "num_tokens": 563175346.0, + "step": 21760 + }, + { + "epoch": 2.389743026575884, + "grad_norm": 2.2495479583740234, + "learning_rate": 5e-06, + "loss": 0.6189, + "mean_token_accuracy": 0.7886122465133667, + "num_tokens": 563197100.0, + "step": 21761 + }, + { + "epoch": 2.3898528442784976, + "grad_norm": 2.028808355331421, + "learning_rate": 5e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.775685727596283, + "num_tokens": 563224523.0, + "step": 21762 + }, + { + "epoch": 2.3899626619811114, + "grad_norm": 2.3564512729644775, + "learning_rate": 5e-06, + "loss": 0.63, + "mean_token_accuracy": 0.7922934293746948, + "num_tokens": 563246174.0, + "step": 21763 + }, + { + "epoch": 2.390072479683725, + "grad_norm": 2.136972427368164, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7610437870025635, + "num_tokens": 563271058.0, + "step": 21764 + }, + { + "epoch": 2.390182297386339, + "grad_norm": 2.306030035018921, + "learning_rate": 5e-06, + "loss": 0.6597, + "mean_token_accuracy": 0.7815443277359009, + "num_tokens": 563292971.0, + "step": 21765 + }, + { + "epoch": 2.3902921150889522, + "grad_norm": 2.110769748687744, + "learning_rate": 5e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7658889889717102, + "num_tokens": 563318392.0, + "step": 21766 + }, + { + "epoch": 2.390401932791566, + "grad_norm": 1.9851586818695068, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7685407400131226, + "num_tokens": 563345184.0, + "step": 21767 + }, + { + "epoch": 2.3905117504941797, + "grad_norm": 2.0261051654815674, + "learning_rate": 5e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.77237468957901, + "num_tokens": 563373750.0, + "step": 21768 + }, + { + "epoch": 2.3906215681967935, + "grad_norm": 2.0304150581359863, + "learning_rate": 5e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7674599885940552, + "num_tokens": 563400600.0, + "step": 21769 + }, + { + "epoch": 2.390731385899407, + "grad_norm": 2.2401204109191895, + "learning_rate": 5e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.7976483702659607, + "num_tokens": 563423650.0, + "step": 21770 + }, + { + "epoch": 2.3908412036020206, + "grad_norm": 1.901042103767395, + "learning_rate": 5e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.753672182559967, + "num_tokens": 563454086.0, + "step": 21771 + }, + { + "epoch": 2.3909510213046343, + "grad_norm": 2.185940980911255, + "learning_rate": 5e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7804704904556274, + "num_tokens": 563476548.0, + "step": 21772 + }, + { + "epoch": 2.391060839007248, + "grad_norm": 1.9782668352127075, + "learning_rate": 5e-06, + "loss": 0.7087, + "mean_token_accuracy": 0.770727813243866, + "num_tokens": 563504687.0, + "step": 21773 + }, + { + "epoch": 2.3911706567098614, + "grad_norm": 2.2982211112976074, + "learning_rate": 5e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.7891520857810974, + "num_tokens": 563528323.0, + "step": 21774 + }, + { + "epoch": 2.391280474412475, + "grad_norm": 2.227733850479126, + "learning_rate": 5e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.7467509508132935, + "num_tokens": 563553788.0, + "step": 21775 + }, + { + "epoch": 2.391390292115089, + "grad_norm": 2.1558735370635986, + "learning_rate": 5e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7848021984100342, + "num_tokens": 563576734.0, + "step": 21776 + }, + { + "epoch": 2.3915001098177027, + "grad_norm": 2.2907755374908447, + "learning_rate": 5e-06, + "loss": 0.6672, + "mean_token_accuracy": 0.776305079460144, + "num_tokens": 563599262.0, + "step": 21777 + }, + { + "epoch": 2.3916099275203164, + "grad_norm": 1.8402137756347656, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7446068525314331, + "num_tokens": 563633521.0, + "step": 21778 + }, + { + "epoch": 2.3917197452229297, + "grad_norm": 2.039715528488159, + "learning_rate": 5e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7586922645568848, + "num_tokens": 563660799.0, + "step": 21779 + }, + { + "epoch": 2.3918295629255435, + "grad_norm": 2.4446070194244385, + "learning_rate": 5e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7713249325752258, + "num_tokens": 563683079.0, + "step": 21780 + }, + { + "epoch": 2.3919393806281573, + "grad_norm": 2.203211784362793, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.769255518913269, + "num_tokens": 563706943.0, + "step": 21781 + }, + { + "epoch": 2.392049198330771, + "grad_norm": 2.304438352584839, + "learning_rate": 5e-06, + "loss": 0.6498, + "mean_token_accuracy": 0.7890399098396301, + "num_tokens": 563728692.0, + "step": 21782 + }, + { + "epoch": 2.3921590160333848, + "grad_norm": 2.033839464187622, + "learning_rate": 5e-06, + "loss": 0.7159, + "mean_token_accuracy": 0.7634633779525757, + "num_tokens": 563754678.0, + "step": 21783 + }, + { + "epoch": 2.392268833735998, + "grad_norm": 1.9896464347839355, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7715293765068054, + "num_tokens": 563780541.0, + "step": 21784 + }, + { + "epoch": 2.392378651438612, + "grad_norm": 1.8002235889434814, + "learning_rate": 5e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7553093433380127, + "num_tokens": 563813170.0, + "step": 21785 + }, + { + "epoch": 2.3924884691412256, + "grad_norm": 1.8006620407104492, + "learning_rate": 5e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7641733288764954, + "num_tokens": 563845868.0, + "step": 21786 + }, + { + "epoch": 2.3925982868438394, + "grad_norm": 2.250978708267212, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7725907564163208, + "num_tokens": 563870247.0, + "step": 21787 + }, + { + "epoch": 2.392708104546453, + "grad_norm": 2.4003288745880127, + "learning_rate": 5e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7611169815063477, + "num_tokens": 563892893.0, + "step": 21788 + }, + { + "epoch": 2.3928179222490664, + "grad_norm": 1.9684516191482544, + "learning_rate": 5e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7718007564544678, + "num_tokens": 563918523.0, + "step": 21789 + }, + { + "epoch": 2.39292773995168, + "grad_norm": 2.0765578746795654, + "learning_rate": 5e-06, + "loss": 0.6177, + "mean_token_accuracy": 0.790526270866394, + "num_tokens": 563943958.0, + "step": 21790 + }, + { + "epoch": 2.393037557654294, + "grad_norm": 1.9319403171539307, + "learning_rate": 5e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7574739456176758, + "num_tokens": 563971693.0, + "step": 21791 + }, + { + "epoch": 2.3931473753569077, + "grad_norm": 1.766977310180664, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7513247728347778, + "num_tokens": 564004394.0, + "step": 21792 + }, + { + "epoch": 2.393257193059521, + "grad_norm": 2.158010721206665, + "learning_rate": 5e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.77129727602005, + "num_tokens": 564027320.0, + "step": 21793 + }, + { + "epoch": 2.3933670107621348, + "grad_norm": 2.0225911140441895, + "learning_rate": 5e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7550886869430542, + "num_tokens": 564057414.0, + "step": 21794 + }, + { + "epoch": 2.3934768284647485, + "grad_norm": 2.358365535736084, + "learning_rate": 5e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.7838255763053894, + "num_tokens": 564078657.0, + "step": 21795 + }, + { + "epoch": 2.3935866461673623, + "grad_norm": 2.156033992767334, + "learning_rate": 5e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7519327402114868, + "num_tokens": 564103131.0, + "step": 21796 + }, + { + "epoch": 2.393696463869976, + "grad_norm": 2.0521562099456787, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7765374183654785, + "num_tokens": 564129857.0, + "step": 21797 + }, + { + "epoch": 2.3938062815725893, + "grad_norm": 2.1384854316711426, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7724695205688477, + "num_tokens": 564155297.0, + "step": 21798 + }, + { + "epoch": 2.393916099275203, + "grad_norm": 2.1503584384918213, + "learning_rate": 5e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7627286911010742, + "num_tokens": 564180970.0, + "step": 21799 + }, + { + "epoch": 2.394025916977817, + "grad_norm": 2.1477584838867188, + "learning_rate": 5e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.7772988677024841, + "num_tokens": 564203706.0, + "step": 21800 + }, + { + "epoch": 2.3941357346804306, + "grad_norm": 2.0604519844055176, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7487003803253174, + "num_tokens": 564233846.0, + "step": 21801 + }, + { + "epoch": 2.394245552383044, + "grad_norm": 2.218865394592285, + "learning_rate": 5e-06, + "loss": 0.6742, + "mean_token_accuracy": 0.7766624689102173, + "num_tokens": 564257480.0, + "step": 21802 + }, + { + "epoch": 2.3943553700856577, + "grad_norm": 2.1666100025177, + "learning_rate": 5e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7604457139968872, + "num_tokens": 564282781.0, + "step": 21803 + }, + { + "epoch": 2.3944651877882714, + "grad_norm": 2.2654359340667725, + "learning_rate": 5e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7688037753105164, + "num_tokens": 564305906.0, + "step": 21804 + }, + { + "epoch": 2.394575005490885, + "grad_norm": 1.984528660774231, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7507430911064148, + "num_tokens": 564336411.0, + "step": 21805 + }, + { + "epoch": 2.394684823193499, + "grad_norm": 2.024742364883423, + "learning_rate": 5e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7695291638374329, + "num_tokens": 564364724.0, + "step": 21806 + }, + { + "epoch": 2.3947946408961123, + "grad_norm": 1.8889679908752441, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7731673717498779, + "num_tokens": 564395457.0, + "step": 21807 + }, + { + "epoch": 2.394904458598726, + "grad_norm": 2.176666259765625, + "learning_rate": 5e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7511253356933594, + "num_tokens": 564420261.0, + "step": 21808 + }, + { + "epoch": 2.39501427630134, + "grad_norm": 1.9419270753860474, + "learning_rate": 5e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7306225299835205, + "num_tokens": 564454542.0, + "step": 21809 + }, + { + "epoch": 2.3951240940039535, + "grad_norm": 2.0438854694366455, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7669336199760437, + "num_tokens": 564480935.0, + "step": 21810 + }, + { + "epoch": 2.3952339117065673, + "grad_norm": 1.9843708276748657, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7559415698051453, + "num_tokens": 564511312.0, + "step": 21811 + }, + { + "epoch": 2.3953437294091806, + "grad_norm": 2.156614303588867, + "learning_rate": 5e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7599306106567383, + "num_tokens": 564535598.0, + "step": 21812 + }, + { + "epoch": 2.3954535471117944, + "grad_norm": 1.9656147956848145, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.778998076915741, + "num_tokens": 564562475.0, + "step": 21813 + }, + { + "epoch": 2.395563364814408, + "grad_norm": 2.25041127204895, + "learning_rate": 5e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.752531886100769, + "num_tokens": 564590928.0, + "step": 21814 + }, + { + "epoch": 2.395673182517022, + "grad_norm": 2.1210877895355225, + "learning_rate": 5e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7642207741737366, + "num_tokens": 564617883.0, + "step": 21815 + }, + { + "epoch": 2.3957830002196356, + "grad_norm": 2.1484715938568115, + "learning_rate": 5e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7548558712005615, + "num_tokens": 564644723.0, + "step": 21816 + }, + { + "epoch": 2.395892817922249, + "grad_norm": 2.0092616081237793, + "learning_rate": 5e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7748211622238159, + "num_tokens": 564671512.0, + "step": 21817 + }, + { + "epoch": 2.3960026356248627, + "grad_norm": 2.0130975246429443, + "learning_rate": 5e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.7754156589508057, + "num_tokens": 564698251.0, + "step": 21818 + }, + { + "epoch": 2.3961124533274765, + "grad_norm": 1.8650113344192505, + "learning_rate": 5e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7546595931053162, + "num_tokens": 564727285.0, + "step": 21819 + }, + { + "epoch": 2.3962222710300902, + "grad_norm": 2.032127618789673, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7663218975067139, + "num_tokens": 564753535.0, + "step": 21820 + }, + { + "epoch": 2.3963320887327035, + "grad_norm": 2.15512752532959, + "learning_rate": 5e-06, + "loss": 0.6657, + "mean_token_accuracy": 0.7751734256744385, + "num_tokens": 564777225.0, + "step": 21821 + }, + { + "epoch": 2.3964419064353173, + "grad_norm": 2.0312883853912354, + "learning_rate": 5e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7494921684265137, + "num_tokens": 564807687.0, + "step": 21822 + }, + { + "epoch": 2.396551724137931, + "grad_norm": 2.2266685962677, + "learning_rate": 5e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7705623507499695, + "num_tokens": 564831067.0, + "step": 21823 + }, + { + "epoch": 2.396661541840545, + "grad_norm": 2.0731430053710938, + "learning_rate": 5e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7512674927711487, + "num_tokens": 564858160.0, + "step": 21824 + }, + { + "epoch": 2.396771359543158, + "grad_norm": 2.1179239749908447, + "learning_rate": 5e-06, + "loss": 0.6636, + "mean_token_accuracy": 0.7795652151107788, + "num_tokens": 564880741.0, + "step": 21825 + }, + { + "epoch": 2.396881177245772, + "grad_norm": 2.1530497074127197, + "learning_rate": 5e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7663952112197876, + "num_tokens": 564907424.0, + "step": 21826 + }, + { + "epoch": 2.3969909949483856, + "grad_norm": 2.003295421600342, + "learning_rate": 5e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7644129395484924, + "num_tokens": 564936038.0, + "step": 21827 + }, + { + "epoch": 2.3971008126509994, + "grad_norm": 2.224562644958496, + "learning_rate": 5e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.7731415629386902, + "num_tokens": 564961074.0, + "step": 21828 + }, + { + "epoch": 2.397210630353613, + "grad_norm": 2.1192386150360107, + "learning_rate": 5e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.7804034948348999, + "num_tokens": 564985009.0, + "step": 21829 + }, + { + "epoch": 2.3973204480562265, + "grad_norm": 2.090754747390747, + "learning_rate": 5e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.765663206577301, + "num_tokens": 565011193.0, + "step": 21830 + }, + { + "epoch": 2.39743026575884, + "grad_norm": 2.0312352180480957, + "learning_rate": 5e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7519657611846924, + "num_tokens": 565039247.0, + "step": 21831 + }, + { + "epoch": 2.397540083461454, + "grad_norm": 2.093738079071045, + "learning_rate": 5e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7683072090148926, + "num_tokens": 565064501.0, + "step": 21832 + }, + { + "epoch": 2.3976499011640677, + "grad_norm": 2.100170612335205, + "learning_rate": 5e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7627360224723816, + "num_tokens": 565090664.0, + "step": 21833 + }, + { + "epoch": 2.3977597188666815, + "grad_norm": 1.8884505033493042, + "learning_rate": 5e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7336438298225403, + "num_tokens": 565124640.0, + "step": 21834 + }, + { + "epoch": 2.397869536569295, + "grad_norm": 2.0869767665863037, + "learning_rate": 5e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7698110938072205, + "num_tokens": 565150403.0, + "step": 21835 + }, + { + "epoch": 2.3979793542719086, + "grad_norm": 2.011141538619995, + "learning_rate": 5e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7621384859085083, + "num_tokens": 565176191.0, + "step": 21836 + }, + { + "epoch": 2.3980891719745223, + "grad_norm": 2.076054096221924, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7601102590560913, + "num_tokens": 565202545.0, + "step": 21837 + }, + { + "epoch": 2.398198989677136, + "grad_norm": 1.9522624015808105, + "learning_rate": 5e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.7853198647499084, + "num_tokens": 565229089.0, + "step": 21838 + }, + { + "epoch": 2.39830880737975, + "grad_norm": 1.8967580795288086, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7627812623977661, + "num_tokens": 565258615.0, + "step": 21839 + }, + { + "epoch": 2.398418625082363, + "grad_norm": 2.4333484172821045, + "learning_rate": 5e-06, + "loss": 0.6344, + "mean_token_accuracy": 0.7842074036598206, + "num_tokens": 565277146.0, + "step": 21840 + }, + { + "epoch": 2.398528442784977, + "grad_norm": 2.177565813064575, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7666985988616943, + "num_tokens": 565303180.0, + "step": 21841 + }, + { + "epoch": 2.3986382604875907, + "grad_norm": 2.2896173000335693, + "learning_rate": 5e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7727415561676025, + "num_tokens": 565325407.0, + "step": 21842 + }, + { + "epoch": 2.3987480781902044, + "grad_norm": 1.9686540365219116, + "learning_rate": 5e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7522014379501343, + "num_tokens": 565353423.0, + "step": 21843 + }, + { + "epoch": 2.398857895892818, + "grad_norm": 2.1565017700195312, + "learning_rate": 5e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.7765664458274841, + "num_tokens": 565377501.0, + "step": 21844 + }, + { + "epoch": 2.3989677135954315, + "grad_norm": 2.252197504043579, + "learning_rate": 5e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.772598147392273, + "num_tokens": 565401389.0, + "step": 21845 + }, + { + "epoch": 2.3990775312980452, + "grad_norm": 1.8326927423477173, + "learning_rate": 5e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7374458312988281, + "num_tokens": 565432732.0, + "step": 21846 + }, + { + "epoch": 2.399187349000659, + "grad_norm": 2.3152527809143066, + "learning_rate": 5e-06, + "loss": 0.6952, + "mean_token_accuracy": 0.7718242406845093, + "num_tokens": 565453920.0, + "step": 21847 + }, + { + "epoch": 2.3992971667032728, + "grad_norm": 1.9394800662994385, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.765201210975647, + "num_tokens": 565482802.0, + "step": 21848 + }, + { + "epoch": 2.399406984405886, + "grad_norm": 2.105884552001953, + "learning_rate": 5e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.7706810235977173, + "num_tokens": 565504936.0, + "step": 21849 + }, + { + "epoch": 2.3995168021085, + "grad_norm": 1.9753118753433228, + "learning_rate": 5e-06, + "loss": 0.78, + "mean_token_accuracy": 0.7459807991981506, + "num_tokens": 565534969.0, + "step": 21850 + }, + { + "epoch": 2.3996266198111136, + "grad_norm": 2.1743876934051514, + "learning_rate": 5e-06, + "loss": 0.7262, + "mean_token_accuracy": 0.76004958152771, + "num_tokens": 565558571.0, + "step": 21851 + }, + { + "epoch": 2.3997364375137273, + "grad_norm": 2.1896708011627197, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7659835815429688, + "num_tokens": 565582586.0, + "step": 21852 + }, + { + "epoch": 2.3998462552163407, + "grad_norm": 2.160212755203247, + "learning_rate": 5e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7687292695045471, + "num_tokens": 565608884.0, + "step": 21853 + }, + { + "epoch": 2.3999560729189544, + "grad_norm": 2.0563278198242188, + "learning_rate": 5e-06, + "loss": 0.6787, + "mean_token_accuracy": 0.7885595560073853, + "num_tokens": 565634221.0, + "step": 21854 + }, + { + "epoch": 2.400065890621568, + "grad_norm": 2.3229877948760986, + "learning_rate": 5e-06, + "loss": 0.6441, + "mean_token_accuracy": 0.7824076414108276, + "num_tokens": 565656371.0, + "step": 21855 + }, + { + "epoch": 2.400175708324182, + "grad_norm": 1.9345495700836182, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7548260688781738, + "num_tokens": 565686417.0, + "step": 21856 + }, + { + "epoch": 2.4002855260267957, + "grad_norm": 1.917084813117981, + "learning_rate": 5e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7610830068588257, + "num_tokens": 565717851.0, + "step": 21857 + }, + { + "epoch": 2.400395343729409, + "grad_norm": 1.9312732219696045, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7794477939605713, + "num_tokens": 565747245.0, + "step": 21858 + }, + { + "epoch": 2.4005051614320227, + "grad_norm": 2.016395330429077, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7736538648605347, + "num_tokens": 565775929.0, + "step": 21859 + }, + { + "epoch": 2.4006149791346365, + "grad_norm": 1.80894935131073, + "learning_rate": 5e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7570722103118896, + "num_tokens": 565809883.0, + "step": 21860 + }, + { + "epoch": 2.4007247968372503, + "grad_norm": 2.2738873958587646, + "learning_rate": 5e-06, + "loss": 0.6905, + "mean_token_accuracy": 0.7766902446746826, + "num_tokens": 565831907.0, + "step": 21861 + }, + { + "epoch": 2.400834614539864, + "grad_norm": 1.9838106632232666, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7555649280548096, + "num_tokens": 565859860.0, + "step": 21862 + }, + { + "epoch": 2.4009444322424773, + "grad_norm": 2.1012656688690186, + "learning_rate": 5e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7642889022827148, + "num_tokens": 565884246.0, + "step": 21863 + }, + { + "epoch": 2.401054249945091, + "grad_norm": 1.9172258377075195, + "learning_rate": 5e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7647994160652161, + "num_tokens": 565914760.0, + "step": 21864 + }, + { + "epoch": 2.401164067647705, + "grad_norm": 2.2020716667175293, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7728841304779053, + "num_tokens": 565937434.0, + "step": 21865 + }, + { + "epoch": 2.4012738853503186, + "grad_norm": 2.5184481143951416, + "learning_rate": 5e-06, + "loss": 0.6178, + "mean_token_accuracy": 0.7913601398468018, + "num_tokens": 565955204.0, + "step": 21866 + }, + { + "epoch": 2.4013837030529324, + "grad_norm": 2.061786413192749, + "learning_rate": 5e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7541666030883789, + "num_tokens": 565982957.0, + "step": 21867 + }, + { + "epoch": 2.4014935207555457, + "grad_norm": 2.2448935508728027, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7556995153427124, + "num_tokens": 566006049.0, + "step": 21868 + }, + { + "epoch": 2.4016033384581594, + "grad_norm": 2.4108364582061768, + "learning_rate": 5e-06, + "loss": 0.6616, + "mean_token_accuracy": 0.7823665142059326, + "num_tokens": 566027392.0, + "step": 21869 + }, + { + "epoch": 2.401713156160773, + "grad_norm": 2.197166681289673, + "learning_rate": 5e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7444103956222534, + "num_tokens": 566054605.0, + "step": 21870 + }, + { + "epoch": 2.401822973863387, + "grad_norm": 2.4885408878326416, + "learning_rate": 5e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7733809351921082, + "num_tokens": 566074876.0, + "step": 21871 + }, + { + "epoch": 2.4019327915660003, + "grad_norm": 2.083423614501953, + "learning_rate": 5e-06, + "loss": 0.6335, + "mean_token_accuracy": 0.781252384185791, + "num_tokens": 566103317.0, + "step": 21872 + }, + { + "epoch": 2.402042609268614, + "grad_norm": 2.217332124710083, + "learning_rate": 5e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7714884281158447, + "num_tokens": 566126136.0, + "step": 21873 + }, + { + "epoch": 2.4021524269712278, + "grad_norm": 2.0280168056488037, + "learning_rate": 5e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7555904984474182, + "num_tokens": 566155729.0, + "step": 21874 + }, + { + "epoch": 2.4022622446738415, + "grad_norm": 2.059677839279175, + "learning_rate": 5e-06, + "loss": 0.6527, + "mean_token_accuracy": 0.7864013314247131, + "num_tokens": 566182346.0, + "step": 21875 + }, + { + "epoch": 2.402372062376455, + "grad_norm": 2.0125656127929688, + "learning_rate": 5e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7483216524124146, + "num_tokens": 566209346.0, + "step": 21876 + }, + { + "epoch": 2.4024818800790686, + "grad_norm": 1.7959140539169312, + "learning_rate": 5e-06, + "loss": 0.6621, + "mean_token_accuracy": 0.774533212184906, + "num_tokens": 566239947.0, + "step": 21877 + }, + { + "epoch": 2.4025916977816824, + "grad_norm": 2.1780407428741455, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.7708454132080078, + "num_tokens": 566265313.0, + "step": 21878 + }, + { + "epoch": 2.402701515484296, + "grad_norm": 1.9888780117034912, + "learning_rate": 5e-06, + "loss": 0.6511, + "mean_token_accuracy": 0.7796666622161865, + "num_tokens": 566292772.0, + "step": 21879 + }, + { + "epoch": 2.40281133318691, + "grad_norm": 2.080390453338623, + "learning_rate": 5e-06, + "loss": 0.6927, + "mean_token_accuracy": 0.7688280940055847, + "num_tokens": 566318081.0, + "step": 21880 + }, + { + "epoch": 2.402921150889523, + "grad_norm": 1.980591058731079, + "learning_rate": 5e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.7855875492095947, + "num_tokens": 566346087.0, + "step": 21881 + }, + { + "epoch": 2.403030968592137, + "grad_norm": 2.2664363384246826, + "learning_rate": 5e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.7814526557922363, + "num_tokens": 566369216.0, + "step": 21882 + }, + { + "epoch": 2.4031407862947507, + "grad_norm": 2.1447393894195557, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7631537914276123, + "num_tokens": 566396294.0, + "step": 21883 + }, + { + "epoch": 2.4032506039973645, + "grad_norm": 2.0876176357269287, + "learning_rate": 5e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7586594820022583, + "num_tokens": 566422981.0, + "step": 21884 + }, + { + "epoch": 2.403360421699978, + "grad_norm": 2.0850493907928467, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7707688808441162, + "num_tokens": 566446462.0, + "step": 21885 + }, + { + "epoch": 2.4034702394025915, + "grad_norm": 2.2720954418182373, + "learning_rate": 5e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7615504264831543, + "num_tokens": 566469992.0, + "step": 21886 + }, + { + "epoch": 2.4035800571052053, + "grad_norm": 2.0348916053771973, + "learning_rate": 5e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7504500150680542, + "num_tokens": 566496229.0, + "step": 21887 + }, + { + "epoch": 2.403689874807819, + "grad_norm": 1.8127951622009277, + "learning_rate": 5e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7596278190612793, + "num_tokens": 566526858.0, + "step": 21888 + }, + { + "epoch": 2.403799692510433, + "grad_norm": 2.076256513595581, + "learning_rate": 5e-06, + "loss": 0.615, + "mean_token_accuracy": 0.7973670363426208, + "num_tokens": 566551468.0, + "step": 21889 + }, + { + "epoch": 2.4039095102130466, + "grad_norm": 1.9887487888336182, + "learning_rate": 5e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7676891088485718, + "num_tokens": 566579368.0, + "step": 21890 + }, + { + "epoch": 2.40401932791566, + "grad_norm": 2.137942314147949, + "learning_rate": 5e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7625303268432617, + "num_tokens": 566601854.0, + "step": 21891 + }, + { + "epoch": 2.4041291456182736, + "grad_norm": 2.060976982116699, + "learning_rate": 5e-06, + "loss": 0.649, + "mean_token_accuracy": 0.7845581769943237, + "num_tokens": 566625391.0, + "step": 21892 + }, + { + "epoch": 2.4042389633208874, + "grad_norm": 1.8784795999526978, + "learning_rate": 5e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7598867416381836, + "num_tokens": 566655104.0, + "step": 21893 + }, + { + "epoch": 2.404348781023501, + "grad_norm": 2.057912826538086, + "learning_rate": 5e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7735779285430908, + "num_tokens": 566679179.0, + "step": 21894 + }, + { + "epoch": 2.404458598726115, + "grad_norm": 1.915266513824463, + "learning_rate": 5e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.7500700950622559, + "num_tokens": 566711266.0, + "step": 21895 + }, + { + "epoch": 2.404568416428728, + "grad_norm": 2.089897632598877, + "learning_rate": 5e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7695570588111877, + "num_tokens": 566736199.0, + "step": 21896 + }, + { + "epoch": 2.404678234131342, + "grad_norm": 2.2222163677215576, + "learning_rate": 5e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7756087183952332, + "num_tokens": 566757415.0, + "step": 21897 + }, + { + "epoch": 2.4047880518339557, + "grad_norm": 2.2387194633483887, + "learning_rate": 5e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7786122560501099, + "num_tokens": 566779601.0, + "step": 21898 + }, + { + "epoch": 2.4048978695365695, + "grad_norm": 2.0400614738464355, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7511179447174072, + "num_tokens": 566810637.0, + "step": 21899 + }, + { + "epoch": 2.405007687239183, + "grad_norm": 1.964540958404541, + "learning_rate": 5e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7731512188911438, + "num_tokens": 566840348.0, + "step": 21900 + }, + { + "epoch": 2.4051175049417965, + "grad_norm": 1.964487075805664, + "learning_rate": 5e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7484583854675293, + "num_tokens": 566870836.0, + "step": 21901 + }, + { + "epoch": 2.4052273226444103, + "grad_norm": 2.04191255569458, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7444444894790649, + "num_tokens": 566899471.0, + "step": 21902 + }, + { + "epoch": 2.405337140347024, + "grad_norm": 1.9254002571105957, + "learning_rate": 5e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7431888580322266, + "num_tokens": 566930915.0, + "step": 21903 + }, + { + "epoch": 2.4054469580496374, + "grad_norm": 2.2490766048431396, + "learning_rate": 5e-06, + "loss": 0.774, + "mean_token_accuracy": 0.75716233253479, + "num_tokens": 566955566.0, + "step": 21904 + }, + { + "epoch": 2.405556775752251, + "grad_norm": 1.9935882091522217, + "learning_rate": 5e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7490648031234741, + "num_tokens": 566983849.0, + "step": 21905 + }, + { + "epoch": 2.405666593454865, + "grad_norm": 1.995181918144226, + "learning_rate": 5e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7533243894577026, + "num_tokens": 567013271.0, + "step": 21906 + }, + { + "epoch": 2.4057764111574786, + "grad_norm": 2.5503106117248535, + "learning_rate": 5e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7647212147712708, + "num_tokens": 567034828.0, + "step": 21907 + }, + { + "epoch": 2.4058862288600924, + "grad_norm": 2.463212728500366, + "learning_rate": 5e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.7738244533538818, + "num_tokens": 567057764.0, + "step": 21908 + }, + { + "epoch": 2.4059960465627057, + "grad_norm": 2.007997751235962, + "learning_rate": 5e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7640289068222046, + "num_tokens": 567082798.0, + "step": 21909 + }, + { + "epoch": 2.4061058642653195, + "grad_norm": 2.1006364822387695, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7506036758422852, + "num_tokens": 567107162.0, + "step": 21910 + }, + { + "epoch": 2.4062156819679332, + "grad_norm": 2.0001866817474365, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7606825828552246, + "num_tokens": 567132537.0, + "step": 21911 + }, + { + "epoch": 2.406325499670547, + "grad_norm": 2.08217191696167, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7442911267280579, + "num_tokens": 567158233.0, + "step": 21912 + }, + { + "epoch": 2.4064353173731607, + "grad_norm": 1.8436131477355957, + "learning_rate": 5e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.745854377746582, + "num_tokens": 567191471.0, + "step": 21913 + }, + { + "epoch": 2.406545135075774, + "grad_norm": 2.0297205448150635, + "learning_rate": 5e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7776573896408081, + "num_tokens": 567218798.0, + "step": 21914 + }, + { + "epoch": 2.406654952778388, + "grad_norm": 2.146127223968506, + "learning_rate": 5e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7479737997055054, + "num_tokens": 567246208.0, + "step": 21915 + }, + { + "epoch": 2.4067647704810016, + "grad_norm": 2.3119261264801025, + "learning_rate": 5e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.774816632270813, + "num_tokens": 567268034.0, + "step": 21916 + }, + { + "epoch": 2.4068745881836153, + "grad_norm": 2.04469895362854, + "learning_rate": 5e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7697964310646057, + "num_tokens": 567292073.0, + "step": 21917 + }, + { + "epoch": 2.406984405886229, + "grad_norm": 2.0320820808410645, + "learning_rate": 5e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7649765014648438, + "num_tokens": 567317116.0, + "step": 21918 + }, + { + "epoch": 2.4070942235888424, + "grad_norm": 1.9989522695541382, + "learning_rate": 5e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7740815877914429, + "num_tokens": 567343123.0, + "step": 21919 + }, + { + "epoch": 2.407204041291456, + "grad_norm": 1.9089372158050537, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7661584615707397, + "num_tokens": 567371316.0, + "step": 21920 + }, + { + "epoch": 2.40731385899407, + "grad_norm": 2.2707600593566895, + "learning_rate": 5e-06, + "loss": 0.5842, + "mean_token_accuracy": 0.803723156452179, + "num_tokens": 567391964.0, + "step": 21921 + }, + { + "epoch": 2.4074236766966837, + "grad_norm": 2.201144218444824, + "learning_rate": 5e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7701877355575562, + "num_tokens": 567416310.0, + "step": 21922 + }, + { + "epoch": 2.407533494399297, + "grad_norm": 1.972817301750183, + "learning_rate": 5e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.753671407699585, + "num_tokens": 567443700.0, + "step": 21923 + }, + { + "epoch": 2.4076433121019107, + "grad_norm": 2.210982084274292, + "learning_rate": 5e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7500181198120117, + "num_tokens": 567467750.0, + "step": 21924 + }, + { + "epoch": 2.4077531298045245, + "grad_norm": 2.215268135070801, + "learning_rate": 5e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.7802066802978516, + "num_tokens": 567492196.0, + "step": 21925 + }, + { + "epoch": 2.4078629475071383, + "grad_norm": 2.005373239517212, + "learning_rate": 5e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7489577531814575, + "num_tokens": 567520989.0, + "step": 21926 + }, + { + "epoch": 2.4079727652097516, + "grad_norm": 2.0001113414764404, + "learning_rate": 5e-06, + "loss": 0.6301, + "mean_token_accuracy": 0.7845274209976196, + "num_tokens": 567546832.0, + "step": 21927 + }, + { + "epoch": 2.4080825829123653, + "grad_norm": 2.0054378509521484, + "learning_rate": 5e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7756563425064087, + "num_tokens": 567573812.0, + "step": 21928 + }, + { + "epoch": 2.408192400614979, + "grad_norm": 1.9443995952606201, + "learning_rate": 5e-06, + "loss": 0.6467, + "mean_token_accuracy": 0.7802126407623291, + "num_tokens": 567602171.0, + "step": 21929 + }, + { + "epoch": 2.408302218317593, + "grad_norm": 2.220150947570801, + "learning_rate": 5e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7405154705047607, + "num_tokens": 567629309.0, + "step": 21930 + }, + { + "epoch": 2.4084120360202066, + "grad_norm": 2.001779079437256, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7708353996276855, + "num_tokens": 567657813.0, + "step": 21931 + }, + { + "epoch": 2.40852185372282, + "grad_norm": 1.846636176109314, + "learning_rate": 5e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7411609888076782, + "num_tokens": 567690026.0, + "step": 21932 + }, + { + "epoch": 2.4086316714254337, + "grad_norm": 2.153676748275757, + "learning_rate": 5e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7671077251434326, + "num_tokens": 567714312.0, + "step": 21933 + }, + { + "epoch": 2.4087414891280474, + "grad_norm": 1.9431074857711792, + "learning_rate": 5e-06, + "loss": 0.662, + "mean_token_accuracy": 0.781872570514679, + "num_tokens": 567741630.0, + "step": 21934 + }, + { + "epoch": 2.408851306830661, + "grad_norm": 1.956261157989502, + "learning_rate": 5e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.761411190032959, + "num_tokens": 567770375.0, + "step": 21935 + }, + { + "epoch": 2.408961124533275, + "grad_norm": 2.2631914615631104, + "learning_rate": 5e-06, + "loss": 0.6349, + "mean_token_accuracy": 0.7827221155166626, + "num_tokens": 567793073.0, + "step": 21936 + }, + { + "epoch": 2.4090709422358882, + "grad_norm": 1.945689082145691, + "learning_rate": 5e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7592830061912537, + "num_tokens": 567822167.0, + "step": 21937 + }, + { + "epoch": 2.409180759938502, + "grad_norm": 2.11430025100708, + "learning_rate": 5e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.7765803337097168, + "num_tokens": 567846346.0, + "step": 21938 + }, + { + "epoch": 2.4092905776411158, + "grad_norm": 2.149747610092163, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7635501623153687, + "num_tokens": 567870855.0, + "step": 21939 + }, + { + "epoch": 2.4094003953437295, + "grad_norm": 2.3467907905578613, + "learning_rate": 5e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7695435881614685, + "num_tokens": 567893207.0, + "step": 21940 + }, + { + "epoch": 2.4095102130463433, + "grad_norm": 2.053617238998413, + "learning_rate": 5e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7473348379135132, + "num_tokens": 567920154.0, + "step": 21941 + }, + { + "epoch": 2.4096200307489566, + "grad_norm": 2.444495916366577, + "learning_rate": 5e-06, + "loss": 0.6142, + "mean_token_accuracy": 0.7890433669090271, + "num_tokens": 567938529.0, + "step": 21942 + }, + { + "epoch": 2.4097298484515703, + "grad_norm": 1.8957064151763916, + "learning_rate": 5e-06, + "loss": 0.768, + "mean_token_accuracy": 0.749030351638794, + "num_tokens": 567968579.0, + "step": 21943 + }, + { + "epoch": 2.409839666154184, + "grad_norm": 2.2120778560638428, + "learning_rate": 5e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7756859064102173, + "num_tokens": 567990696.0, + "step": 21944 + }, + { + "epoch": 2.409949483856798, + "grad_norm": 1.945091724395752, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7631023526191711, + "num_tokens": 568018434.0, + "step": 21945 + }, + { + "epoch": 2.4100593015594116, + "grad_norm": 1.8345141410827637, + "learning_rate": 5e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7494663000106812, + "num_tokens": 568048101.0, + "step": 21946 + }, + { + "epoch": 2.410169119262025, + "grad_norm": 2.2946560382843018, + "learning_rate": 5e-06, + "loss": 0.6698, + "mean_token_accuracy": 0.7738889455795288, + "num_tokens": 568068722.0, + "step": 21947 + }, + { + "epoch": 2.4102789369646387, + "grad_norm": 1.9698599576950073, + "learning_rate": 5e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7679017782211304, + "num_tokens": 568097871.0, + "step": 21948 + }, + { + "epoch": 2.4103887546672524, + "grad_norm": 1.9697198867797852, + "learning_rate": 5e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7453660368919373, + "num_tokens": 568125696.0, + "step": 21949 + }, + { + "epoch": 2.410498572369866, + "grad_norm": 1.984035611152649, + "learning_rate": 5e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7611875534057617, + "num_tokens": 568152426.0, + "step": 21950 + }, + { + "epoch": 2.4106083900724795, + "grad_norm": 2.096505641937256, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7541855573654175, + "num_tokens": 568178087.0, + "step": 21951 + }, + { + "epoch": 2.4107182077750933, + "grad_norm": 1.8436850309371948, + "learning_rate": 5e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7597790956497192, + "num_tokens": 568207520.0, + "step": 21952 + }, + { + "epoch": 2.410828025477707, + "grad_norm": 1.9322876930236816, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7630513906478882, + "num_tokens": 568233880.0, + "step": 21953 + }, + { + "epoch": 2.410937843180321, + "grad_norm": 1.9593979120254517, + "learning_rate": 5e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7574061155319214, + "num_tokens": 568262657.0, + "step": 21954 + }, + { + "epoch": 2.411047660882934, + "grad_norm": 2.0036721229553223, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7561221718788147, + "num_tokens": 568290509.0, + "step": 21955 + }, + { + "epoch": 2.411157478585548, + "grad_norm": 2.312791347503662, + "learning_rate": 5e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7730984687805176, + "num_tokens": 568314362.0, + "step": 21956 + }, + { + "epoch": 2.4112672962881616, + "grad_norm": 1.9294350147247314, + "learning_rate": 5e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.7711008787155151, + "num_tokens": 568342626.0, + "step": 21957 + }, + { + "epoch": 2.4113771139907754, + "grad_norm": 2.110574960708618, + "learning_rate": 5e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.747024416923523, + "num_tokens": 568369269.0, + "step": 21958 + }, + { + "epoch": 2.411486931693389, + "grad_norm": 2.3677408695220947, + "learning_rate": 5e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7809915542602539, + "num_tokens": 568388554.0, + "step": 21959 + }, + { + "epoch": 2.4115967493960024, + "grad_norm": 2.0032973289489746, + "learning_rate": 5e-06, + "loss": 0.6169, + "mean_token_accuracy": 0.7957221269607544, + "num_tokens": 568414772.0, + "step": 21960 + }, + { + "epoch": 2.411706567098616, + "grad_norm": 1.7855640649795532, + "learning_rate": 5e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7424220442771912, + "num_tokens": 568450687.0, + "step": 21961 + }, + { + "epoch": 2.41181638480123, + "grad_norm": 2.002195358276367, + "learning_rate": 5e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7443657517433167, + "num_tokens": 568479308.0, + "step": 21962 + }, + { + "epoch": 2.4119262025038437, + "grad_norm": 2.063913345336914, + "learning_rate": 5e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7494637966156006, + "num_tokens": 568506952.0, + "step": 21963 + }, + { + "epoch": 2.4120360202064575, + "grad_norm": 2.05424427986145, + "learning_rate": 5e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7572898864746094, + "num_tokens": 568535172.0, + "step": 21964 + }, + { + "epoch": 2.412145837909071, + "grad_norm": 2.01235032081604, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7560084462165833, + "num_tokens": 568563593.0, + "step": 21965 + }, + { + "epoch": 2.4122556556116845, + "grad_norm": 1.958775281906128, + "learning_rate": 5e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7562146186828613, + "num_tokens": 568593821.0, + "step": 21966 + }, + { + "epoch": 2.4123654733142983, + "grad_norm": 2.096113443374634, + "learning_rate": 5e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7596850991249084, + "num_tokens": 568621885.0, + "step": 21967 + }, + { + "epoch": 2.412475291016912, + "grad_norm": 2.1179099082946777, + "learning_rate": 5e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.768241286277771, + "num_tokens": 568648212.0, + "step": 21968 + }, + { + "epoch": 2.412585108719526, + "grad_norm": 2.1663057804107666, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7755059599876404, + "num_tokens": 568672501.0, + "step": 21969 + }, + { + "epoch": 2.412694926422139, + "grad_norm": 2.0959324836730957, + "learning_rate": 5e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7640336751937866, + "num_tokens": 568699323.0, + "step": 21970 + }, + { + "epoch": 2.412804744124753, + "grad_norm": 1.9823591709136963, + "learning_rate": 5e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7590599060058594, + "num_tokens": 568728974.0, + "step": 21971 + }, + { + "epoch": 2.4129145618273666, + "grad_norm": 2.0816049575805664, + "learning_rate": 5e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7637307047843933, + "num_tokens": 568757760.0, + "step": 21972 + }, + { + "epoch": 2.4130243795299804, + "grad_norm": 1.9089933633804321, + "learning_rate": 5e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7516712546348572, + "num_tokens": 568787451.0, + "step": 21973 + }, + { + "epoch": 2.4131341972325937, + "grad_norm": 1.9278535842895508, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7683054804801941, + "num_tokens": 568814477.0, + "step": 21974 + }, + { + "epoch": 2.4132440149352075, + "grad_norm": 1.8674348592758179, + "learning_rate": 5e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.7783031463623047, + "num_tokens": 568845607.0, + "step": 21975 + }, + { + "epoch": 2.413353832637821, + "grad_norm": 2.4085981845855713, + "learning_rate": 5e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7707688808441162, + "num_tokens": 568865637.0, + "step": 21976 + }, + { + "epoch": 2.413463650340435, + "grad_norm": 2.0428340435028076, + "learning_rate": 5e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7599860429763794, + "num_tokens": 568892477.0, + "step": 21977 + }, + { + "epoch": 2.4135734680430487, + "grad_norm": 2.3985788822174072, + "learning_rate": 5e-06, + "loss": 0.6443, + "mean_token_accuracy": 0.782832145690918, + "num_tokens": 568914201.0, + "step": 21978 + }, + { + "epoch": 2.413683285745662, + "grad_norm": 2.297262191772461, + "learning_rate": 5e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.7891958355903625, + "num_tokens": 568938530.0, + "step": 21979 + }, + { + "epoch": 2.413793103448276, + "grad_norm": 2.035649061203003, + "learning_rate": 5e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7734955549240112, + "num_tokens": 568967043.0, + "step": 21980 + }, + { + "epoch": 2.4139029211508896, + "grad_norm": 2.187586545944214, + "learning_rate": 5e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7452085614204407, + "num_tokens": 568995299.0, + "step": 21981 + }, + { + "epoch": 2.4140127388535033, + "grad_norm": 1.934163212776184, + "learning_rate": 5e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7586128115653992, + "num_tokens": 569026169.0, + "step": 21982 + }, + { + "epoch": 2.4141225565561166, + "grad_norm": 2.066537857055664, + "learning_rate": 5e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7511990070343018, + "num_tokens": 569053757.0, + "step": 21983 + }, + { + "epoch": 2.4142323742587304, + "grad_norm": 1.8829610347747803, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7610605955123901, + "num_tokens": 569084885.0, + "step": 21984 + }, + { + "epoch": 2.414342191961344, + "grad_norm": 2.1152560710906982, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7676827907562256, + "num_tokens": 569111026.0, + "step": 21985 + }, + { + "epoch": 2.414452009663958, + "grad_norm": 2.0313808917999268, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7504452466964722, + "num_tokens": 569138917.0, + "step": 21986 + }, + { + "epoch": 2.4145618273665717, + "grad_norm": 2.079545497894287, + "learning_rate": 5e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7648886442184448, + "num_tokens": 569166070.0, + "step": 21987 + }, + { + "epoch": 2.414671645069185, + "grad_norm": 1.8419297933578491, + "learning_rate": 5e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7544068098068237, + "num_tokens": 569198399.0, + "step": 21988 + }, + { + "epoch": 2.4147814627717987, + "grad_norm": 2.2964046001434326, + "learning_rate": 5e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.7935972213745117, + "num_tokens": 569219725.0, + "step": 21989 + }, + { + "epoch": 2.4148912804744125, + "grad_norm": 2.3244130611419678, + "learning_rate": 5e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7726860046386719, + "num_tokens": 569243249.0, + "step": 21990 + }, + { + "epoch": 2.4150010981770262, + "grad_norm": 2.5574350357055664, + "learning_rate": 5e-06, + "loss": 0.594, + "mean_token_accuracy": 0.7970697283744812, + "num_tokens": 569261231.0, + "step": 21991 + }, + { + "epoch": 2.41511091587964, + "grad_norm": 2.2767999172210693, + "learning_rate": 5e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.7837154865264893, + "num_tokens": 569283907.0, + "step": 21992 + }, + { + "epoch": 2.4152207335822533, + "grad_norm": 2.164095878601074, + "learning_rate": 5e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.7971347570419312, + "num_tokens": 569306285.0, + "step": 21993 + }, + { + "epoch": 2.415330551284867, + "grad_norm": 2.063262939453125, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7716270089149475, + "num_tokens": 569331735.0, + "step": 21994 + }, + { + "epoch": 2.415440368987481, + "grad_norm": 2.197603940963745, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7719964981079102, + "num_tokens": 569355911.0, + "step": 21995 + }, + { + "epoch": 2.4155501866900946, + "grad_norm": 2.040895700454712, + "learning_rate": 5e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.7755522727966309, + "num_tokens": 569381084.0, + "step": 21996 + }, + { + "epoch": 2.4156600043927083, + "grad_norm": 2.2980990409851074, + "learning_rate": 5e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7803740501403809, + "num_tokens": 569403380.0, + "step": 21997 + }, + { + "epoch": 2.4157698220953217, + "grad_norm": 2.041644811630249, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7705914974212646, + "num_tokens": 569430577.0, + "step": 21998 + }, + { + "epoch": 2.4158796397979354, + "grad_norm": 2.049215078353882, + "learning_rate": 5e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7682427763938904, + "num_tokens": 569460158.0, + "step": 21999 + }, + { + "epoch": 2.415989457500549, + "grad_norm": 2.1824166774749756, + "learning_rate": 5e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7720924019813538, + "num_tokens": 569484009.0, + "step": 22000 + }, + { + "epoch": 2.416099275203163, + "grad_norm": 1.9917576313018799, + "learning_rate": 5e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7667396068572998, + "num_tokens": 569511678.0, + "step": 22001 + }, + { + "epoch": 2.4162090929057762, + "grad_norm": 2.1078176498413086, + "learning_rate": 5e-06, + "loss": 0.6323, + "mean_token_accuracy": 0.7915664315223694, + "num_tokens": 569535988.0, + "step": 22002 + }, + { + "epoch": 2.41631891060839, + "grad_norm": 1.9168012142181396, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7669661045074463, + "num_tokens": 569564260.0, + "step": 22003 + }, + { + "epoch": 2.4164287283110037, + "grad_norm": 2.124492883682251, + "learning_rate": 5e-06, + "loss": 0.6824, + "mean_token_accuracy": 0.7717199325561523, + "num_tokens": 569590054.0, + "step": 22004 + }, + { + "epoch": 2.4165385460136175, + "grad_norm": 2.2482621669769287, + "learning_rate": 5e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.7772665023803711, + "num_tokens": 569613546.0, + "step": 22005 + }, + { + "epoch": 2.416648363716231, + "grad_norm": 2.341724157333374, + "learning_rate": 5e-06, + "loss": 0.5863, + "mean_token_accuracy": 0.8088577389717102, + "num_tokens": 569634339.0, + "step": 22006 + }, + { + "epoch": 2.4167581814188446, + "grad_norm": 1.942896842956543, + "learning_rate": 5e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.7853205800056458, + "num_tokens": 569661711.0, + "step": 22007 + }, + { + "epoch": 2.4168679991214583, + "grad_norm": 2.0844502449035645, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7773406505584717, + "num_tokens": 569687841.0, + "step": 22008 + }, + { + "epoch": 2.416977816824072, + "grad_norm": 2.44028902053833, + "learning_rate": 5e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.7809232473373413, + "num_tokens": 569707764.0, + "step": 22009 + }, + { + "epoch": 2.417087634526686, + "grad_norm": 1.9558407068252563, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.761389970779419, + "num_tokens": 569738433.0, + "step": 22010 + }, + { + "epoch": 2.417197452229299, + "grad_norm": 2.0880980491638184, + "learning_rate": 5e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7535016536712646, + "num_tokens": 569765397.0, + "step": 22011 + }, + { + "epoch": 2.417307269931913, + "grad_norm": 2.018928050994873, + "learning_rate": 5e-06, + "loss": 0.6465, + "mean_token_accuracy": 0.7835341095924377, + "num_tokens": 569790537.0, + "step": 22012 + }, + { + "epoch": 2.4174170876345267, + "grad_norm": 2.2806267738342285, + "learning_rate": 5e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7742071151733398, + "num_tokens": 569815458.0, + "step": 22013 + }, + { + "epoch": 2.4175269053371404, + "grad_norm": 2.1422250270843506, + "learning_rate": 5e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7691477537155151, + "num_tokens": 569842010.0, + "step": 22014 + }, + { + "epoch": 2.417636723039754, + "grad_norm": 1.7883342504501343, + "learning_rate": 5e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.75070720911026, + "num_tokens": 569876846.0, + "step": 22015 + }, + { + "epoch": 2.4177465407423675, + "grad_norm": 2.254309892654419, + "learning_rate": 5e-06, + "loss": 0.6339, + "mean_token_accuracy": 0.7868340015411377, + "num_tokens": 569899690.0, + "step": 22016 + }, + { + "epoch": 2.4178563584449813, + "grad_norm": 2.0005898475646973, + "learning_rate": 5e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.7842134833335876, + "num_tokens": 569926654.0, + "step": 22017 + }, + { + "epoch": 2.417966176147595, + "grad_norm": 1.905261516571045, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7635312676429749, + "num_tokens": 569954661.0, + "step": 22018 + }, + { + "epoch": 2.4180759938502088, + "grad_norm": 2.4320266246795654, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7742607593536377, + "num_tokens": 569975826.0, + "step": 22019 + }, + { + "epoch": 2.4181858115528225, + "grad_norm": 1.977306842803955, + "learning_rate": 5e-06, + "loss": 0.7087, + "mean_token_accuracy": 0.7672499418258667, + "num_tokens": 570005133.0, + "step": 22020 + }, + { + "epoch": 2.418295629255436, + "grad_norm": 2.179818868637085, + "learning_rate": 5e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.7824850082397461, + "num_tokens": 570027575.0, + "step": 22021 + }, + { + "epoch": 2.4184054469580496, + "grad_norm": 2.106355667114258, + "learning_rate": 5e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7714439630508423, + "num_tokens": 570052651.0, + "step": 22022 + }, + { + "epoch": 2.4185152646606634, + "grad_norm": 2.0002505779266357, + "learning_rate": 5e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.770730197429657, + "num_tokens": 570078267.0, + "step": 22023 + }, + { + "epoch": 2.418625082363277, + "grad_norm": 2.1675124168395996, + "learning_rate": 5e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7629117965698242, + "num_tokens": 570102770.0, + "step": 22024 + }, + { + "epoch": 2.418734900065891, + "grad_norm": 1.914801001548767, + "learning_rate": 5e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7536613345146179, + "num_tokens": 570131017.0, + "step": 22025 + }, + { + "epoch": 2.418844717768504, + "grad_norm": 2.0168309211730957, + "learning_rate": 5e-06, + "loss": 0.6422, + "mean_token_accuracy": 0.7889307737350464, + "num_tokens": 570157185.0, + "step": 22026 + }, + { + "epoch": 2.418954535471118, + "grad_norm": 2.3002424240112305, + "learning_rate": 5e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7685906887054443, + "num_tokens": 570177452.0, + "step": 22027 + }, + { + "epoch": 2.4190643531737317, + "grad_norm": 2.2433438301086426, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7570250034332275, + "num_tokens": 570200112.0, + "step": 22028 + }, + { + "epoch": 2.4191741708763455, + "grad_norm": 2.198836326599121, + "learning_rate": 5e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7578591108322144, + "num_tokens": 570225471.0, + "step": 22029 + }, + { + "epoch": 2.4192839885789588, + "grad_norm": 2.0838379859924316, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7547228336334229, + "num_tokens": 570252907.0, + "step": 22030 + }, + { + "epoch": 2.4193938062815725, + "grad_norm": 1.9681586027145386, + "learning_rate": 5e-06, + "loss": 0.6306, + "mean_token_accuracy": 0.7853050827980042, + "num_tokens": 570278825.0, + "step": 22031 + }, + { + "epoch": 2.4195036239841863, + "grad_norm": 1.938614010810852, + "learning_rate": 5e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.773688554763794, + "num_tokens": 570306581.0, + "step": 22032 + }, + { + "epoch": 2.4196134416868, + "grad_norm": 1.9467673301696777, + "learning_rate": 5e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7502614855766296, + "num_tokens": 570336672.0, + "step": 22033 + }, + { + "epoch": 2.4197232593894134, + "grad_norm": 2.1690733432769775, + "learning_rate": 5e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7687639594078064, + "num_tokens": 570361695.0, + "step": 22034 + }, + { + "epoch": 2.419833077092027, + "grad_norm": 1.8630294799804688, + "learning_rate": 5e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.751172661781311, + "num_tokens": 570395069.0, + "step": 22035 + }, + { + "epoch": 2.419942894794641, + "grad_norm": 2.030893564224243, + "learning_rate": 5e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7761969566345215, + "num_tokens": 570423446.0, + "step": 22036 + }, + { + "epoch": 2.4200527124972546, + "grad_norm": 2.2767012119293213, + "learning_rate": 5e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7759850025177002, + "num_tokens": 570444846.0, + "step": 22037 + }, + { + "epoch": 2.4201625301998684, + "grad_norm": 2.0933127403259277, + "learning_rate": 5e-06, + "loss": 0.6592, + "mean_token_accuracy": 0.7766176462173462, + "num_tokens": 570468492.0, + "step": 22038 + }, + { + "epoch": 2.4202723479024817, + "grad_norm": 2.091646432876587, + "learning_rate": 5e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.764019250869751, + "num_tokens": 570492617.0, + "step": 22039 + }, + { + "epoch": 2.4203821656050954, + "grad_norm": 1.9459965229034424, + "learning_rate": 5e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7406265735626221, + "num_tokens": 570521893.0, + "step": 22040 + }, + { + "epoch": 2.420491983307709, + "grad_norm": 1.8745265007019043, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7502938508987427, + "num_tokens": 570556142.0, + "step": 22041 + }, + { + "epoch": 2.420601801010323, + "grad_norm": 2.149014949798584, + "learning_rate": 5e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.7866950631141663, + "num_tokens": 570579595.0, + "step": 22042 + }, + { + "epoch": 2.4207116187129367, + "grad_norm": 2.049576997756958, + "learning_rate": 5e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.757579505443573, + "num_tokens": 570608131.0, + "step": 22043 + }, + { + "epoch": 2.42082143641555, + "grad_norm": 1.994764804840088, + "learning_rate": 5e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7507336735725403, + "num_tokens": 570634596.0, + "step": 22044 + }, + { + "epoch": 2.420931254118164, + "grad_norm": 1.9644355773925781, + "learning_rate": 5e-06, + "loss": 0.7762, + "mean_token_accuracy": 0.7413855791091919, + "num_tokens": 570663950.0, + "step": 22045 + }, + { + "epoch": 2.4210410718207775, + "grad_norm": 2.0520830154418945, + "learning_rate": 5e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7676540613174438, + "num_tokens": 570688415.0, + "step": 22046 + }, + { + "epoch": 2.4211508895233913, + "grad_norm": 1.8646107912063599, + "learning_rate": 5e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7486682534217834, + "num_tokens": 570719795.0, + "step": 22047 + }, + { + "epoch": 2.421260707226005, + "grad_norm": 2.3732144832611084, + "learning_rate": 5e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.8006771802902222, + "num_tokens": 570737987.0, + "step": 22048 + }, + { + "epoch": 2.4213705249286184, + "grad_norm": 2.2891221046447754, + "learning_rate": 5e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.7912406921386719, + "num_tokens": 570760898.0, + "step": 22049 + }, + { + "epoch": 2.421480342631232, + "grad_norm": 2.308326244354248, + "learning_rate": 5e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7747699022293091, + "num_tokens": 570783044.0, + "step": 22050 + }, + { + "epoch": 2.421590160333846, + "grad_norm": 2.4113082885742188, + "learning_rate": 5e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7557986974716187, + "num_tokens": 570803051.0, + "step": 22051 + }, + { + "epoch": 2.4216999780364596, + "grad_norm": 1.991135597229004, + "learning_rate": 5e-06, + "loss": 0.6403, + "mean_token_accuracy": 0.7812460660934448, + "num_tokens": 570829736.0, + "step": 22052 + }, + { + "epoch": 2.421809795739073, + "grad_norm": 2.085524559020996, + "learning_rate": 5e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7679842114448547, + "num_tokens": 570854448.0, + "step": 22053 + }, + { + "epoch": 2.4219196134416867, + "grad_norm": 2.095447063446045, + "learning_rate": 5e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7378301024436951, + "num_tokens": 570882257.0, + "step": 22054 + }, + { + "epoch": 2.4220294311443005, + "grad_norm": 2.0414488315582275, + "learning_rate": 5e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7598421573638916, + "num_tokens": 570911985.0, + "step": 22055 + }, + { + "epoch": 2.4221392488469142, + "grad_norm": 1.9838464260101318, + "learning_rate": 5e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.763857364654541, + "num_tokens": 570938917.0, + "step": 22056 + }, + { + "epoch": 2.4222490665495275, + "grad_norm": 1.8938508033752441, + "learning_rate": 5e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.7720007300376892, + "num_tokens": 570967167.0, + "step": 22057 + }, + { + "epoch": 2.4223588842521413, + "grad_norm": 2.0732455253601074, + "learning_rate": 5e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7602781653404236, + "num_tokens": 570992538.0, + "step": 22058 + }, + { + "epoch": 2.422468701954755, + "grad_norm": 1.997465968132019, + "learning_rate": 5e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7405210733413696, + "num_tokens": 571019617.0, + "step": 22059 + }, + { + "epoch": 2.422578519657369, + "grad_norm": 2.3777718544006348, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7607710361480713, + "num_tokens": 571044922.0, + "step": 22060 + }, + { + "epoch": 2.4226883373599826, + "grad_norm": 2.2299671173095703, + "learning_rate": 5e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7710624933242798, + "num_tokens": 571067505.0, + "step": 22061 + }, + { + "epoch": 2.422798155062596, + "grad_norm": 2.5074799060821533, + "learning_rate": 5e-06, + "loss": 0.6432, + "mean_token_accuracy": 0.7793136239051819, + "num_tokens": 571085502.0, + "step": 22062 + }, + { + "epoch": 2.4229079727652096, + "grad_norm": 2.006481885910034, + "learning_rate": 5e-06, + "loss": 0.772, + "mean_token_accuracy": 0.746963620185852, + "num_tokens": 571114621.0, + "step": 22063 + }, + { + "epoch": 2.4230177904678234, + "grad_norm": 2.0701866149902344, + "learning_rate": 5e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.742230236530304, + "num_tokens": 571141549.0, + "step": 22064 + }, + { + "epoch": 2.423127608170437, + "grad_norm": 2.0479001998901367, + "learning_rate": 5e-06, + "loss": 0.609, + "mean_token_accuracy": 0.7936793565750122, + "num_tokens": 571166961.0, + "step": 22065 + }, + { + "epoch": 2.423237425873051, + "grad_norm": 1.9873368740081787, + "learning_rate": 5e-06, + "loss": 0.707, + "mean_token_accuracy": 0.764183759689331, + "num_tokens": 571193758.0, + "step": 22066 + }, + { + "epoch": 2.4233472435756642, + "grad_norm": 2.2011749744415283, + "learning_rate": 5e-06, + "loss": 0.668, + "mean_token_accuracy": 0.787760317325592, + "num_tokens": 571218563.0, + "step": 22067 + }, + { + "epoch": 2.423457061278278, + "grad_norm": 2.207138776779175, + "learning_rate": 5e-06, + "loss": 0.648, + "mean_token_accuracy": 0.7820184230804443, + "num_tokens": 571240711.0, + "step": 22068 + }, + { + "epoch": 2.4235668789808917, + "grad_norm": 2.1327927112579346, + "learning_rate": 5e-06, + "loss": 0.6326, + "mean_token_accuracy": 0.7899224162101746, + "num_tokens": 571263380.0, + "step": 22069 + }, + { + "epoch": 2.4236766966835055, + "grad_norm": 2.156602144241333, + "learning_rate": 5e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.770037829875946, + "num_tokens": 571287104.0, + "step": 22070 + }, + { + "epoch": 2.4237865143861193, + "grad_norm": 1.99869966506958, + "learning_rate": 5e-06, + "loss": 0.7051, + "mean_token_accuracy": 0.7660972476005554, + "num_tokens": 571313924.0, + "step": 22071 + }, + { + "epoch": 2.4238963320887326, + "grad_norm": 2.380265712738037, + "learning_rate": 5e-06, + "loss": 0.6386, + "mean_token_accuracy": 0.7907984256744385, + "num_tokens": 571334280.0, + "step": 22072 + }, + { + "epoch": 2.4240061497913463, + "grad_norm": 2.212265729904175, + "learning_rate": 5e-06, + "loss": 0.6981, + "mean_token_accuracy": 0.7658481597900391, + "num_tokens": 571357307.0, + "step": 22073 + }, + { + "epoch": 2.42411596749396, + "grad_norm": 1.932010293006897, + "learning_rate": 5e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7491865158081055, + "num_tokens": 571386664.0, + "step": 22074 + }, + { + "epoch": 2.424225785196574, + "grad_norm": 2.0896010398864746, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7609641551971436, + "num_tokens": 571414696.0, + "step": 22075 + }, + { + "epoch": 2.4243356028991876, + "grad_norm": 2.3469362258911133, + "learning_rate": 5e-06, + "loss": 0.7314, + "mean_token_accuracy": 0.76331627368927, + "num_tokens": 571436170.0, + "step": 22076 + }, + { + "epoch": 2.424445420601801, + "grad_norm": 2.1369223594665527, + "learning_rate": 5e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.7805121541023254, + "num_tokens": 571461572.0, + "step": 22077 + }, + { + "epoch": 2.4245552383044147, + "grad_norm": 2.2529149055480957, + "learning_rate": 5e-06, + "loss": 0.6027, + "mean_token_accuracy": 0.8006162643432617, + "num_tokens": 571485045.0, + "step": 22078 + }, + { + "epoch": 2.4246650560070284, + "grad_norm": 1.9190090894699097, + "learning_rate": 5e-06, + "loss": 0.714, + "mean_token_accuracy": 0.761641263961792, + "num_tokens": 571513094.0, + "step": 22079 + }, + { + "epoch": 2.424774873709642, + "grad_norm": 1.7046520709991455, + "learning_rate": 5e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.7458690404891968, + "num_tokens": 571551316.0, + "step": 22080 + }, + { + "epoch": 2.4248846914122555, + "grad_norm": 1.7958495616912842, + "learning_rate": 5e-06, + "loss": 0.6006, + "mean_token_accuracy": 0.7950716018676758, + "num_tokens": 571579737.0, + "step": 22081 + }, + { + "epoch": 2.4249945091148692, + "grad_norm": 2.238210439682007, + "learning_rate": 5e-06, + "loss": 0.6522, + "mean_token_accuracy": 0.7831934690475464, + "num_tokens": 571602201.0, + "step": 22082 + }, + { + "epoch": 2.425104326817483, + "grad_norm": 2.1810035705566406, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7565604448318481, + "num_tokens": 571629485.0, + "step": 22083 + }, + { + "epoch": 2.4252141445200968, + "grad_norm": 2.1876020431518555, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7639039158821106, + "num_tokens": 571653860.0, + "step": 22084 + }, + { + "epoch": 2.42532396222271, + "grad_norm": 2.2698025703430176, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7601054906845093, + "num_tokens": 571677178.0, + "step": 22085 + }, + { + "epoch": 2.425433779925324, + "grad_norm": 1.8515383005142212, + "learning_rate": 5e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7643673419952393, + "num_tokens": 571707062.0, + "step": 22086 + }, + { + "epoch": 2.4255435976279376, + "grad_norm": 1.8311017751693726, + "learning_rate": 5e-06, + "loss": 0.6388, + "mean_token_accuracy": 0.7875075936317444, + "num_tokens": 571737238.0, + "step": 22087 + }, + { + "epoch": 2.4256534153305513, + "grad_norm": 1.8340661525726318, + "learning_rate": 5e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7684150338172913, + "num_tokens": 571767496.0, + "step": 22088 + }, + { + "epoch": 2.425763233033165, + "grad_norm": 2.131572723388672, + "learning_rate": 5e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7340341806411743, + "num_tokens": 571795267.0, + "step": 22089 + }, + { + "epoch": 2.4258730507357784, + "grad_norm": 2.0404140949249268, + "learning_rate": 5e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7794817686080933, + "num_tokens": 571823798.0, + "step": 22090 + }, + { + "epoch": 2.425982868438392, + "grad_norm": 1.78350830078125, + "learning_rate": 5e-06, + "loss": 0.6838, + "mean_token_accuracy": 0.7718164324760437, + "num_tokens": 571854945.0, + "step": 22091 + }, + { + "epoch": 2.426092686141006, + "grad_norm": 2.242737293243408, + "learning_rate": 5e-06, + "loss": 0.6574, + "mean_token_accuracy": 0.780674934387207, + "num_tokens": 571876726.0, + "step": 22092 + }, + { + "epoch": 2.4262025038436197, + "grad_norm": 2.07387638092041, + "learning_rate": 5e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7623581290245056, + "num_tokens": 571903170.0, + "step": 22093 + }, + { + "epoch": 2.4263123215462334, + "grad_norm": 2.0182385444641113, + "learning_rate": 5e-06, + "loss": 0.6376, + "mean_token_accuracy": 0.7843101024627686, + "num_tokens": 571930629.0, + "step": 22094 + }, + { + "epoch": 2.4264221392488468, + "grad_norm": 2.0686607360839844, + "learning_rate": 5e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7688509821891785, + "num_tokens": 571954963.0, + "step": 22095 + }, + { + "epoch": 2.4265319569514605, + "grad_norm": 1.9850581884384155, + "learning_rate": 5e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7561039924621582, + "num_tokens": 571983023.0, + "step": 22096 + }, + { + "epoch": 2.4266417746540743, + "grad_norm": 2.0640921592712402, + "learning_rate": 5e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7719839215278625, + "num_tokens": 572009573.0, + "step": 22097 + }, + { + "epoch": 2.426751592356688, + "grad_norm": 2.118086576461792, + "learning_rate": 5e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7501394152641296, + "num_tokens": 572037779.0, + "step": 22098 + }, + { + "epoch": 2.426861410059302, + "grad_norm": 2.1740119457244873, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7720078825950623, + "num_tokens": 572061231.0, + "step": 22099 + }, + { + "epoch": 2.426971227761915, + "grad_norm": 1.9789507389068604, + "learning_rate": 5e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7370319962501526, + "num_tokens": 572090981.0, + "step": 22100 + }, + { + "epoch": 2.427081045464529, + "grad_norm": 1.8837213516235352, + "learning_rate": 5e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7583574056625366, + "num_tokens": 572119763.0, + "step": 22101 + }, + { + "epoch": 2.4271908631671426, + "grad_norm": 2.238908290863037, + "learning_rate": 5e-06, + "loss": 0.6721, + "mean_token_accuracy": 0.7773094177246094, + "num_tokens": 572142234.0, + "step": 22102 + }, + { + "epoch": 2.4273006808697564, + "grad_norm": 2.105574369430542, + "learning_rate": 5e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7705891728401184, + "num_tokens": 572166803.0, + "step": 22103 + }, + { + "epoch": 2.4274104985723697, + "grad_norm": 2.111945867538452, + "learning_rate": 5e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.768316924571991, + "num_tokens": 572190778.0, + "step": 22104 + }, + { + "epoch": 2.4275203162749834, + "grad_norm": 2.112312078475952, + "learning_rate": 5e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7545278072357178, + "num_tokens": 572217656.0, + "step": 22105 + }, + { + "epoch": 2.427630133977597, + "grad_norm": 2.0654799938201904, + "learning_rate": 5e-06, + "loss": 0.6687, + "mean_token_accuracy": 0.782403826713562, + "num_tokens": 572243095.0, + "step": 22106 + }, + { + "epoch": 2.427739951680211, + "grad_norm": 1.9139978885650635, + "learning_rate": 5e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.7447295188903809, + "num_tokens": 572275272.0, + "step": 22107 + }, + { + "epoch": 2.4278497693828243, + "grad_norm": 1.9294886589050293, + "learning_rate": 5e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7631563544273376, + "num_tokens": 572305902.0, + "step": 22108 + }, + { + "epoch": 2.427959587085438, + "grad_norm": 2.064556121826172, + "learning_rate": 5e-06, + "loss": 0.5854, + "mean_token_accuracy": 0.8023067712783813, + "num_tokens": 572329097.0, + "step": 22109 + }, + { + "epoch": 2.4280694047880518, + "grad_norm": 2.124382734298706, + "learning_rate": 5e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7614883780479431, + "num_tokens": 572354337.0, + "step": 22110 + }, + { + "epoch": 2.4281792224906655, + "grad_norm": 1.820369005203247, + "learning_rate": 5e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7610840797424316, + "num_tokens": 572387469.0, + "step": 22111 + }, + { + "epoch": 2.4282890401932793, + "grad_norm": 2.1614573001861572, + "learning_rate": 5e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7662728428840637, + "num_tokens": 572412965.0, + "step": 22112 + }, + { + "epoch": 2.4283988578958926, + "grad_norm": 2.2083067893981934, + "learning_rate": 5e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.7968696355819702, + "num_tokens": 572436506.0, + "step": 22113 + }, + { + "epoch": 2.4285086755985064, + "grad_norm": 2.005709409713745, + "learning_rate": 5e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7720205783843994, + "num_tokens": 572461603.0, + "step": 22114 + }, + { + "epoch": 2.42861849330112, + "grad_norm": 1.8632824420928955, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7621994018554688, + "num_tokens": 572495952.0, + "step": 22115 + }, + { + "epoch": 2.428728311003734, + "grad_norm": 2.1413352489471436, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7726515531539917, + "num_tokens": 572520401.0, + "step": 22116 + }, + { + "epoch": 2.4288381287063476, + "grad_norm": 2.0316367149353027, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7722190618515015, + "num_tokens": 572547705.0, + "step": 22117 + }, + { + "epoch": 2.428947946408961, + "grad_norm": 2.132408618927002, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7755300998687744, + "num_tokens": 572571662.0, + "step": 22118 + }, + { + "epoch": 2.4290577641115747, + "grad_norm": 1.9546717405319214, + "learning_rate": 5e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7582568526268005, + "num_tokens": 572600067.0, + "step": 22119 + }, + { + "epoch": 2.4291675818141885, + "grad_norm": 1.9586089849472046, + "learning_rate": 5e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7730497717857361, + "num_tokens": 572629546.0, + "step": 22120 + }, + { + "epoch": 2.429277399516802, + "grad_norm": 1.9049968719482422, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7636666893959045, + "num_tokens": 572659573.0, + "step": 22121 + }, + { + "epoch": 2.429387217219416, + "grad_norm": 2.310537576675415, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7606257796287537, + "num_tokens": 572680948.0, + "step": 22122 + }, + { + "epoch": 2.4294970349220293, + "grad_norm": 2.4419310092926025, + "learning_rate": 5e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7664937973022461, + "num_tokens": 572703169.0, + "step": 22123 + }, + { + "epoch": 2.429606852624643, + "grad_norm": 1.9750280380249023, + "learning_rate": 5e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.7745993137359619, + "num_tokens": 572729409.0, + "step": 22124 + }, + { + "epoch": 2.429716670327257, + "grad_norm": 1.98566472530365, + "learning_rate": 5e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7565226554870605, + "num_tokens": 572756506.0, + "step": 22125 + }, + { + "epoch": 2.4298264880298706, + "grad_norm": 2.0039315223693848, + "learning_rate": 5e-06, + "loss": 0.653, + "mean_token_accuracy": 0.7781887650489807, + "num_tokens": 572782073.0, + "step": 22126 + }, + { + "epoch": 2.4299363057324843, + "grad_norm": 2.1012661457061768, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7591643929481506, + "num_tokens": 572808233.0, + "step": 22127 + }, + { + "epoch": 2.4300461234350976, + "grad_norm": 2.1665637493133545, + "learning_rate": 5e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7542235255241394, + "num_tokens": 572832403.0, + "step": 22128 + }, + { + "epoch": 2.4301559411377114, + "grad_norm": 2.1173367500305176, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7670042514801025, + "num_tokens": 572856310.0, + "step": 22129 + }, + { + "epoch": 2.430265758840325, + "grad_norm": 1.9886956214904785, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7661745548248291, + "num_tokens": 572884177.0, + "step": 22130 + }, + { + "epoch": 2.430375576542939, + "grad_norm": 2.155158758163452, + "learning_rate": 5e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.7747139930725098, + "num_tokens": 572907455.0, + "step": 22131 + }, + { + "epoch": 2.430485394245552, + "grad_norm": 2.096440076828003, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7527294754981995, + "num_tokens": 572933592.0, + "step": 22132 + }, + { + "epoch": 2.430595211948166, + "grad_norm": 2.0486342906951904, + "learning_rate": 5e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7587757110595703, + "num_tokens": 572960868.0, + "step": 22133 + }, + { + "epoch": 2.4307050296507797, + "grad_norm": 1.9249756336212158, + "learning_rate": 5e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7769527435302734, + "num_tokens": 572991762.0, + "step": 22134 + }, + { + "epoch": 2.4308148473533935, + "grad_norm": 2.0979397296905518, + "learning_rate": 5e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7664848566055298, + "num_tokens": 573017371.0, + "step": 22135 + }, + { + "epoch": 2.430924665056007, + "grad_norm": 2.1027157306671143, + "learning_rate": 5e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7629703283309937, + "num_tokens": 573043641.0, + "step": 22136 + }, + { + "epoch": 2.4310344827586206, + "grad_norm": 1.8713932037353516, + "learning_rate": 5e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7548247575759888, + "num_tokens": 573071444.0, + "step": 22137 + }, + { + "epoch": 2.4311443004612343, + "grad_norm": 2.081200122833252, + "learning_rate": 5e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7621061205863953, + "num_tokens": 573095711.0, + "step": 22138 + }, + { + "epoch": 2.431254118163848, + "grad_norm": 1.8480931520462036, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7533953189849854, + "num_tokens": 573127679.0, + "step": 22139 + }, + { + "epoch": 2.431363935866462, + "grad_norm": 2.04929256439209, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7494878768920898, + "num_tokens": 573153511.0, + "step": 22140 + }, + { + "epoch": 2.431473753569075, + "grad_norm": 2.135340690612793, + "learning_rate": 5e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7769515514373779, + "num_tokens": 573175986.0, + "step": 22141 + }, + { + "epoch": 2.431583571271689, + "grad_norm": 1.997848629951477, + "learning_rate": 5e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7728334069252014, + "num_tokens": 573202589.0, + "step": 22142 + }, + { + "epoch": 2.4316933889743026, + "grad_norm": 2.2439475059509277, + "learning_rate": 5e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.774233341217041, + "num_tokens": 573226923.0, + "step": 22143 + }, + { + "epoch": 2.4318032066769164, + "grad_norm": 1.9537975788116455, + "learning_rate": 5e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7419734001159668, + "num_tokens": 573256835.0, + "step": 22144 + }, + { + "epoch": 2.43191302437953, + "grad_norm": 2.34354305267334, + "learning_rate": 5e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.7683879137039185, + "num_tokens": 573278700.0, + "step": 22145 + }, + { + "epoch": 2.4320228420821435, + "grad_norm": 2.140695095062256, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7664063572883606, + "num_tokens": 573303010.0, + "step": 22146 + }, + { + "epoch": 2.4321326597847572, + "grad_norm": 2.120396614074707, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7713826298713684, + "num_tokens": 573328598.0, + "step": 22147 + }, + { + "epoch": 2.432242477487371, + "grad_norm": 2.1920769214630127, + "learning_rate": 5e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7542397975921631, + "num_tokens": 573352537.0, + "step": 22148 + }, + { + "epoch": 2.4323522951899847, + "grad_norm": 2.170248508453369, + "learning_rate": 5e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7542059421539307, + "num_tokens": 573375971.0, + "step": 22149 + }, + { + "epoch": 2.4324621128925985, + "grad_norm": 2.2982335090637207, + "learning_rate": 5e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7691417336463928, + "num_tokens": 573398600.0, + "step": 22150 + }, + { + "epoch": 2.432571930595212, + "grad_norm": 2.0708699226379395, + "learning_rate": 5e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7286121845245361, + "num_tokens": 573425582.0, + "step": 22151 + }, + { + "epoch": 2.4326817482978256, + "grad_norm": 1.9950414896011353, + "learning_rate": 5e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7462018728256226, + "num_tokens": 573457315.0, + "step": 22152 + }, + { + "epoch": 2.4327915660004393, + "grad_norm": 2.2562243938446045, + "learning_rate": 5e-06, + "loss": 0.6588, + "mean_token_accuracy": 0.787651777267456, + "num_tokens": 573478610.0, + "step": 22153 + }, + { + "epoch": 2.432901383703053, + "grad_norm": 2.203662872314453, + "learning_rate": 5e-06, + "loss": 0.5896, + "mean_token_accuracy": 0.8020463585853577, + "num_tokens": 573500674.0, + "step": 22154 + }, + { + "epoch": 2.4330112014056664, + "grad_norm": 1.7893980741500854, + "learning_rate": 5e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7426255941390991, + "num_tokens": 573535835.0, + "step": 22155 + }, + { + "epoch": 2.43312101910828, + "grad_norm": 1.9240251779556274, + "learning_rate": 5e-06, + "loss": 0.6788, + "mean_token_accuracy": 0.7784209847450256, + "num_tokens": 573563587.0, + "step": 22156 + }, + { + "epoch": 2.433230836810894, + "grad_norm": 2.045189380645752, + "learning_rate": 5e-06, + "loss": 0.6711, + "mean_token_accuracy": 0.7819926738739014, + "num_tokens": 573590822.0, + "step": 22157 + }, + { + "epoch": 2.4333406545135077, + "grad_norm": 2.058305263519287, + "learning_rate": 5e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7664353847503662, + "num_tokens": 573616466.0, + "step": 22158 + }, + { + "epoch": 2.4334504722161214, + "grad_norm": 2.0819904804229736, + "learning_rate": 5e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.7758898735046387, + "num_tokens": 573641804.0, + "step": 22159 + }, + { + "epoch": 2.4335602899187347, + "grad_norm": 2.007291316986084, + "learning_rate": 5e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.773500919342041, + "num_tokens": 573667029.0, + "step": 22160 + }, + { + "epoch": 2.4336701076213485, + "grad_norm": 2.293633460998535, + "learning_rate": 5e-06, + "loss": 0.6319, + "mean_token_accuracy": 0.7941280007362366, + "num_tokens": 573689497.0, + "step": 22161 + }, + { + "epoch": 2.4337799253239623, + "grad_norm": 1.958756446838379, + "learning_rate": 5e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7437975406646729, + "num_tokens": 573718723.0, + "step": 22162 + }, + { + "epoch": 2.433889743026576, + "grad_norm": 1.7382080554962158, + "learning_rate": 5e-06, + "loss": 0.6239, + "mean_token_accuracy": 0.789085865020752, + "num_tokens": 573750832.0, + "step": 22163 + }, + { + "epoch": 2.4339995607291893, + "grad_norm": 2.214045286178589, + "learning_rate": 5e-06, + "loss": 0.759, + "mean_token_accuracy": 0.7601021528244019, + "num_tokens": 573779314.0, + "step": 22164 + }, + { + "epoch": 2.434109378431803, + "grad_norm": 1.9107589721679688, + "learning_rate": 5e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7531810998916626, + "num_tokens": 573809525.0, + "step": 22165 + }, + { + "epoch": 2.434219196134417, + "grad_norm": 1.835768699645996, + "learning_rate": 5e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7450798749923706, + "num_tokens": 573843320.0, + "step": 22166 + }, + { + "epoch": 2.4343290138370306, + "grad_norm": 2.028796672821045, + "learning_rate": 5e-06, + "loss": 0.6296, + "mean_token_accuracy": 0.7870360612869263, + "num_tokens": 573868142.0, + "step": 22167 + }, + { + "epoch": 2.4344388315396444, + "grad_norm": 2.0529942512512207, + "learning_rate": 5e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7486835718154907, + "num_tokens": 573895795.0, + "step": 22168 + }, + { + "epoch": 2.4345486492422577, + "grad_norm": 2.2644073963165283, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7638272047042847, + "num_tokens": 573919167.0, + "step": 22169 + }, + { + "epoch": 2.4346584669448714, + "grad_norm": 2.309804916381836, + "learning_rate": 5e-06, + "loss": 0.619, + "mean_token_accuracy": 0.7922077775001526, + "num_tokens": 573939552.0, + "step": 22170 + }, + { + "epoch": 2.434768284647485, + "grad_norm": 1.9883393049240112, + "learning_rate": 5e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.780661940574646, + "num_tokens": 573967154.0, + "step": 22171 + }, + { + "epoch": 2.434878102350099, + "grad_norm": 1.9984657764434814, + "learning_rate": 5e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.749412477016449, + "num_tokens": 573996852.0, + "step": 22172 + }, + { + "epoch": 2.4349879200527127, + "grad_norm": 2.131511926651001, + "learning_rate": 5e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7711572647094727, + "num_tokens": 574022018.0, + "step": 22173 + }, + { + "epoch": 2.435097737755326, + "grad_norm": 2.1797189712524414, + "learning_rate": 5e-06, + "loss": 0.65, + "mean_token_accuracy": 0.7820049524307251, + "num_tokens": 574045135.0, + "step": 22174 + }, + { + "epoch": 2.4352075554579398, + "grad_norm": 2.1094515323638916, + "learning_rate": 5e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.777484655380249, + "num_tokens": 574072510.0, + "step": 22175 + }, + { + "epoch": 2.4353173731605535, + "grad_norm": 2.007770299911499, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7347530722618103, + "num_tokens": 574104112.0, + "step": 22176 + }, + { + "epoch": 2.4354271908631673, + "grad_norm": 2.0796420574188232, + "learning_rate": 5e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7692235112190247, + "num_tokens": 574129766.0, + "step": 22177 + }, + { + "epoch": 2.435537008565781, + "grad_norm": 2.0448670387268066, + "learning_rate": 5e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7673888206481934, + "num_tokens": 574156459.0, + "step": 22178 + }, + { + "epoch": 2.4356468262683943, + "grad_norm": 1.9591294527053833, + "learning_rate": 5e-06, + "loss": 0.6836, + "mean_token_accuracy": 0.7732518911361694, + "num_tokens": 574182684.0, + "step": 22179 + }, + { + "epoch": 2.435756643971008, + "grad_norm": 2.0952978134155273, + "learning_rate": 5e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7572308778762817, + "num_tokens": 574208905.0, + "step": 22180 + }, + { + "epoch": 2.435866461673622, + "grad_norm": 2.206526756286621, + "learning_rate": 5e-06, + "loss": 0.6315, + "mean_token_accuracy": 0.793558657169342, + "num_tokens": 574229578.0, + "step": 22181 + }, + { + "epoch": 2.4359762793762356, + "grad_norm": 2.170384168624878, + "learning_rate": 5e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.748100996017456, + "num_tokens": 574253775.0, + "step": 22182 + }, + { + "epoch": 2.436086097078849, + "grad_norm": 1.9944239854812622, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7629813551902771, + "num_tokens": 574282936.0, + "step": 22183 + }, + { + "epoch": 2.4361959147814627, + "grad_norm": 2.4185261726379395, + "learning_rate": 5e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7756149768829346, + "num_tokens": 574303916.0, + "step": 22184 + }, + { + "epoch": 2.4363057324840764, + "grad_norm": 2.3648245334625244, + "learning_rate": 5e-06, + "loss": 0.6245, + "mean_token_accuracy": 0.7942148447036743, + "num_tokens": 574325223.0, + "step": 22185 + }, + { + "epoch": 2.43641555018669, + "grad_norm": 2.179490804672241, + "learning_rate": 5e-06, + "loss": 0.6308, + "mean_token_accuracy": 0.7865361571311951, + "num_tokens": 574349897.0, + "step": 22186 + }, + { + "epoch": 2.4365253678893035, + "grad_norm": 1.976060152053833, + "learning_rate": 5e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.7731242775917053, + "num_tokens": 574377869.0, + "step": 22187 + }, + { + "epoch": 2.4366351855919173, + "grad_norm": 2.0797457695007324, + "learning_rate": 5e-06, + "loss": 0.6543, + "mean_token_accuracy": 0.7924978733062744, + "num_tokens": 574401770.0, + "step": 22188 + }, + { + "epoch": 2.436745003294531, + "grad_norm": 2.2210376262664795, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7696977257728577, + "num_tokens": 574424659.0, + "step": 22189 + }, + { + "epoch": 2.436854820997145, + "grad_norm": 2.0676510334014893, + "learning_rate": 5e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7561602592468262, + "num_tokens": 574454422.0, + "step": 22190 + }, + { + "epoch": 2.4369646386997585, + "grad_norm": 2.2698991298675537, + "learning_rate": 5e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.7679358124732971, + "num_tokens": 574476033.0, + "step": 22191 + }, + { + "epoch": 2.437074456402372, + "grad_norm": 2.1297340393066406, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.77456134557724, + "num_tokens": 574498508.0, + "step": 22192 + }, + { + "epoch": 2.4371842741049856, + "grad_norm": 2.0779836177825928, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7664256691932678, + "num_tokens": 574523357.0, + "step": 22193 + }, + { + "epoch": 2.4372940918075994, + "grad_norm": 2.216029167175293, + "learning_rate": 5e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7598700523376465, + "num_tokens": 574547484.0, + "step": 22194 + }, + { + "epoch": 2.437403909510213, + "grad_norm": 2.0609567165374756, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7654580473899841, + "num_tokens": 574574346.0, + "step": 22195 + }, + { + "epoch": 2.437513727212827, + "grad_norm": 2.3377490043640137, + "learning_rate": 5e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.7826570272445679, + "num_tokens": 574594171.0, + "step": 22196 + }, + { + "epoch": 2.43762354491544, + "grad_norm": 2.0621466636657715, + "learning_rate": 5e-06, + "loss": 0.7832, + "mean_token_accuracy": 0.7528554797172546, + "num_tokens": 574621087.0, + "step": 22197 + }, + { + "epoch": 2.437733362618054, + "grad_norm": 2.264954090118408, + "learning_rate": 5e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7484583854675293, + "num_tokens": 574647498.0, + "step": 22198 + }, + { + "epoch": 2.4378431803206677, + "grad_norm": 2.279670000076294, + "learning_rate": 5e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7675954103469849, + "num_tokens": 574669085.0, + "step": 22199 + }, + { + "epoch": 2.4379529980232815, + "grad_norm": 2.026334524154663, + "learning_rate": 5e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.769455075263977, + "num_tokens": 574697743.0, + "step": 22200 + }, + { + "epoch": 2.4380628157258952, + "grad_norm": 2.081636905670166, + "learning_rate": 5e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7739982604980469, + "num_tokens": 574724326.0, + "step": 22201 + }, + { + "epoch": 2.4381726334285085, + "grad_norm": 2.188875198364258, + "learning_rate": 5e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7520059943199158, + "num_tokens": 574748804.0, + "step": 22202 + }, + { + "epoch": 2.4382824511311223, + "grad_norm": 2.1029052734375, + "learning_rate": 5e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.7651612758636475, + "num_tokens": 574774231.0, + "step": 22203 + }, + { + "epoch": 2.438392268833736, + "grad_norm": 2.224494457244873, + "learning_rate": 5e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.7738350629806519, + "num_tokens": 574797056.0, + "step": 22204 + }, + { + "epoch": 2.43850208653635, + "grad_norm": 2.064832925796509, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.772361159324646, + "num_tokens": 574822832.0, + "step": 22205 + }, + { + "epoch": 2.4386119042389636, + "grad_norm": 1.9222229719161987, + "learning_rate": 5e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7602638006210327, + "num_tokens": 574851563.0, + "step": 22206 + }, + { + "epoch": 2.438721721941577, + "grad_norm": 1.8827890157699585, + "learning_rate": 5e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7600551247596741, + "num_tokens": 574885716.0, + "step": 22207 + }, + { + "epoch": 2.4388315396441906, + "grad_norm": 2.045257806777954, + "learning_rate": 5e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.7673894166946411, + "num_tokens": 574912868.0, + "step": 22208 + }, + { + "epoch": 2.4389413573468044, + "grad_norm": 2.2195777893066406, + "learning_rate": 5e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.7762042284011841, + "num_tokens": 574936577.0, + "step": 22209 + }, + { + "epoch": 2.439051175049418, + "grad_norm": 2.1886463165283203, + "learning_rate": 5e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7745153903961182, + "num_tokens": 574958715.0, + "step": 22210 + }, + { + "epoch": 2.4391609927520315, + "grad_norm": 2.0872809886932373, + "learning_rate": 5e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.740341305732727, + "num_tokens": 574985414.0, + "step": 22211 + }, + { + "epoch": 2.439270810454645, + "grad_norm": 1.8531285524368286, + "learning_rate": 5e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.7460157871246338, + "num_tokens": 575017445.0, + "step": 22212 + }, + { + "epoch": 2.439380628157259, + "grad_norm": 2.206332206726074, + "learning_rate": 5e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7778669595718384, + "num_tokens": 575038946.0, + "step": 22213 + }, + { + "epoch": 2.4394904458598727, + "grad_norm": 1.9427777528762817, + "learning_rate": 5e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7616557478904724, + "num_tokens": 575067469.0, + "step": 22214 + }, + { + "epoch": 2.439600263562486, + "grad_norm": 1.8842583894729614, + "learning_rate": 5e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7679932117462158, + "num_tokens": 575097071.0, + "step": 22215 + }, + { + "epoch": 2.4397100812651, + "grad_norm": 2.0826845169067383, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7467218637466431, + "num_tokens": 575124380.0, + "step": 22216 + }, + { + "epoch": 2.4398198989677136, + "grad_norm": 2.1266374588012695, + "learning_rate": 5e-06, + "loss": 0.6526, + "mean_token_accuracy": 0.7863098382949829, + "num_tokens": 575148659.0, + "step": 22217 + }, + { + "epoch": 2.4399297166703273, + "grad_norm": 2.2159717082977295, + "learning_rate": 5e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7822807431221008, + "num_tokens": 575171439.0, + "step": 22218 + }, + { + "epoch": 2.440039534372941, + "grad_norm": 2.283198356628418, + "learning_rate": 5e-06, + "loss": 0.6407, + "mean_token_accuracy": 0.7882087826728821, + "num_tokens": 575194715.0, + "step": 22219 + }, + { + "epoch": 2.4401493520755544, + "grad_norm": 2.6207542419433594, + "learning_rate": 5e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7694346904754639, + "num_tokens": 575212656.0, + "step": 22220 + }, + { + "epoch": 2.440259169778168, + "grad_norm": 2.018587827682495, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7664937973022461, + "num_tokens": 575239397.0, + "step": 22221 + }, + { + "epoch": 2.440368987480782, + "grad_norm": 2.207460403442383, + "learning_rate": 5e-06, + "loss": 0.6603, + "mean_token_accuracy": 0.779829740524292, + "num_tokens": 575261465.0, + "step": 22222 + }, + { + "epoch": 2.4404788051833957, + "grad_norm": 2.178534507751465, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7541486620903015, + "num_tokens": 575285058.0, + "step": 22223 + }, + { + "epoch": 2.4405886228860094, + "grad_norm": 2.216616153717041, + "learning_rate": 5e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.782413125038147, + "num_tokens": 575307628.0, + "step": 22224 + }, + { + "epoch": 2.4406984405886227, + "grad_norm": 1.959148645401001, + "learning_rate": 5e-06, + "loss": 0.7645, + "mean_token_accuracy": 0.7546212673187256, + "num_tokens": 575338681.0, + "step": 22225 + }, + { + "epoch": 2.4408082582912365, + "grad_norm": 2.187417984008789, + "learning_rate": 5e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7626452445983887, + "num_tokens": 575364006.0, + "step": 22226 + }, + { + "epoch": 2.4409180759938502, + "grad_norm": 2.2845892906188965, + "learning_rate": 5e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.751495897769928, + "num_tokens": 575390092.0, + "step": 22227 + }, + { + "epoch": 2.441027893696464, + "grad_norm": 2.283282518386841, + "learning_rate": 5e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.8018264770507812, + "num_tokens": 575411851.0, + "step": 22228 + }, + { + "epoch": 2.4411377113990778, + "grad_norm": 2.2590627670288086, + "learning_rate": 5e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7530425786972046, + "num_tokens": 575435248.0, + "step": 22229 + }, + { + "epoch": 2.441247529101691, + "grad_norm": 2.0942435264587402, + "learning_rate": 5e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7724184989929199, + "num_tokens": 575460711.0, + "step": 22230 + }, + { + "epoch": 2.441357346804305, + "grad_norm": 2.1614890098571777, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7652489542961121, + "num_tokens": 575485967.0, + "step": 22231 + }, + { + "epoch": 2.4414671645069186, + "grad_norm": 2.1184403896331787, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7843225002288818, + "num_tokens": 575510981.0, + "step": 22232 + }, + { + "epoch": 2.4415769822095323, + "grad_norm": 2.3701086044311523, + "learning_rate": 5e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.7892534732818604, + "num_tokens": 575530310.0, + "step": 22233 + }, + { + "epoch": 2.4416867999121457, + "grad_norm": 2.141047477722168, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7716819643974304, + "num_tokens": 575556878.0, + "step": 22234 + }, + { + "epoch": 2.4417966176147594, + "grad_norm": 2.1188552379608154, + "learning_rate": 5e-06, + "loss": 0.6442, + "mean_token_accuracy": 0.785585880279541, + "num_tokens": 575578766.0, + "step": 22235 + }, + { + "epoch": 2.441906435317373, + "grad_norm": 2.1094603538513184, + "learning_rate": 5e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.7770259380340576, + "num_tokens": 575604950.0, + "step": 22236 + }, + { + "epoch": 2.442016253019987, + "grad_norm": 2.059277296066284, + "learning_rate": 5e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7733215093612671, + "num_tokens": 575630796.0, + "step": 22237 + }, + { + "epoch": 2.4421260707226002, + "grad_norm": 2.169766902923584, + "learning_rate": 5e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7608528733253479, + "num_tokens": 575653312.0, + "step": 22238 + }, + { + "epoch": 2.442235888425214, + "grad_norm": 1.9738895893096924, + "learning_rate": 5e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7726513147354126, + "num_tokens": 575680864.0, + "step": 22239 + }, + { + "epoch": 2.4423457061278278, + "grad_norm": 2.3283462524414062, + "learning_rate": 5e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7758723497390747, + "num_tokens": 575704764.0, + "step": 22240 + }, + { + "epoch": 2.4424555238304415, + "grad_norm": 2.262248992919922, + "learning_rate": 5e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7609273195266724, + "num_tokens": 575730526.0, + "step": 22241 + }, + { + "epoch": 2.4425653415330553, + "grad_norm": 2.226250171661377, + "learning_rate": 5e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7609586119651794, + "num_tokens": 575755524.0, + "step": 22242 + }, + { + "epoch": 2.4426751592356686, + "grad_norm": 2.5428225994110107, + "learning_rate": 5e-06, + "loss": 0.5902, + "mean_token_accuracy": 0.7993240356445312, + "num_tokens": 575771954.0, + "step": 22243 + }, + { + "epoch": 2.4427849769382823, + "grad_norm": 2.046618700027466, + "learning_rate": 5e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7700150609016418, + "num_tokens": 575795972.0, + "step": 22244 + }, + { + "epoch": 2.442894794640896, + "grad_norm": 2.048652172088623, + "learning_rate": 5e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.7903870344161987, + "num_tokens": 575820070.0, + "step": 22245 + }, + { + "epoch": 2.44300461234351, + "grad_norm": 1.939315915107727, + "learning_rate": 5e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.7781384587287903, + "num_tokens": 575847911.0, + "step": 22246 + }, + { + "epoch": 2.4431144300461236, + "grad_norm": 2.0345427989959717, + "learning_rate": 5e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7330325245857239, + "num_tokens": 575877953.0, + "step": 22247 + }, + { + "epoch": 2.443224247748737, + "grad_norm": 2.1816906929016113, + "learning_rate": 5e-06, + "loss": 0.698, + "mean_token_accuracy": 0.781747579574585, + "num_tokens": 575902461.0, + "step": 22248 + }, + { + "epoch": 2.4433340654513507, + "grad_norm": 2.1213221549987793, + "learning_rate": 5e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.776108980178833, + "num_tokens": 575926321.0, + "step": 22249 + }, + { + "epoch": 2.4434438831539644, + "grad_norm": 2.047114849090576, + "learning_rate": 5e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7628452777862549, + "num_tokens": 575955982.0, + "step": 22250 + }, + { + "epoch": 2.443553700856578, + "grad_norm": 2.251497268676758, + "learning_rate": 5e-06, + "loss": 0.6349, + "mean_token_accuracy": 0.7817938923835754, + "num_tokens": 575978485.0, + "step": 22251 + }, + { + "epoch": 2.443663518559192, + "grad_norm": 2.151416778564453, + "learning_rate": 5e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.7820199728012085, + "num_tokens": 576002028.0, + "step": 22252 + }, + { + "epoch": 2.4437733362618053, + "grad_norm": 2.0361742973327637, + "learning_rate": 5e-06, + "loss": 0.6828, + "mean_token_accuracy": 0.7705518007278442, + "num_tokens": 576026613.0, + "step": 22253 + }, + { + "epoch": 2.443883153964419, + "grad_norm": 2.0600898265838623, + "learning_rate": 5e-06, + "loss": 0.6697, + "mean_token_accuracy": 0.775158703327179, + "num_tokens": 576053298.0, + "step": 22254 + }, + { + "epoch": 2.4439929716670328, + "grad_norm": 2.1513452529907227, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7691868543624878, + "num_tokens": 576075906.0, + "step": 22255 + }, + { + "epoch": 2.4441027893696465, + "grad_norm": 2.1960623264312744, + "learning_rate": 5e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.7750880718231201, + "num_tokens": 576098845.0, + "step": 22256 + }, + { + "epoch": 2.4442126070722603, + "grad_norm": 1.9264262914657593, + "learning_rate": 5e-06, + "loss": 0.6776, + "mean_token_accuracy": 0.7765547633171082, + "num_tokens": 576126464.0, + "step": 22257 + }, + { + "epoch": 2.4443224247748736, + "grad_norm": 2.115138292312622, + "learning_rate": 5e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.769787073135376, + "num_tokens": 576150661.0, + "step": 22258 + }, + { + "epoch": 2.4444322424774874, + "grad_norm": 1.9703497886657715, + "learning_rate": 5e-06, + "loss": 0.7148, + "mean_token_accuracy": 0.7688595056533813, + "num_tokens": 576180910.0, + "step": 22259 + }, + { + "epoch": 2.444542060180101, + "grad_norm": 2.1369895935058594, + "learning_rate": 5e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7544758319854736, + "num_tokens": 576206765.0, + "step": 22260 + }, + { + "epoch": 2.444651877882715, + "grad_norm": 2.2109460830688477, + "learning_rate": 5e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7637226581573486, + "num_tokens": 576230231.0, + "step": 22261 + }, + { + "epoch": 2.444761695585328, + "grad_norm": 2.0183467864990234, + "learning_rate": 5e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7515298128128052, + "num_tokens": 576256163.0, + "step": 22262 + }, + { + "epoch": 2.444871513287942, + "grad_norm": 1.899917721748352, + "learning_rate": 5e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7557509541511536, + "num_tokens": 576288580.0, + "step": 22263 + }, + { + "epoch": 2.4449813309905557, + "grad_norm": 2.3652186393737793, + "learning_rate": 5e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.7710024118423462, + "num_tokens": 576311296.0, + "step": 22264 + }, + { + "epoch": 2.4450911486931695, + "grad_norm": 1.9824894666671753, + "learning_rate": 5e-06, + "loss": 0.7638, + "mean_token_accuracy": 0.7535141706466675, + "num_tokens": 576340478.0, + "step": 22265 + }, + { + "epoch": 2.4452009663957828, + "grad_norm": 2.339355230331421, + "learning_rate": 5e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7766767144203186, + "num_tokens": 576361706.0, + "step": 22266 + }, + { + "epoch": 2.4453107840983965, + "grad_norm": 2.108341932296753, + "learning_rate": 5e-06, + "loss": 0.774, + "mean_token_accuracy": 0.7482547760009766, + "num_tokens": 576388673.0, + "step": 22267 + }, + { + "epoch": 2.4454206018010103, + "grad_norm": 1.977159857749939, + "learning_rate": 5e-06, + "loss": 0.6631, + "mean_token_accuracy": 0.7863465547561646, + "num_tokens": 576414492.0, + "step": 22268 + }, + { + "epoch": 2.445530419503624, + "grad_norm": 1.79849374294281, + "learning_rate": 5e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.732146143913269, + "num_tokens": 576446377.0, + "step": 22269 + }, + { + "epoch": 2.445640237206238, + "grad_norm": 2.064366579055786, + "learning_rate": 5e-06, + "loss": 0.6184, + "mean_token_accuracy": 0.7946876287460327, + "num_tokens": 576470662.0, + "step": 22270 + }, + { + "epoch": 2.445750054908851, + "grad_norm": 1.97777259349823, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7555105686187744, + "num_tokens": 576498614.0, + "step": 22271 + }, + { + "epoch": 2.445859872611465, + "grad_norm": 2.4905991554260254, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7639918327331543, + "num_tokens": 576518183.0, + "step": 22272 + }, + { + "epoch": 2.4459696903140786, + "grad_norm": 2.313856363296509, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7724895477294922, + "num_tokens": 576540834.0, + "step": 22273 + }, + { + "epoch": 2.4460795080166924, + "grad_norm": 2.215038537979126, + "learning_rate": 5e-06, + "loss": 0.6473, + "mean_token_accuracy": 0.7894749045372009, + "num_tokens": 576564363.0, + "step": 22274 + }, + { + "epoch": 2.446189325719306, + "grad_norm": 2.2804343700408936, + "learning_rate": 5e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.7778102159500122, + "num_tokens": 576584066.0, + "step": 22275 + }, + { + "epoch": 2.4462991434219195, + "grad_norm": 2.067769765853882, + "learning_rate": 5e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7708585262298584, + "num_tokens": 576610073.0, + "step": 22276 + }, + { + "epoch": 2.446408961124533, + "grad_norm": 1.797554850578308, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7498871088027954, + "num_tokens": 576641306.0, + "step": 22277 + }, + { + "epoch": 2.446518778827147, + "grad_norm": 1.9146698713302612, + "learning_rate": 5e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7538809776306152, + "num_tokens": 576678222.0, + "step": 22278 + }, + { + "epoch": 2.4466285965297607, + "grad_norm": 2.2532095909118652, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7614832520484924, + "num_tokens": 576700756.0, + "step": 22279 + }, + { + "epoch": 2.4467384142323745, + "grad_norm": 2.3302395343780518, + "learning_rate": 5e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.780530571937561, + "num_tokens": 576722432.0, + "step": 22280 + }, + { + "epoch": 2.446848231934988, + "grad_norm": 2.295631170272827, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7678031325340271, + "num_tokens": 576745860.0, + "step": 22281 + }, + { + "epoch": 2.4469580496376016, + "grad_norm": 1.9893213510513306, + "learning_rate": 5e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7734708786010742, + "num_tokens": 576772985.0, + "step": 22282 + }, + { + "epoch": 2.4470678673402153, + "grad_norm": 2.258227825164795, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7762462496757507, + "num_tokens": 576793932.0, + "step": 22283 + }, + { + "epoch": 2.447177685042829, + "grad_norm": 2.2412362098693848, + "learning_rate": 5e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.7802771329879761, + "num_tokens": 576814378.0, + "step": 22284 + }, + { + "epoch": 2.4472875027454424, + "grad_norm": 2.1258580684661865, + "learning_rate": 5e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7673441767692566, + "num_tokens": 576840222.0, + "step": 22285 + }, + { + "epoch": 2.447397320448056, + "grad_norm": 2.126112937927246, + "learning_rate": 5e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7708027958869934, + "num_tokens": 576866054.0, + "step": 22286 + }, + { + "epoch": 2.44750713815067, + "grad_norm": 2.192800521850586, + "learning_rate": 5e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7531317472457886, + "num_tokens": 576889340.0, + "step": 22287 + }, + { + "epoch": 2.4476169558532836, + "grad_norm": 1.8879647254943848, + "learning_rate": 5e-06, + "loss": 0.626, + "mean_token_accuracy": 0.7916312217712402, + "num_tokens": 576916994.0, + "step": 22288 + }, + { + "epoch": 2.447726773555897, + "grad_norm": 1.9570947885513306, + "learning_rate": 5e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7580658197402954, + "num_tokens": 576945789.0, + "step": 22289 + }, + { + "epoch": 2.4478365912585107, + "grad_norm": 2.330756902694702, + "learning_rate": 5e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7673015594482422, + "num_tokens": 576968179.0, + "step": 22290 + }, + { + "epoch": 2.4479464089611245, + "grad_norm": 2.0899593830108643, + "learning_rate": 5e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7639007568359375, + "num_tokens": 576995085.0, + "step": 22291 + }, + { + "epoch": 2.4480562266637382, + "grad_norm": 1.9424604177474976, + "learning_rate": 5e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7600452303886414, + "num_tokens": 577023917.0, + "step": 22292 + }, + { + "epoch": 2.448166044366352, + "grad_norm": 2.030425548553467, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7604641914367676, + "num_tokens": 577050635.0, + "step": 22293 + }, + { + "epoch": 2.4482758620689653, + "grad_norm": 2.1298649311065674, + "learning_rate": 5e-06, + "loss": 0.6073, + "mean_token_accuracy": 0.7909022569656372, + "num_tokens": 577071878.0, + "step": 22294 + }, + { + "epoch": 2.448385679771579, + "grad_norm": 2.2654550075531006, + "learning_rate": 5e-06, + "loss": 0.6646, + "mean_token_accuracy": 0.7803076505661011, + "num_tokens": 577095490.0, + "step": 22295 + }, + { + "epoch": 2.448495497474193, + "grad_norm": 2.116825580596924, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7518975138664246, + "num_tokens": 577122643.0, + "step": 22296 + }, + { + "epoch": 2.4486053151768066, + "grad_norm": 2.4531822204589844, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7702336311340332, + "num_tokens": 577144079.0, + "step": 22297 + }, + { + "epoch": 2.4487151328794203, + "grad_norm": 1.9584646224975586, + "learning_rate": 5e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7487746477127075, + "num_tokens": 577174641.0, + "step": 22298 + }, + { + "epoch": 2.4488249505820336, + "grad_norm": 1.8828569650650024, + "learning_rate": 5e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7594466209411621, + "num_tokens": 577205768.0, + "step": 22299 + }, + { + "epoch": 2.4489347682846474, + "grad_norm": 1.9758304357528687, + "learning_rate": 5e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7627185583114624, + "num_tokens": 577233636.0, + "step": 22300 + }, + { + "epoch": 2.449044585987261, + "grad_norm": 2.128758192062378, + "learning_rate": 5e-06, + "loss": 0.6776, + "mean_token_accuracy": 0.7736464738845825, + "num_tokens": 577257719.0, + "step": 22301 + }, + { + "epoch": 2.449154403689875, + "grad_norm": 1.8298884630203247, + "learning_rate": 5e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7505283355712891, + "num_tokens": 577294175.0, + "step": 22302 + }, + { + "epoch": 2.4492642213924887, + "grad_norm": 2.3477044105529785, + "learning_rate": 5e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.750270426273346, + "num_tokens": 577315948.0, + "step": 22303 + }, + { + "epoch": 2.449374039095102, + "grad_norm": 2.223977565765381, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7737200260162354, + "num_tokens": 577340302.0, + "step": 22304 + }, + { + "epoch": 2.4494838567977157, + "grad_norm": 2.100940227508545, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7711482048034668, + "num_tokens": 577365507.0, + "step": 22305 + }, + { + "epoch": 2.4495936745003295, + "grad_norm": 2.42177152633667, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7679190039634705, + "num_tokens": 577386680.0, + "step": 22306 + }, + { + "epoch": 2.4497034922029433, + "grad_norm": 2.134605884552002, + "learning_rate": 5e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.7748262882232666, + "num_tokens": 577408602.0, + "step": 22307 + }, + { + "epoch": 2.449813309905557, + "grad_norm": 2.0318679809570312, + "learning_rate": 5e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.771599531173706, + "num_tokens": 577432636.0, + "step": 22308 + }, + { + "epoch": 2.4499231276081703, + "grad_norm": 2.178057909011841, + "learning_rate": 5e-06, + "loss": 0.6734, + "mean_token_accuracy": 0.7759342193603516, + "num_tokens": 577456869.0, + "step": 22309 + }, + { + "epoch": 2.450032945310784, + "grad_norm": 2.036303758621216, + "learning_rate": 5e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.773350179195404, + "num_tokens": 577481322.0, + "step": 22310 + }, + { + "epoch": 2.450142763013398, + "grad_norm": 2.6578917503356934, + "learning_rate": 5e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.784072756767273, + "num_tokens": 577499089.0, + "step": 22311 + }, + { + "epoch": 2.4502525807160116, + "grad_norm": 2.083338975906372, + "learning_rate": 5e-06, + "loss": 0.665, + "mean_token_accuracy": 0.7821067571640015, + "num_tokens": 577526040.0, + "step": 22312 + }, + { + "epoch": 2.450362398418625, + "grad_norm": 1.8624695539474487, + "learning_rate": 5e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7303999662399292, + "num_tokens": 577561346.0, + "step": 22313 + }, + { + "epoch": 2.4504722161212387, + "grad_norm": 2.006896734237671, + "learning_rate": 5e-06, + "loss": 0.6408, + "mean_token_accuracy": 0.7846829891204834, + "num_tokens": 577587308.0, + "step": 22314 + }, + { + "epoch": 2.4505820338238524, + "grad_norm": 2.125965118408203, + "learning_rate": 5e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.75771564245224, + "num_tokens": 577613127.0, + "step": 22315 + }, + { + "epoch": 2.450691851526466, + "grad_norm": 2.1063778400421143, + "learning_rate": 5e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7595494985580444, + "num_tokens": 577637858.0, + "step": 22316 + }, + { + "epoch": 2.4508016692290795, + "grad_norm": 2.0622315406799316, + "learning_rate": 5e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7674704194068909, + "num_tokens": 577663849.0, + "step": 22317 + }, + { + "epoch": 2.4509114869316933, + "grad_norm": 2.2525901794433594, + "learning_rate": 5e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.7524203658103943, + "num_tokens": 577687798.0, + "step": 22318 + }, + { + "epoch": 2.451021304634307, + "grad_norm": 1.9500998258590698, + "learning_rate": 5e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.760217010974884, + "num_tokens": 577719387.0, + "step": 22319 + }, + { + "epoch": 2.4511311223369208, + "grad_norm": 2.15864634513855, + "learning_rate": 5e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7600011825561523, + "num_tokens": 577743854.0, + "step": 22320 + }, + { + "epoch": 2.4512409400395345, + "grad_norm": 2.2569122314453125, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7751281261444092, + "num_tokens": 577766025.0, + "step": 22321 + }, + { + "epoch": 2.451350757742148, + "grad_norm": 2.071202039718628, + "learning_rate": 5e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7591808438301086, + "num_tokens": 577793724.0, + "step": 22322 + }, + { + "epoch": 2.4514605754447616, + "grad_norm": 2.1237711906433105, + "learning_rate": 5e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7639994621276855, + "num_tokens": 577821595.0, + "step": 22323 + }, + { + "epoch": 2.4515703931473753, + "grad_norm": 2.21327543258667, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7789760828018188, + "num_tokens": 577846229.0, + "step": 22324 + }, + { + "epoch": 2.451680210849989, + "grad_norm": 2.0455644130706787, + "learning_rate": 5e-06, + "loss": 0.5911, + "mean_token_accuracy": 0.7939951419830322, + "num_tokens": 577871157.0, + "step": 22325 + }, + { + "epoch": 2.451790028552603, + "grad_norm": 1.962816596031189, + "learning_rate": 5e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.777879536151886, + "num_tokens": 577896437.0, + "step": 22326 + }, + { + "epoch": 2.451899846255216, + "grad_norm": 1.9404329061508179, + "learning_rate": 5e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7581790685653687, + "num_tokens": 577924889.0, + "step": 22327 + }, + { + "epoch": 2.45200966395783, + "grad_norm": 2.007261037826538, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7664976119995117, + "num_tokens": 577950876.0, + "step": 22328 + }, + { + "epoch": 2.4521194816604437, + "grad_norm": 1.9165856838226318, + "learning_rate": 5e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7678165435791016, + "num_tokens": 577978910.0, + "step": 22329 + }, + { + "epoch": 2.4522292993630574, + "grad_norm": 2.1915128231048584, + "learning_rate": 5e-06, + "loss": 0.6265, + "mean_token_accuracy": 0.7929432392120361, + "num_tokens": 578001017.0, + "step": 22330 + }, + { + "epoch": 2.452339117065671, + "grad_norm": 2.1508002281188965, + "learning_rate": 5e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.759945273399353, + "num_tokens": 578024493.0, + "step": 22331 + }, + { + "epoch": 2.4524489347682845, + "grad_norm": 2.2129926681518555, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7762856483459473, + "num_tokens": 578047953.0, + "step": 22332 + }, + { + "epoch": 2.4525587524708983, + "grad_norm": 2.12097430229187, + "learning_rate": 5e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7532379627227783, + "num_tokens": 578077511.0, + "step": 22333 + }, + { + "epoch": 2.452668570173512, + "grad_norm": 1.9569990634918213, + "learning_rate": 5e-06, + "loss": 0.689, + "mean_token_accuracy": 0.7738581299781799, + "num_tokens": 578104659.0, + "step": 22334 + }, + { + "epoch": 2.452778387876126, + "grad_norm": 2.1259069442749023, + "learning_rate": 5e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.774799644947052, + "num_tokens": 578130679.0, + "step": 22335 + }, + { + "epoch": 2.452888205578739, + "grad_norm": 2.2025561332702637, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7648000717163086, + "num_tokens": 578154801.0, + "step": 22336 + }, + { + "epoch": 2.452998023281353, + "grad_norm": 1.940239429473877, + "learning_rate": 5e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7635613679885864, + "num_tokens": 578183617.0, + "step": 22337 + }, + { + "epoch": 2.4531078409839666, + "grad_norm": 2.0082788467407227, + "learning_rate": 5e-06, + "loss": 0.7109, + "mean_token_accuracy": 0.766113817691803, + "num_tokens": 578210609.0, + "step": 22338 + }, + { + "epoch": 2.4532176586865804, + "grad_norm": 2.0374770164489746, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7772436141967773, + "num_tokens": 578237598.0, + "step": 22339 + }, + { + "epoch": 2.453327476389194, + "grad_norm": 2.207751750946045, + "learning_rate": 5e-06, + "loss": 0.644, + "mean_token_accuracy": 0.7822348475456238, + "num_tokens": 578259702.0, + "step": 22340 + }, + { + "epoch": 2.4534372940918074, + "grad_norm": 1.9626591205596924, + "learning_rate": 5e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.7870261669158936, + "num_tokens": 578284939.0, + "step": 22341 + }, + { + "epoch": 2.453547111794421, + "grad_norm": 2.140976667404175, + "learning_rate": 5e-06, + "loss": 0.6282, + "mean_token_accuracy": 0.7910044193267822, + "num_tokens": 578310968.0, + "step": 22342 + }, + { + "epoch": 2.453656929497035, + "grad_norm": 1.90512216091156, + "learning_rate": 5e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7580947875976562, + "num_tokens": 578341342.0, + "step": 22343 + }, + { + "epoch": 2.4537667471996487, + "grad_norm": 1.9808610677719116, + "learning_rate": 5e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7669836282730103, + "num_tokens": 578366224.0, + "step": 22344 + }, + { + "epoch": 2.453876564902262, + "grad_norm": 2.197208881378174, + "learning_rate": 5e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.7792845964431763, + "num_tokens": 578387740.0, + "step": 22345 + }, + { + "epoch": 2.453986382604876, + "grad_norm": 1.853920578956604, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7758116722106934, + "num_tokens": 578416863.0, + "step": 22346 + }, + { + "epoch": 2.4540962003074895, + "grad_norm": 1.9237022399902344, + "learning_rate": 5e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7355877161026001, + "num_tokens": 578449848.0, + "step": 22347 + }, + { + "epoch": 2.4542060180101033, + "grad_norm": 1.9435532093048096, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7731863260269165, + "num_tokens": 578476985.0, + "step": 22348 + }, + { + "epoch": 2.454315835712717, + "grad_norm": 1.9680187702178955, + "learning_rate": 5e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7699652910232544, + "num_tokens": 578504803.0, + "step": 22349 + }, + { + "epoch": 2.4544256534153304, + "grad_norm": 1.9537004232406616, + "learning_rate": 5e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7535519599914551, + "num_tokens": 578534566.0, + "step": 22350 + }, + { + "epoch": 2.454535471117944, + "grad_norm": 2.0986127853393555, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7589848041534424, + "num_tokens": 578560332.0, + "step": 22351 + }, + { + "epoch": 2.454645288820558, + "grad_norm": 2.1173720359802246, + "learning_rate": 5e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.7779796123504639, + "num_tokens": 578585469.0, + "step": 22352 + }, + { + "epoch": 2.4547551065231716, + "grad_norm": 2.088982343673706, + "learning_rate": 5e-06, + "loss": 0.6378, + "mean_token_accuracy": 0.7841346263885498, + "num_tokens": 578611497.0, + "step": 22353 + }, + { + "epoch": 2.4548649242257854, + "grad_norm": 2.4401943683624268, + "learning_rate": 5e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.7777585983276367, + "num_tokens": 578630307.0, + "step": 22354 + }, + { + "epoch": 2.4549747419283987, + "grad_norm": 2.13859224319458, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7786396741867065, + "num_tokens": 578654709.0, + "step": 22355 + }, + { + "epoch": 2.4550845596310125, + "grad_norm": 2.1061527729034424, + "learning_rate": 5e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7660554051399231, + "num_tokens": 578681871.0, + "step": 22356 + }, + { + "epoch": 2.455194377333626, + "grad_norm": 2.009310245513916, + "learning_rate": 5e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7376992106437683, + "num_tokens": 578709479.0, + "step": 22357 + }, + { + "epoch": 2.45530419503624, + "grad_norm": 1.9779421091079712, + "learning_rate": 5e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7424150705337524, + "num_tokens": 578741156.0, + "step": 22358 + }, + { + "epoch": 2.4554140127388537, + "grad_norm": 1.878491997718811, + "learning_rate": 5e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.7769292593002319, + "num_tokens": 578770453.0, + "step": 22359 + }, + { + "epoch": 2.455523830441467, + "grad_norm": 2.0919034481048584, + "learning_rate": 5e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7725340723991394, + "num_tokens": 578796239.0, + "step": 22360 + }, + { + "epoch": 2.455633648144081, + "grad_norm": 2.1716551780700684, + "learning_rate": 5e-06, + "loss": 0.673, + "mean_token_accuracy": 0.7792925834655762, + "num_tokens": 578820808.0, + "step": 22361 + }, + { + "epoch": 2.4557434658466946, + "grad_norm": 2.1498403549194336, + "learning_rate": 5e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.7860238552093506, + "num_tokens": 578843402.0, + "step": 22362 + }, + { + "epoch": 2.4558532835493083, + "grad_norm": 2.2245984077453613, + "learning_rate": 5e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.770807683467865, + "num_tokens": 578865628.0, + "step": 22363 + }, + { + "epoch": 2.4559631012519216, + "grad_norm": 2.038069725036621, + "learning_rate": 5e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7622445225715637, + "num_tokens": 578892062.0, + "step": 22364 + }, + { + "epoch": 2.4560729189545354, + "grad_norm": 1.9622721672058105, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7540946006774902, + "num_tokens": 578922185.0, + "step": 22365 + }, + { + "epoch": 2.456182736657149, + "grad_norm": 1.825097918510437, + "learning_rate": 5e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7536271214485168, + "num_tokens": 578954041.0, + "step": 22366 + }, + { + "epoch": 2.456292554359763, + "grad_norm": 1.8231089115142822, + "learning_rate": 5e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.7788792848587036, + "num_tokens": 578984628.0, + "step": 22367 + }, + { + "epoch": 2.456402372062376, + "grad_norm": 2.3521783351898193, + "learning_rate": 5e-06, + "loss": 0.6494, + "mean_token_accuracy": 0.7907683849334717, + "num_tokens": 579006889.0, + "step": 22368 + }, + { + "epoch": 2.45651218976499, + "grad_norm": 1.7846639156341553, + "learning_rate": 5e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.778569221496582, + "num_tokens": 579036439.0, + "step": 22369 + }, + { + "epoch": 2.4566220074676037, + "grad_norm": 2.0509700775146484, + "learning_rate": 5e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.772693395614624, + "num_tokens": 579062019.0, + "step": 22370 + }, + { + "epoch": 2.4567318251702175, + "grad_norm": 2.1438639163970947, + "learning_rate": 5e-06, + "loss": 0.6502, + "mean_token_accuracy": 0.7812507152557373, + "num_tokens": 579085459.0, + "step": 22371 + }, + { + "epoch": 2.4568416428728312, + "grad_norm": 2.257814407348633, + "learning_rate": 5e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.7775208950042725, + "num_tokens": 579106919.0, + "step": 22372 + }, + { + "epoch": 2.4569514605754446, + "grad_norm": 1.980940580368042, + "learning_rate": 5e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7428538799285889, + "num_tokens": 579137799.0, + "step": 22373 + }, + { + "epoch": 2.4570612782780583, + "grad_norm": 2.2231764793395996, + "learning_rate": 5e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7597573399543762, + "num_tokens": 579161393.0, + "step": 22374 + }, + { + "epoch": 2.457171095980672, + "grad_norm": 1.938308835029602, + "learning_rate": 5e-06, + "loss": 0.6512, + "mean_token_accuracy": 0.7862710952758789, + "num_tokens": 579189870.0, + "step": 22375 + }, + { + "epoch": 2.457280913683286, + "grad_norm": 1.9168496131896973, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7651406526565552, + "num_tokens": 579219591.0, + "step": 22376 + }, + { + "epoch": 2.4573907313858996, + "grad_norm": 1.677778720855713, + "learning_rate": 5e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7409237623214722, + "num_tokens": 579260596.0, + "step": 22377 + }, + { + "epoch": 2.457500549088513, + "grad_norm": 2.0620033740997314, + "learning_rate": 5e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7426685094833374, + "num_tokens": 579286611.0, + "step": 22378 + }, + { + "epoch": 2.4576103667911267, + "grad_norm": 1.8510745763778687, + "learning_rate": 5e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7467067241668701, + "num_tokens": 579318619.0, + "step": 22379 + }, + { + "epoch": 2.4577201844937404, + "grad_norm": 2.150432825088501, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7576438784599304, + "num_tokens": 579343781.0, + "step": 22380 + }, + { + "epoch": 2.457830002196354, + "grad_norm": 2.1988463401794434, + "learning_rate": 5e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7521162033081055, + "num_tokens": 579373527.0, + "step": 22381 + }, + { + "epoch": 2.457939819898968, + "grad_norm": 2.055166482925415, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7574968338012695, + "num_tokens": 579399743.0, + "step": 22382 + }, + { + "epoch": 2.4580496376015812, + "grad_norm": 2.149597644805908, + "learning_rate": 5e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.7809521555900574, + "num_tokens": 579422612.0, + "step": 22383 + }, + { + "epoch": 2.458159455304195, + "grad_norm": 2.3173787593841553, + "learning_rate": 5e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7393249273300171, + "num_tokens": 579445411.0, + "step": 22384 + }, + { + "epoch": 2.4582692730068088, + "grad_norm": 2.009119987487793, + "learning_rate": 5e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7655286192893982, + "num_tokens": 579473278.0, + "step": 22385 + }, + { + "epoch": 2.4583790907094225, + "grad_norm": 1.7746506929397583, + "learning_rate": 5e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7376610040664673, + "num_tokens": 579508640.0, + "step": 22386 + }, + { + "epoch": 2.4584889084120363, + "grad_norm": 2.193540573120117, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7580914497375488, + "num_tokens": 579532252.0, + "step": 22387 + }, + { + "epoch": 2.4585987261146496, + "grad_norm": 1.9595447778701782, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7427185773849487, + "num_tokens": 579566843.0, + "step": 22388 + }, + { + "epoch": 2.4587085438172633, + "grad_norm": 2.105672597885132, + "learning_rate": 5e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7682895660400391, + "num_tokens": 579594456.0, + "step": 22389 + }, + { + "epoch": 2.458818361519877, + "grad_norm": 2.4056601524353027, + "learning_rate": 5e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.7758808135986328, + "num_tokens": 579616360.0, + "step": 22390 + }, + { + "epoch": 2.458928179222491, + "grad_norm": 2.0354819297790527, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7584336996078491, + "num_tokens": 579642635.0, + "step": 22391 + }, + { + "epoch": 2.459037996925104, + "grad_norm": 2.279179811477661, + "learning_rate": 5e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.8067278265953064, + "num_tokens": 579664533.0, + "step": 22392 + }, + { + "epoch": 2.459147814627718, + "grad_norm": 2.1760671138763428, + "learning_rate": 5e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7676190137863159, + "num_tokens": 579688794.0, + "step": 22393 + }, + { + "epoch": 2.4592576323303317, + "grad_norm": 2.220684766769409, + "learning_rate": 5e-06, + "loss": 0.6459, + "mean_token_accuracy": 0.7824270129203796, + "num_tokens": 579712105.0, + "step": 22394 + }, + { + "epoch": 2.4593674500329454, + "grad_norm": 1.9841291904449463, + "learning_rate": 5e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7727630138397217, + "num_tokens": 579739141.0, + "step": 22395 + }, + { + "epoch": 2.4594772677355587, + "grad_norm": 2.266444683074951, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7545572519302368, + "num_tokens": 579763959.0, + "step": 22396 + }, + { + "epoch": 2.4595870854381725, + "grad_norm": 2.080599784851074, + "learning_rate": 5e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7611573338508606, + "num_tokens": 579787702.0, + "step": 22397 + }, + { + "epoch": 2.4596969031407863, + "grad_norm": 2.2206974029541016, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7804433107376099, + "num_tokens": 579811298.0, + "step": 22398 + }, + { + "epoch": 2.4598067208434, + "grad_norm": 2.0022175312042236, + "learning_rate": 5e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7500917911529541, + "num_tokens": 579837776.0, + "step": 22399 + }, + { + "epoch": 2.4599165385460138, + "grad_norm": 2.2682697772979736, + "learning_rate": 5e-06, + "loss": 0.7105, + "mean_token_accuracy": 0.7633939385414124, + "num_tokens": 579861726.0, + "step": 22400 + }, + { + "epoch": 2.460026356248627, + "grad_norm": 2.313387155532837, + "learning_rate": 5e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.7577515840530396, + "num_tokens": 579885019.0, + "step": 22401 + }, + { + "epoch": 2.460136173951241, + "grad_norm": 1.8055583238601685, + "learning_rate": 5e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7608528137207031, + "num_tokens": 579914438.0, + "step": 22402 + }, + { + "epoch": 2.4602459916538546, + "grad_norm": 2.4843976497650146, + "learning_rate": 5e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7678160071372986, + "num_tokens": 579935069.0, + "step": 22403 + }, + { + "epoch": 2.4603558093564684, + "grad_norm": 1.8310295343399048, + "learning_rate": 5e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7388820648193359, + "num_tokens": 579965755.0, + "step": 22404 + }, + { + "epoch": 2.460465627059082, + "grad_norm": 2.2809553146362305, + "learning_rate": 5e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7845152616500854, + "num_tokens": 579988333.0, + "step": 22405 + }, + { + "epoch": 2.4605754447616954, + "grad_norm": 1.7530118227005005, + "learning_rate": 5e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7635447382926941, + "num_tokens": 580020908.0, + "step": 22406 + }, + { + "epoch": 2.460685262464309, + "grad_norm": 2.002721071243286, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7490968108177185, + "num_tokens": 580049483.0, + "step": 22407 + }, + { + "epoch": 2.460795080166923, + "grad_norm": 2.17663836479187, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.761319637298584, + "num_tokens": 580073351.0, + "step": 22408 + }, + { + "epoch": 2.4609048978695367, + "grad_norm": 2.3815605640411377, + "learning_rate": 5e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7723246812820435, + "num_tokens": 580091422.0, + "step": 22409 + }, + { + "epoch": 2.4610147155721505, + "grad_norm": 1.872646689414978, + "learning_rate": 5e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.7360336780548096, + "num_tokens": 580122641.0, + "step": 22410 + }, + { + "epoch": 2.4611245332747638, + "grad_norm": 1.8912389278411865, + "learning_rate": 5e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.772134006023407, + "num_tokens": 580152371.0, + "step": 22411 + }, + { + "epoch": 2.4612343509773775, + "grad_norm": 1.8292443752288818, + "learning_rate": 5e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7363131046295166, + "num_tokens": 580186809.0, + "step": 22412 + }, + { + "epoch": 2.4613441686799913, + "grad_norm": 2.338947057723999, + "learning_rate": 5e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7737898826599121, + "num_tokens": 580208443.0, + "step": 22413 + }, + { + "epoch": 2.461453986382605, + "grad_norm": 1.8021891117095947, + "learning_rate": 5e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7752801775932312, + "num_tokens": 580241076.0, + "step": 22414 + }, + { + "epoch": 2.4615638040852184, + "grad_norm": 2.0550014972686768, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7613052129745483, + "num_tokens": 580269009.0, + "step": 22415 + }, + { + "epoch": 2.461673621787832, + "grad_norm": 1.919598937034607, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7722554206848145, + "num_tokens": 580296551.0, + "step": 22416 + }, + { + "epoch": 2.461783439490446, + "grad_norm": 2.0739364624023438, + "learning_rate": 5e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7473269701004028, + "num_tokens": 580323500.0, + "step": 22417 + }, + { + "epoch": 2.4618932571930596, + "grad_norm": 2.0830163955688477, + "learning_rate": 5e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7544270753860474, + "num_tokens": 580349718.0, + "step": 22418 + }, + { + "epoch": 2.462003074895673, + "grad_norm": 1.8689416646957397, + "learning_rate": 5e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.755783200263977, + "num_tokens": 580378713.0, + "step": 22419 + }, + { + "epoch": 2.4621128925982867, + "grad_norm": 2.1888022422790527, + "learning_rate": 5e-06, + "loss": 0.6307, + "mean_token_accuracy": 0.7877442240715027, + "num_tokens": 580399979.0, + "step": 22420 + }, + { + "epoch": 2.4622227103009005, + "grad_norm": 2.3308351039886475, + "learning_rate": 5e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.7789686918258667, + "num_tokens": 580420816.0, + "step": 22421 + }, + { + "epoch": 2.462332528003514, + "grad_norm": 2.109265089035034, + "learning_rate": 5e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.7783202528953552, + "num_tokens": 580447000.0, + "step": 22422 + }, + { + "epoch": 2.462442345706128, + "grad_norm": 2.0163447856903076, + "learning_rate": 5e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.7668286561965942, + "num_tokens": 580472922.0, + "step": 22423 + }, + { + "epoch": 2.4625521634087413, + "grad_norm": 2.157118082046509, + "learning_rate": 5e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7695897817611694, + "num_tokens": 580496281.0, + "step": 22424 + }, + { + "epoch": 2.462661981111355, + "grad_norm": 2.017159938812256, + "learning_rate": 5e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7659518718719482, + "num_tokens": 580523180.0, + "step": 22425 + }, + { + "epoch": 2.462771798813969, + "grad_norm": 2.2203164100646973, + "learning_rate": 5e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7499430775642395, + "num_tokens": 580548855.0, + "step": 22426 + }, + { + "epoch": 2.4628816165165826, + "grad_norm": 1.8954434394836426, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7561620473861694, + "num_tokens": 580579578.0, + "step": 22427 + }, + { + "epoch": 2.4629914342191963, + "grad_norm": 2.2002360820770264, + "learning_rate": 5e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7704433798789978, + "num_tokens": 580603236.0, + "step": 22428 + }, + { + "epoch": 2.4631012519218096, + "grad_norm": 1.8186813592910767, + "learning_rate": 5e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7647691369056702, + "num_tokens": 580634867.0, + "step": 22429 + }, + { + "epoch": 2.4632110696244234, + "grad_norm": 2.3615312576293945, + "learning_rate": 5e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.7735284566879272, + "num_tokens": 580654734.0, + "step": 22430 + }, + { + "epoch": 2.463320887327037, + "grad_norm": 2.4208076000213623, + "learning_rate": 5e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.759380578994751, + "num_tokens": 580674404.0, + "step": 22431 + }, + { + "epoch": 2.463430705029651, + "grad_norm": 2.024188995361328, + "learning_rate": 5e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7723311185836792, + "num_tokens": 580701587.0, + "step": 22432 + }, + { + "epoch": 2.4635405227322646, + "grad_norm": 2.365769147872925, + "learning_rate": 5e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.7877262830734253, + "num_tokens": 580721544.0, + "step": 22433 + }, + { + "epoch": 2.463650340434878, + "grad_norm": 1.9790369272232056, + "learning_rate": 5e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7554042339324951, + "num_tokens": 580751040.0, + "step": 22434 + }, + { + "epoch": 2.4637601581374917, + "grad_norm": 2.176276206970215, + "learning_rate": 5e-06, + "loss": 0.6393, + "mean_token_accuracy": 0.7874901294708252, + "num_tokens": 580776311.0, + "step": 22435 + }, + { + "epoch": 2.4638699758401055, + "grad_norm": 2.235011100769043, + "learning_rate": 5e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7460809946060181, + "num_tokens": 580802152.0, + "step": 22436 + }, + { + "epoch": 2.4639797935427192, + "grad_norm": 1.9423761367797852, + "learning_rate": 5e-06, + "loss": 0.661, + "mean_token_accuracy": 0.7757695913314819, + "num_tokens": 580830465.0, + "step": 22437 + }, + { + "epoch": 2.464089611245333, + "grad_norm": 2.177971601486206, + "learning_rate": 5e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7668715715408325, + "num_tokens": 580852783.0, + "step": 22438 + }, + { + "epoch": 2.4641994289479463, + "grad_norm": 2.2451367378234863, + "learning_rate": 5e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.761907160282135, + "num_tokens": 580876702.0, + "step": 22439 + }, + { + "epoch": 2.46430924665056, + "grad_norm": 2.1884453296661377, + "learning_rate": 5e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7687024474143982, + "num_tokens": 580899688.0, + "step": 22440 + }, + { + "epoch": 2.464419064353174, + "grad_norm": 1.8928914070129395, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7649562358856201, + "num_tokens": 580926677.0, + "step": 22441 + }, + { + "epoch": 2.4645288820557876, + "grad_norm": 1.746643304824829, + "learning_rate": 5e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.745901346206665, + "num_tokens": 580960563.0, + "step": 22442 + }, + { + "epoch": 2.464638699758401, + "grad_norm": 2.060312032699585, + "learning_rate": 5e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7322684526443481, + "num_tokens": 580988242.0, + "step": 22443 + }, + { + "epoch": 2.4647485174610146, + "grad_norm": 1.9679818153381348, + "learning_rate": 5e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7538365125656128, + "num_tokens": 581018116.0, + "step": 22444 + }, + { + "epoch": 2.4648583351636284, + "grad_norm": 2.060183525085449, + "learning_rate": 5e-06, + "loss": 0.6549, + "mean_token_accuracy": 0.783437967300415, + "num_tokens": 581042553.0, + "step": 22445 + }, + { + "epoch": 2.464968152866242, + "grad_norm": 1.890275239944458, + "learning_rate": 5e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7561349868774414, + "num_tokens": 581071069.0, + "step": 22446 + }, + { + "epoch": 2.4650779705688555, + "grad_norm": 1.8950408697128296, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7675651907920837, + "num_tokens": 581099716.0, + "step": 22447 + }, + { + "epoch": 2.4651877882714692, + "grad_norm": 2.090895891189575, + "learning_rate": 5e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.772071123123169, + "num_tokens": 581124878.0, + "step": 22448 + }, + { + "epoch": 2.465297605974083, + "grad_norm": 2.153590679168701, + "learning_rate": 5e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7564724683761597, + "num_tokens": 581148751.0, + "step": 22449 + }, + { + "epoch": 2.4654074236766967, + "grad_norm": 1.8980231285095215, + "learning_rate": 5e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7538266181945801, + "num_tokens": 581178003.0, + "step": 22450 + }, + { + "epoch": 2.4655172413793105, + "grad_norm": 2.4087860584259033, + "learning_rate": 5e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.7798835039138794, + "num_tokens": 581199424.0, + "step": 22451 + }, + { + "epoch": 2.465627059081924, + "grad_norm": 1.9565361738204956, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7572526931762695, + "num_tokens": 581224895.0, + "step": 22452 + }, + { + "epoch": 2.4657368767845376, + "grad_norm": 2.0732996463775635, + "learning_rate": 5e-06, + "loss": 0.718, + "mean_token_accuracy": 0.768722653388977, + "num_tokens": 581249931.0, + "step": 22453 + }, + { + "epoch": 2.4658466944871513, + "grad_norm": 2.0315933227539062, + "learning_rate": 5e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7780818939208984, + "num_tokens": 581274263.0, + "step": 22454 + }, + { + "epoch": 2.465956512189765, + "grad_norm": 2.0675227642059326, + "learning_rate": 5e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7588655948638916, + "num_tokens": 581297830.0, + "step": 22455 + }, + { + "epoch": 2.466066329892379, + "grad_norm": 2.2030553817749023, + "learning_rate": 5e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.7642477750778198, + "num_tokens": 581320317.0, + "step": 22456 + }, + { + "epoch": 2.466176147594992, + "grad_norm": 1.8764582872390747, + "learning_rate": 5e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.749573826789856, + "num_tokens": 581350411.0, + "step": 22457 + }, + { + "epoch": 2.466285965297606, + "grad_norm": 2.0069711208343506, + "learning_rate": 5e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7710847854614258, + "num_tokens": 581378802.0, + "step": 22458 + }, + { + "epoch": 2.4663957830002197, + "grad_norm": 2.06398344039917, + "learning_rate": 5e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7760732173919678, + "num_tokens": 581403447.0, + "step": 22459 + }, + { + "epoch": 2.4665056007028334, + "grad_norm": 2.0681941509246826, + "learning_rate": 5e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7477641105651855, + "num_tokens": 581430619.0, + "step": 22460 + }, + { + "epoch": 2.466615418405447, + "grad_norm": 2.0425713062286377, + "learning_rate": 5e-06, + "loss": 0.7105, + "mean_token_accuracy": 0.7685062885284424, + "num_tokens": 581458043.0, + "step": 22461 + }, + { + "epoch": 2.4667252361080605, + "grad_norm": 2.143975257873535, + "learning_rate": 5e-06, + "loss": 0.701, + "mean_token_accuracy": 0.7704854607582092, + "num_tokens": 581483138.0, + "step": 22462 + }, + { + "epoch": 2.4668350538106742, + "grad_norm": 2.3273234367370605, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7699975967407227, + "num_tokens": 581503099.0, + "step": 22463 + }, + { + "epoch": 2.466944871513288, + "grad_norm": 2.240658760070801, + "learning_rate": 5e-06, + "loss": 0.6703, + "mean_token_accuracy": 0.7805559635162354, + "num_tokens": 581525609.0, + "step": 22464 + }, + { + "epoch": 2.4670546892159018, + "grad_norm": 2.2303450107574463, + "learning_rate": 5e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.789548397064209, + "num_tokens": 581547762.0, + "step": 22465 + }, + { + "epoch": 2.467164506918515, + "grad_norm": 1.8502205610275269, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7624565362930298, + "num_tokens": 581576877.0, + "step": 22466 + }, + { + "epoch": 2.467274324621129, + "grad_norm": 1.9927926063537598, + "learning_rate": 5e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7526074051856995, + "num_tokens": 581603747.0, + "step": 22467 + }, + { + "epoch": 2.4673841423237426, + "grad_norm": 1.7249115705490112, + "learning_rate": 5e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7412925958633423, + "num_tokens": 581639775.0, + "step": 22468 + }, + { + "epoch": 2.4674939600263563, + "grad_norm": 2.3511223793029785, + "learning_rate": 5e-06, + "loss": 0.707, + "mean_token_accuracy": 0.7676572203636169, + "num_tokens": 581661077.0, + "step": 22469 + }, + { + "epoch": 2.4676037777289697, + "grad_norm": 2.042070150375366, + "learning_rate": 5e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7630100250244141, + "num_tokens": 581686729.0, + "step": 22470 + }, + { + "epoch": 2.4677135954315834, + "grad_norm": 2.2795166969299316, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7743659019470215, + "num_tokens": 581708080.0, + "step": 22471 + }, + { + "epoch": 2.467823413134197, + "grad_norm": 2.416409969329834, + "learning_rate": 5e-06, + "loss": 0.565, + "mean_token_accuracy": 0.8025371432304382, + "num_tokens": 581725704.0, + "step": 22472 + }, + { + "epoch": 2.467933230836811, + "grad_norm": 2.3277170658111572, + "learning_rate": 5e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7667175531387329, + "num_tokens": 581748576.0, + "step": 22473 + }, + { + "epoch": 2.4680430485394247, + "grad_norm": 2.0912837982177734, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7759264707565308, + "num_tokens": 581774700.0, + "step": 22474 + }, + { + "epoch": 2.468152866242038, + "grad_norm": 1.8605135679244995, + "learning_rate": 5e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.7493590712547302, + "num_tokens": 581807473.0, + "step": 22475 + }, + { + "epoch": 2.4682626839446518, + "grad_norm": 2.0288641452789307, + "learning_rate": 5e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7610599398612976, + "num_tokens": 581834356.0, + "step": 22476 + }, + { + "epoch": 2.4683725016472655, + "grad_norm": 1.9145056009292603, + "learning_rate": 5e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7633779048919678, + "num_tokens": 581866582.0, + "step": 22477 + }, + { + "epoch": 2.4684823193498793, + "grad_norm": 2.1038379669189453, + "learning_rate": 5e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.758698582649231, + "num_tokens": 581890095.0, + "step": 22478 + }, + { + "epoch": 2.468592137052493, + "grad_norm": 1.9627976417541504, + "learning_rate": 5e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7638171315193176, + "num_tokens": 581918412.0, + "step": 22479 + }, + { + "epoch": 2.4687019547551063, + "grad_norm": 2.2387430667877197, + "learning_rate": 5e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7676255106925964, + "num_tokens": 581942177.0, + "step": 22480 + }, + { + "epoch": 2.46881177245772, + "grad_norm": 2.1776413917541504, + "learning_rate": 5e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7515879273414612, + "num_tokens": 581966912.0, + "step": 22481 + }, + { + "epoch": 2.468921590160334, + "grad_norm": 2.2311391830444336, + "learning_rate": 5e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7711747288703918, + "num_tokens": 581990315.0, + "step": 22482 + }, + { + "epoch": 2.4690314078629476, + "grad_norm": 2.1200859546661377, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7668747901916504, + "num_tokens": 582018106.0, + "step": 22483 + }, + { + "epoch": 2.4691412255655614, + "grad_norm": 2.358560085296631, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7699902653694153, + "num_tokens": 582040287.0, + "step": 22484 + }, + { + "epoch": 2.4692510432681747, + "grad_norm": 2.079874277114868, + "learning_rate": 5e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7681599259376526, + "num_tokens": 582066496.0, + "step": 22485 + }, + { + "epoch": 2.4693608609707884, + "grad_norm": 2.158067464828491, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7693281173706055, + "num_tokens": 582090046.0, + "step": 22486 + }, + { + "epoch": 2.469470678673402, + "grad_norm": 2.163889169692993, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7556818723678589, + "num_tokens": 582115770.0, + "step": 22487 + }, + { + "epoch": 2.469580496376016, + "grad_norm": 2.2327792644500732, + "learning_rate": 5e-06, + "loss": 0.628, + "mean_token_accuracy": 0.7929016351699829, + "num_tokens": 582136825.0, + "step": 22488 + }, + { + "epoch": 2.4696903140786297, + "grad_norm": 2.0456342697143555, + "learning_rate": 5e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7685869336128235, + "num_tokens": 582162263.0, + "step": 22489 + }, + { + "epoch": 2.469800131781243, + "grad_norm": 2.0494658946990967, + "learning_rate": 5e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7587052583694458, + "num_tokens": 582189910.0, + "step": 22490 + }, + { + "epoch": 2.469909949483857, + "grad_norm": 1.9753572940826416, + "learning_rate": 5e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7598966956138611, + "num_tokens": 582218531.0, + "step": 22491 + }, + { + "epoch": 2.4700197671864705, + "grad_norm": 2.2419073581695557, + "learning_rate": 5e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.765036940574646, + "num_tokens": 582240764.0, + "step": 22492 + }, + { + "epoch": 2.4701295848890843, + "grad_norm": 1.8821028470993042, + "learning_rate": 5e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7559674978256226, + "num_tokens": 582272212.0, + "step": 22493 + }, + { + "epoch": 2.4702394025916976, + "grad_norm": 2.159296751022339, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7553016543388367, + "num_tokens": 582295825.0, + "step": 22494 + }, + { + "epoch": 2.4703492202943114, + "grad_norm": 1.970831274986267, + "learning_rate": 5e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7455017566680908, + "num_tokens": 582325192.0, + "step": 22495 + }, + { + "epoch": 2.470459037996925, + "grad_norm": 2.0967655181884766, + "learning_rate": 5e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7510702610015869, + "num_tokens": 582353284.0, + "step": 22496 + }, + { + "epoch": 2.470568855699539, + "grad_norm": 2.1941986083984375, + "learning_rate": 5e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7602282762527466, + "num_tokens": 582379048.0, + "step": 22497 + }, + { + "epoch": 2.470678673402152, + "grad_norm": 2.143109083175659, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7643598318099976, + "num_tokens": 582404074.0, + "step": 22498 + }, + { + "epoch": 2.470788491104766, + "grad_norm": 2.147038698196411, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7696981430053711, + "num_tokens": 582429240.0, + "step": 22499 + }, + { + "epoch": 2.4708983088073797, + "grad_norm": 2.050863265991211, + "learning_rate": 5e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7542937994003296, + "num_tokens": 582455508.0, + "step": 22500 + }, + { + "epoch": 2.4710081265099935, + "grad_norm": 2.0419321060180664, + "learning_rate": 5e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.7806967496871948, + "num_tokens": 582480330.0, + "step": 22501 + }, + { + "epoch": 2.471117944212607, + "grad_norm": 1.9886592626571655, + "learning_rate": 5e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7608876824378967, + "num_tokens": 582506603.0, + "step": 22502 + }, + { + "epoch": 2.4712277619152205, + "grad_norm": 2.108896017074585, + "learning_rate": 5e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7700209617614746, + "num_tokens": 582533889.0, + "step": 22503 + }, + { + "epoch": 2.4713375796178343, + "grad_norm": 2.0881898403167725, + "learning_rate": 5e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.764762818813324, + "num_tokens": 582559745.0, + "step": 22504 + }, + { + "epoch": 2.471447397320448, + "grad_norm": 2.151482343673706, + "learning_rate": 5e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.7781150937080383, + "num_tokens": 582584285.0, + "step": 22505 + }, + { + "epoch": 2.471557215023062, + "grad_norm": 2.0052573680877686, + "learning_rate": 5e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7481813430786133, + "num_tokens": 582616120.0, + "step": 22506 + }, + { + "epoch": 2.4716670327256756, + "grad_norm": 1.924020767211914, + "learning_rate": 5e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.7530887722969055, + "num_tokens": 582643917.0, + "step": 22507 + }, + { + "epoch": 2.471776850428289, + "grad_norm": 2.157482385635376, + "learning_rate": 5e-06, + "loss": 0.6763, + "mean_token_accuracy": 0.7713512778282166, + "num_tokens": 582669056.0, + "step": 22508 + }, + { + "epoch": 2.4718866681309026, + "grad_norm": 2.079641580581665, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7663642168045044, + "num_tokens": 582694800.0, + "step": 22509 + }, + { + "epoch": 2.4719964858335164, + "grad_norm": 1.99227774143219, + "learning_rate": 5e-06, + "loss": 0.6303, + "mean_token_accuracy": 0.7912421226501465, + "num_tokens": 582720611.0, + "step": 22510 + }, + { + "epoch": 2.47210630353613, + "grad_norm": 2.0697484016418457, + "learning_rate": 5e-06, + "loss": 0.6967, + "mean_token_accuracy": 0.7667877674102783, + "num_tokens": 582746607.0, + "step": 22511 + }, + { + "epoch": 2.472216121238744, + "grad_norm": 1.9786773920059204, + "learning_rate": 5e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7442489862442017, + "num_tokens": 582776424.0, + "step": 22512 + }, + { + "epoch": 2.472325938941357, + "grad_norm": 1.958950161933899, + "learning_rate": 5e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7484416961669922, + "num_tokens": 582808133.0, + "step": 22513 + }, + { + "epoch": 2.472435756643971, + "grad_norm": 2.0181782245635986, + "learning_rate": 5e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7581273317337036, + "num_tokens": 582835423.0, + "step": 22514 + }, + { + "epoch": 2.4725455743465847, + "grad_norm": 1.9863954782485962, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7455379962921143, + "num_tokens": 582863647.0, + "step": 22515 + }, + { + "epoch": 2.4726553920491985, + "grad_norm": 2.399054527282715, + "learning_rate": 5e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.7854658365249634, + "num_tokens": 582885615.0, + "step": 22516 + }, + { + "epoch": 2.472765209751812, + "grad_norm": 1.9491381645202637, + "learning_rate": 5e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7693240642547607, + "num_tokens": 582913349.0, + "step": 22517 + }, + { + "epoch": 2.4728750274544256, + "grad_norm": 2.28718638420105, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7739027738571167, + "num_tokens": 582935270.0, + "step": 22518 + }, + { + "epoch": 2.4729848451570393, + "grad_norm": 2.545306444168091, + "learning_rate": 5e-06, + "loss": 0.6016, + "mean_token_accuracy": 0.8047993183135986, + "num_tokens": 582953204.0, + "step": 22519 + }, + { + "epoch": 2.473094662859653, + "grad_norm": 2.2911694049835205, + "learning_rate": 5e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.763529896736145, + "num_tokens": 582974702.0, + "step": 22520 + }, + { + "epoch": 2.473204480562267, + "grad_norm": 2.3154311180114746, + "learning_rate": 5e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7642216086387634, + "num_tokens": 582998905.0, + "step": 22521 + }, + { + "epoch": 2.47331429826488, + "grad_norm": 2.065983295440674, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7662268877029419, + "num_tokens": 583025461.0, + "step": 22522 + }, + { + "epoch": 2.473424115967494, + "grad_norm": 2.3965883255004883, + "learning_rate": 5e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.7699443101882935, + "num_tokens": 583045960.0, + "step": 22523 + }, + { + "epoch": 2.4735339336701077, + "grad_norm": 1.9629850387573242, + "learning_rate": 5e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.7814865112304688, + "num_tokens": 583074405.0, + "step": 22524 + }, + { + "epoch": 2.4736437513727214, + "grad_norm": 1.933247447013855, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7527681589126587, + "num_tokens": 583104621.0, + "step": 22525 + }, + { + "epoch": 2.4737535690753347, + "grad_norm": 2.1500308513641357, + "learning_rate": 5e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7580041289329529, + "num_tokens": 583133241.0, + "step": 22526 + }, + { + "epoch": 2.4738633867779485, + "grad_norm": 1.9348009824752808, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7668286561965942, + "num_tokens": 583163355.0, + "step": 22527 + }, + { + "epoch": 2.4739732044805622, + "grad_norm": 2.1665029525756836, + "learning_rate": 5e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.7716102600097656, + "num_tokens": 583187216.0, + "step": 22528 + }, + { + "epoch": 2.474083022183176, + "grad_norm": 2.0410263538360596, + "learning_rate": 5e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.7799310088157654, + "num_tokens": 583213046.0, + "step": 22529 + }, + { + "epoch": 2.4741928398857898, + "grad_norm": 2.005589008331299, + "learning_rate": 5e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7607120275497437, + "num_tokens": 583238233.0, + "step": 22530 + }, + { + "epoch": 2.474302657588403, + "grad_norm": 2.0743536949157715, + "learning_rate": 5e-06, + "loss": 0.5657, + "mean_token_accuracy": 0.807490348815918, + "num_tokens": 583260125.0, + "step": 22531 + }, + { + "epoch": 2.474412475291017, + "grad_norm": 2.2342309951782227, + "learning_rate": 5e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.7839727401733398, + "num_tokens": 583282386.0, + "step": 22532 + }, + { + "epoch": 2.4745222929936306, + "grad_norm": 2.367356300354004, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7575775384902954, + "num_tokens": 583305017.0, + "step": 22533 + }, + { + "epoch": 2.4746321106962443, + "grad_norm": 2.2119603157043457, + "learning_rate": 5e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7474395036697388, + "num_tokens": 583330814.0, + "step": 22534 + }, + { + "epoch": 2.474741928398858, + "grad_norm": 1.9768272638320923, + "learning_rate": 5e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7286167144775391, + "num_tokens": 583361314.0, + "step": 22535 + }, + { + "epoch": 2.4748517461014714, + "grad_norm": 2.0025925636291504, + "learning_rate": 5e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7645034790039062, + "num_tokens": 583386750.0, + "step": 22536 + }, + { + "epoch": 2.474961563804085, + "grad_norm": 2.3029539585113525, + "learning_rate": 5e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7767820954322815, + "num_tokens": 583407262.0, + "step": 22537 + }, + { + "epoch": 2.475071381506699, + "grad_norm": 1.99002206325531, + "learning_rate": 5e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7634983062744141, + "num_tokens": 583434741.0, + "step": 22538 + }, + { + "epoch": 2.4751811992093127, + "grad_norm": 2.362058401107788, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7708156108856201, + "num_tokens": 583456824.0, + "step": 22539 + }, + { + "epoch": 2.4752910169119264, + "grad_norm": 2.2632124423980713, + "learning_rate": 5e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7675105929374695, + "num_tokens": 583480971.0, + "step": 22540 + }, + { + "epoch": 2.4754008346145397, + "grad_norm": 2.1427817344665527, + "learning_rate": 5e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7527995109558105, + "num_tokens": 583507085.0, + "step": 22541 + }, + { + "epoch": 2.4755106523171535, + "grad_norm": 2.159308433532715, + "learning_rate": 5e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7740098834037781, + "num_tokens": 583530377.0, + "step": 22542 + }, + { + "epoch": 2.4756204700197673, + "grad_norm": 2.085930109024048, + "learning_rate": 5e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7563806772232056, + "num_tokens": 583555448.0, + "step": 22543 + }, + { + "epoch": 2.475730287722381, + "grad_norm": 1.9809242486953735, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7596380710601807, + "num_tokens": 583580107.0, + "step": 22544 + }, + { + "epoch": 2.4758401054249943, + "grad_norm": 2.0038034915924072, + "learning_rate": 5e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.7513789534568787, + "num_tokens": 583608722.0, + "step": 22545 + }, + { + "epoch": 2.475949923127608, + "grad_norm": 1.9034786224365234, + "learning_rate": 5e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.7674272060394287, + "num_tokens": 583641280.0, + "step": 22546 + }, + { + "epoch": 2.476059740830222, + "grad_norm": 2.0265445709228516, + "learning_rate": 5e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7593317627906799, + "num_tokens": 583669755.0, + "step": 22547 + }, + { + "epoch": 2.4761695585328356, + "grad_norm": 2.023536443710327, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7655554413795471, + "num_tokens": 583697476.0, + "step": 22548 + }, + { + "epoch": 2.476279376235449, + "grad_norm": 2.0180504322052, + "learning_rate": 5e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7545391321182251, + "num_tokens": 583724093.0, + "step": 22549 + }, + { + "epoch": 2.4763891939380627, + "grad_norm": 1.9563887119293213, + "learning_rate": 5e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.7722753882408142, + "num_tokens": 583753259.0, + "step": 22550 + }, + { + "epoch": 2.4764990116406764, + "grad_norm": 2.4133591651916504, + "learning_rate": 5e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7719402313232422, + "num_tokens": 583773985.0, + "step": 22551 + }, + { + "epoch": 2.47660882934329, + "grad_norm": 1.8594028949737549, + "learning_rate": 5e-06, + "loss": 0.6824, + "mean_token_accuracy": 0.7724188566207886, + "num_tokens": 583803574.0, + "step": 22552 + }, + { + "epoch": 2.476718647045904, + "grad_norm": 2.3203072547912598, + "learning_rate": 5e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.7848682999610901, + "num_tokens": 583826178.0, + "step": 22553 + }, + { + "epoch": 2.4768284647485173, + "grad_norm": 1.9022257328033447, + "learning_rate": 5e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7571144104003906, + "num_tokens": 583859303.0, + "step": 22554 + }, + { + "epoch": 2.476938282451131, + "grad_norm": 2.188791275024414, + "learning_rate": 5e-06, + "loss": 0.6891, + "mean_token_accuracy": 0.7696810960769653, + "num_tokens": 583885492.0, + "step": 22555 + }, + { + "epoch": 2.4770481001537448, + "grad_norm": 2.4620985984802246, + "learning_rate": 5e-06, + "loss": 0.6268, + "mean_token_accuracy": 0.7898815870285034, + "num_tokens": 583903550.0, + "step": 22556 + }, + { + "epoch": 2.4771579178563585, + "grad_norm": 2.225698709487915, + "learning_rate": 5e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.7723581790924072, + "num_tokens": 583927273.0, + "step": 22557 + }, + { + "epoch": 2.4772677355589723, + "grad_norm": 1.9519649744033813, + "learning_rate": 5e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7568858861923218, + "num_tokens": 583957676.0, + "step": 22558 + }, + { + "epoch": 2.4773775532615856, + "grad_norm": 2.117082118988037, + "learning_rate": 5e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.7687519788742065, + "num_tokens": 583981629.0, + "step": 22559 + }, + { + "epoch": 2.4774873709641994, + "grad_norm": 1.8146804571151733, + "learning_rate": 5e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7353118062019348, + "num_tokens": 584014939.0, + "step": 22560 + }, + { + "epoch": 2.477597188666813, + "grad_norm": 1.9522141218185425, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7535718679428101, + "num_tokens": 584042699.0, + "step": 22561 + }, + { + "epoch": 2.477707006369427, + "grad_norm": 1.9633146524429321, + "learning_rate": 5e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7522938847541809, + "num_tokens": 584071336.0, + "step": 22562 + }, + { + "epoch": 2.4778168240720406, + "grad_norm": 2.0863990783691406, + "learning_rate": 5e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.7743878364562988, + "num_tokens": 584095503.0, + "step": 22563 + }, + { + "epoch": 2.477926641774654, + "grad_norm": 2.4199483394622803, + "learning_rate": 5e-06, + "loss": 0.6221, + "mean_token_accuracy": 0.787216067314148, + "num_tokens": 584116045.0, + "step": 22564 + }, + { + "epoch": 2.4780364594772677, + "grad_norm": 1.8727219104766846, + "learning_rate": 5e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7744423151016235, + "num_tokens": 584144628.0, + "step": 22565 + }, + { + "epoch": 2.4781462771798815, + "grad_norm": 1.9360899925231934, + "learning_rate": 5e-06, + "loss": 0.6258, + "mean_token_accuracy": 0.7929661273956299, + "num_tokens": 584169725.0, + "step": 22566 + }, + { + "epoch": 2.478256094882495, + "grad_norm": 2.0324881076812744, + "learning_rate": 5e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7472987174987793, + "num_tokens": 584196708.0, + "step": 22567 + }, + { + "epoch": 2.478365912585109, + "grad_norm": 1.9383571147918701, + "learning_rate": 5e-06, + "loss": 0.6432, + "mean_token_accuracy": 0.7898897528648376, + "num_tokens": 584222500.0, + "step": 22568 + }, + { + "epoch": 2.4784757302877223, + "grad_norm": 2.136324405670166, + "learning_rate": 5e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7592756748199463, + "num_tokens": 584247334.0, + "step": 22569 + }, + { + "epoch": 2.478585547990336, + "grad_norm": 2.1697890758514404, + "learning_rate": 5e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.7709605693817139, + "num_tokens": 584271733.0, + "step": 22570 + }, + { + "epoch": 2.47869536569295, + "grad_norm": 1.9854857921600342, + "learning_rate": 5e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.755766749382019, + "num_tokens": 584301137.0, + "step": 22571 + }, + { + "epoch": 2.4788051833955635, + "grad_norm": 1.6844843626022339, + "learning_rate": 5e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7598150372505188, + "num_tokens": 584338099.0, + "step": 22572 + }, + { + "epoch": 2.478915001098177, + "grad_norm": 2.0564768314361572, + "learning_rate": 5e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.7639001607894897, + "num_tokens": 584363765.0, + "step": 22573 + }, + { + "epoch": 2.4790248188007906, + "grad_norm": 2.1541430950164795, + "learning_rate": 5e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.7707614898681641, + "num_tokens": 584387206.0, + "step": 22574 + }, + { + "epoch": 2.4791346365034044, + "grad_norm": 2.1894161701202393, + "learning_rate": 5e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7673825025558472, + "num_tokens": 584409716.0, + "step": 22575 + }, + { + "epoch": 2.479244454206018, + "grad_norm": 2.197943687438965, + "learning_rate": 5e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.7822396755218506, + "num_tokens": 584432865.0, + "step": 22576 + }, + { + "epoch": 2.4793542719086314, + "grad_norm": 2.0510823726654053, + "learning_rate": 5e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7618832588195801, + "num_tokens": 584459516.0, + "step": 22577 + }, + { + "epoch": 2.479464089611245, + "grad_norm": 1.7960267066955566, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7549386024475098, + "num_tokens": 584490856.0, + "step": 22578 + }, + { + "epoch": 2.479573907313859, + "grad_norm": 2.070338249206543, + "learning_rate": 5e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.7899251580238342, + "num_tokens": 584517247.0, + "step": 22579 + }, + { + "epoch": 2.4796837250164727, + "grad_norm": 1.979295253753662, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7512456178665161, + "num_tokens": 584547483.0, + "step": 22580 + }, + { + "epoch": 2.4797935427190865, + "grad_norm": 1.9724122285842896, + "learning_rate": 5e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.7661241888999939, + "num_tokens": 584576607.0, + "step": 22581 + }, + { + "epoch": 2.4799033604217, + "grad_norm": 2.093815803527832, + "learning_rate": 5e-06, + "loss": 0.6453, + "mean_token_accuracy": 0.7810956835746765, + "num_tokens": 584601007.0, + "step": 22582 + }, + { + "epoch": 2.4800131781243135, + "grad_norm": 1.9406993389129639, + "learning_rate": 5e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7671116590499878, + "num_tokens": 584631673.0, + "step": 22583 + }, + { + "epoch": 2.4801229958269273, + "grad_norm": 2.342087745666504, + "learning_rate": 5e-06, + "loss": 0.6396, + "mean_token_accuracy": 0.7846308946609497, + "num_tokens": 584650818.0, + "step": 22584 + }, + { + "epoch": 2.480232813529541, + "grad_norm": 2.364454507827759, + "learning_rate": 5e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7690901756286621, + "num_tokens": 584672251.0, + "step": 22585 + }, + { + "epoch": 2.480342631232155, + "grad_norm": 2.1757736206054688, + "learning_rate": 5e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7687973976135254, + "num_tokens": 584696258.0, + "step": 22586 + }, + { + "epoch": 2.480452448934768, + "grad_norm": 2.2578659057617188, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7739702463150024, + "num_tokens": 584721173.0, + "step": 22587 + }, + { + "epoch": 2.480562266637382, + "grad_norm": 2.094533920288086, + "learning_rate": 5e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7390902042388916, + "num_tokens": 584748303.0, + "step": 22588 + }, + { + "epoch": 2.4806720843399956, + "grad_norm": 2.1684517860412598, + "learning_rate": 5e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7455055713653564, + "num_tokens": 584775176.0, + "step": 22589 + }, + { + "epoch": 2.4807819020426094, + "grad_norm": 1.9754561185836792, + "learning_rate": 5e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7500872015953064, + "num_tokens": 584806815.0, + "step": 22590 + }, + { + "epoch": 2.480891719745223, + "grad_norm": 2.348196268081665, + "learning_rate": 5e-06, + "loss": 0.6224, + "mean_token_accuracy": 0.7855489253997803, + "num_tokens": 584827664.0, + "step": 22591 + }, + { + "epoch": 2.4810015374478365, + "grad_norm": 2.1953113079071045, + "learning_rate": 5e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.7834704518318176, + "num_tokens": 584853463.0, + "step": 22592 + }, + { + "epoch": 2.4811113551504502, + "grad_norm": 2.363013982772827, + "learning_rate": 5e-06, + "loss": 0.6527, + "mean_token_accuracy": 0.7875165939331055, + "num_tokens": 584875663.0, + "step": 22593 + }, + { + "epoch": 2.481221172853064, + "grad_norm": 1.939892292022705, + "learning_rate": 5e-06, + "loss": 0.7203, + "mean_token_accuracy": 0.7675243616104126, + "num_tokens": 584906610.0, + "step": 22594 + }, + { + "epoch": 2.4813309905556777, + "grad_norm": 1.835681676864624, + "learning_rate": 5e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7516292333602905, + "num_tokens": 584937672.0, + "step": 22595 + }, + { + "epoch": 2.481440808258291, + "grad_norm": 1.8541944026947021, + "learning_rate": 5e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7583448886871338, + "num_tokens": 584968490.0, + "step": 22596 + }, + { + "epoch": 2.481550625960905, + "grad_norm": 1.904565691947937, + "learning_rate": 5e-06, + "loss": 0.707, + "mean_token_accuracy": 0.7686281204223633, + "num_tokens": 584996544.0, + "step": 22597 + }, + { + "epoch": 2.4816604436635186, + "grad_norm": 2.1651651859283447, + "learning_rate": 5e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7330055832862854, + "num_tokens": 585024275.0, + "step": 22598 + }, + { + "epoch": 2.4817702613661323, + "grad_norm": 2.1023411750793457, + "learning_rate": 5e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.756110668182373, + "num_tokens": 585053668.0, + "step": 22599 + }, + { + "epoch": 2.4818800790687456, + "grad_norm": 2.2326998710632324, + "learning_rate": 5e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7645482420921326, + "num_tokens": 585076207.0, + "step": 22600 + }, + { + "epoch": 2.4819898967713594, + "grad_norm": 2.2337193489074707, + "learning_rate": 5e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7612453699111938, + "num_tokens": 585099836.0, + "step": 22601 + }, + { + "epoch": 2.482099714473973, + "grad_norm": 2.0659401416778564, + "learning_rate": 5e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7555868029594421, + "num_tokens": 585129794.0, + "step": 22602 + }, + { + "epoch": 2.482209532176587, + "grad_norm": 2.2225050926208496, + "learning_rate": 5e-06, + "loss": 0.6507, + "mean_token_accuracy": 0.7917706966400146, + "num_tokens": 585153826.0, + "step": 22603 + }, + { + "epoch": 2.4823193498792007, + "grad_norm": 2.1225452423095703, + "learning_rate": 5e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7558350563049316, + "num_tokens": 585181393.0, + "step": 22604 + }, + { + "epoch": 2.482429167581814, + "grad_norm": 2.1755902767181396, + "learning_rate": 5e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.779728889465332, + "num_tokens": 585205778.0, + "step": 22605 + }, + { + "epoch": 2.4825389852844277, + "grad_norm": 1.919307827949524, + "learning_rate": 5e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7460278272628784, + "num_tokens": 585237954.0, + "step": 22606 + }, + { + "epoch": 2.4826488029870415, + "grad_norm": 2.051987648010254, + "learning_rate": 5e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7542955279350281, + "num_tokens": 585264599.0, + "step": 22607 + }, + { + "epoch": 2.4827586206896552, + "grad_norm": 1.9898991584777832, + "learning_rate": 5e-06, + "loss": 0.6298, + "mean_token_accuracy": 0.7920141220092773, + "num_tokens": 585290769.0, + "step": 22608 + }, + { + "epoch": 2.482868438392269, + "grad_norm": 2.343226194381714, + "learning_rate": 5e-06, + "loss": 0.666, + "mean_token_accuracy": 0.7765734791755676, + "num_tokens": 585311796.0, + "step": 22609 + }, + { + "epoch": 2.4829782560948823, + "grad_norm": 2.200982093811035, + "learning_rate": 5e-06, + "loss": 0.689, + "mean_token_accuracy": 0.7750144004821777, + "num_tokens": 585334826.0, + "step": 22610 + }, + { + "epoch": 2.483088073797496, + "grad_norm": 2.2377712726593018, + "learning_rate": 5e-06, + "loss": 0.6499, + "mean_token_accuracy": 0.7835438251495361, + "num_tokens": 585357342.0, + "step": 22611 + }, + { + "epoch": 2.48319789150011, + "grad_norm": 2.217022657394409, + "learning_rate": 5e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7674712538719177, + "num_tokens": 585380298.0, + "step": 22612 + }, + { + "epoch": 2.4833077092027236, + "grad_norm": 2.2425296306610107, + "learning_rate": 5e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7543975114822388, + "num_tokens": 585403141.0, + "step": 22613 + }, + { + "epoch": 2.4834175269053373, + "grad_norm": 2.009551763534546, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7657668590545654, + "num_tokens": 585431891.0, + "step": 22614 + }, + { + "epoch": 2.4835273446079507, + "grad_norm": 2.140200614929199, + "learning_rate": 5e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.774085283279419, + "num_tokens": 585455375.0, + "step": 22615 + }, + { + "epoch": 2.4836371623105644, + "grad_norm": 2.1858887672424316, + "learning_rate": 5e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7628087997436523, + "num_tokens": 585479475.0, + "step": 22616 + }, + { + "epoch": 2.483746980013178, + "grad_norm": 2.2176382541656494, + "learning_rate": 5e-06, + "loss": 0.656, + "mean_token_accuracy": 0.7802932262420654, + "num_tokens": 585502190.0, + "step": 22617 + }, + { + "epoch": 2.483856797715792, + "grad_norm": 1.8759472370147705, + "learning_rate": 5e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7494188547134399, + "num_tokens": 585531120.0, + "step": 22618 + }, + { + "epoch": 2.4839666154184057, + "grad_norm": 2.3187620639801025, + "learning_rate": 5e-06, + "loss": 0.6061, + "mean_token_accuracy": 0.7954928874969482, + "num_tokens": 585550340.0, + "step": 22619 + }, + { + "epoch": 2.484076433121019, + "grad_norm": 2.1134047508239746, + "learning_rate": 5e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7801905870437622, + "num_tokens": 585574362.0, + "step": 22620 + }, + { + "epoch": 2.4841862508236328, + "grad_norm": 2.095925807952881, + "learning_rate": 5e-06, + "loss": 0.5988, + "mean_token_accuracy": 0.7962369918823242, + "num_tokens": 585596584.0, + "step": 22621 + }, + { + "epoch": 2.4842960685262465, + "grad_norm": 2.0503642559051514, + "learning_rate": 5e-06, + "loss": 0.7809, + "mean_token_accuracy": 0.7489194869995117, + "num_tokens": 585623182.0, + "step": 22622 + }, + { + "epoch": 2.4844058862288603, + "grad_norm": 1.7449383735656738, + "learning_rate": 5e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7575220465660095, + "num_tokens": 585658362.0, + "step": 22623 + }, + { + "epoch": 2.4845157039314736, + "grad_norm": 2.209554433822632, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7738946676254272, + "num_tokens": 585681878.0, + "step": 22624 + }, + { + "epoch": 2.4846255216340873, + "grad_norm": 2.0640835762023926, + "learning_rate": 5e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7565074563026428, + "num_tokens": 585708144.0, + "step": 22625 + }, + { + "epoch": 2.484735339336701, + "grad_norm": 2.218181610107422, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7644610404968262, + "num_tokens": 585730801.0, + "step": 22626 + }, + { + "epoch": 2.484845157039315, + "grad_norm": 2.3158233165740967, + "learning_rate": 5e-06, + "loss": 0.6434, + "mean_token_accuracy": 0.7858235836029053, + "num_tokens": 585750536.0, + "step": 22627 + }, + { + "epoch": 2.484954974741928, + "grad_norm": 2.08805251121521, + "learning_rate": 5e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7655386924743652, + "num_tokens": 585774367.0, + "step": 22628 + }, + { + "epoch": 2.485064792444542, + "grad_norm": 1.9232702255249023, + "learning_rate": 5e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7345829010009766, + "num_tokens": 585804769.0, + "step": 22629 + }, + { + "epoch": 2.4851746101471557, + "grad_norm": 2.213416576385498, + "learning_rate": 5e-06, + "loss": 0.5985, + "mean_token_accuracy": 0.795509934425354, + "num_tokens": 585827229.0, + "step": 22630 + }, + { + "epoch": 2.4852844278497694, + "grad_norm": 2.1120312213897705, + "learning_rate": 5e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7425941228866577, + "num_tokens": 585853064.0, + "step": 22631 + }, + { + "epoch": 2.485394245552383, + "grad_norm": 2.0310275554656982, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7681236863136292, + "num_tokens": 585881297.0, + "step": 22632 + }, + { + "epoch": 2.4855040632549965, + "grad_norm": 2.2510249614715576, + "learning_rate": 5e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7593202590942383, + "num_tokens": 585903536.0, + "step": 22633 + }, + { + "epoch": 2.4856138809576103, + "grad_norm": 2.0437099933624268, + "learning_rate": 5e-06, + "loss": 0.7, + "mean_token_accuracy": 0.7662188410758972, + "num_tokens": 585928993.0, + "step": 22634 + }, + { + "epoch": 2.485723698660224, + "grad_norm": 2.045044183731079, + "learning_rate": 5e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.7850065231323242, + "num_tokens": 585956486.0, + "step": 22635 + }, + { + "epoch": 2.485833516362838, + "grad_norm": 2.1670196056365967, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.752033531665802, + "num_tokens": 585982608.0, + "step": 22636 + }, + { + "epoch": 2.4859433340654515, + "grad_norm": 1.890669584274292, + "learning_rate": 5e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7484884262084961, + "num_tokens": 586013239.0, + "step": 22637 + }, + { + "epoch": 2.486053151768065, + "grad_norm": 1.811650276184082, + "learning_rate": 5e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7351206541061401, + "num_tokens": 586044213.0, + "step": 22638 + }, + { + "epoch": 2.4861629694706786, + "grad_norm": 2.433919668197632, + "learning_rate": 5e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.7752373814582825, + "num_tokens": 586062943.0, + "step": 22639 + }, + { + "epoch": 2.4862727871732924, + "grad_norm": 2.203138828277588, + "learning_rate": 5e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7413795590400696, + "num_tokens": 586087326.0, + "step": 22640 + }, + { + "epoch": 2.486382604875906, + "grad_norm": 2.373540163040161, + "learning_rate": 5e-06, + "loss": 0.622, + "mean_token_accuracy": 0.7972261905670166, + "num_tokens": 586106417.0, + "step": 22641 + }, + { + "epoch": 2.48649242257852, + "grad_norm": 2.2422924041748047, + "learning_rate": 5e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7612987756729126, + "num_tokens": 586128824.0, + "step": 22642 + }, + { + "epoch": 2.486602240281133, + "grad_norm": 2.128713369369507, + "learning_rate": 5e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7485849857330322, + "num_tokens": 586154111.0, + "step": 22643 + }, + { + "epoch": 2.486712057983747, + "grad_norm": 1.967697262763977, + "learning_rate": 5e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7688751220703125, + "num_tokens": 586183411.0, + "step": 22644 + }, + { + "epoch": 2.4868218756863607, + "grad_norm": 2.2210938930511475, + "learning_rate": 5e-06, + "loss": 0.6544, + "mean_token_accuracy": 0.7801516056060791, + "num_tokens": 586206614.0, + "step": 22645 + }, + { + "epoch": 2.4869316933889745, + "grad_norm": 2.0450565814971924, + "learning_rate": 5e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7708216905593872, + "num_tokens": 586234710.0, + "step": 22646 + }, + { + "epoch": 2.4870415110915878, + "grad_norm": 2.2090368270874023, + "learning_rate": 5e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7359857559204102, + "num_tokens": 586263194.0, + "step": 22647 + }, + { + "epoch": 2.4871513287942015, + "grad_norm": 1.9463131427764893, + "learning_rate": 5e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7566407918930054, + "num_tokens": 586292280.0, + "step": 22648 + }, + { + "epoch": 2.4872611464968153, + "grad_norm": 2.153712749481201, + "learning_rate": 5e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7614280581474304, + "num_tokens": 586317258.0, + "step": 22649 + }, + { + "epoch": 2.487370964199429, + "grad_norm": 2.221735715866089, + "learning_rate": 5e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.7702264189720154, + "num_tokens": 586342050.0, + "step": 22650 + }, + { + "epoch": 2.487480781902043, + "grad_norm": 2.203415870666504, + "learning_rate": 5e-06, + "loss": 0.639, + "mean_token_accuracy": 0.7903293371200562, + "num_tokens": 586364914.0, + "step": 22651 + }, + { + "epoch": 2.487590599604656, + "grad_norm": 2.1185989379882812, + "learning_rate": 5e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7504903078079224, + "num_tokens": 586389649.0, + "step": 22652 + }, + { + "epoch": 2.48770041730727, + "grad_norm": 2.0220584869384766, + "learning_rate": 5e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7695296406745911, + "num_tokens": 586417231.0, + "step": 22653 + }, + { + "epoch": 2.4878102350098836, + "grad_norm": 1.921812891960144, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7701421976089478, + "num_tokens": 586446394.0, + "step": 22654 + }, + { + "epoch": 2.4879200527124974, + "grad_norm": 2.179136037826538, + "learning_rate": 5e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7739712595939636, + "num_tokens": 586469521.0, + "step": 22655 + }, + { + "epoch": 2.4880298704151107, + "grad_norm": 2.430068016052246, + "learning_rate": 5e-06, + "loss": 0.6163, + "mean_token_accuracy": 0.7987420558929443, + "num_tokens": 586488921.0, + "step": 22656 + }, + { + "epoch": 2.4881396881177245, + "grad_norm": 2.5267186164855957, + "learning_rate": 5e-06, + "loss": 0.7535, + "mean_token_accuracy": 0.7582629323005676, + "num_tokens": 586508497.0, + "step": 22657 + }, + { + "epoch": 2.488249505820338, + "grad_norm": 2.0372254848480225, + "learning_rate": 5e-06, + "loss": 0.6631, + "mean_token_accuracy": 0.7844353914260864, + "num_tokens": 586533645.0, + "step": 22658 + }, + { + "epoch": 2.488359323522952, + "grad_norm": 2.099208354949951, + "learning_rate": 5e-06, + "loss": 0.6824, + "mean_token_accuracy": 0.7746056318283081, + "num_tokens": 586557666.0, + "step": 22659 + }, + { + "epoch": 2.4884691412255657, + "grad_norm": 2.466064929962158, + "learning_rate": 5e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.7724974751472473, + "num_tokens": 586576938.0, + "step": 22660 + }, + { + "epoch": 2.488578958928179, + "grad_norm": 1.9815365076065063, + "learning_rate": 5e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.7767229676246643, + "num_tokens": 586603530.0, + "step": 22661 + }, + { + "epoch": 2.488688776630793, + "grad_norm": 1.8965524435043335, + "learning_rate": 5e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7548584938049316, + "num_tokens": 586632525.0, + "step": 22662 + }, + { + "epoch": 2.4887985943334066, + "grad_norm": 1.9288785457611084, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7524862885475159, + "num_tokens": 586663850.0, + "step": 22663 + }, + { + "epoch": 2.4889084120360203, + "grad_norm": 2.0754354000091553, + "learning_rate": 5e-06, + "loss": 0.7169, + "mean_token_accuracy": 0.7696057558059692, + "num_tokens": 586689805.0, + "step": 22664 + }, + { + "epoch": 2.489018229738634, + "grad_norm": 2.3852109909057617, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7618407011032104, + "num_tokens": 586709867.0, + "step": 22665 + }, + { + "epoch": 2.4891280474412474, + "grad_norm": 1.884375810623169, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7410975098609924, + "num_tokens": 586740791.0, + "step": 22666 + }, + { + "epoch": 2.489237865143861, + "grad_norm": 1.919128656387329, + "learning_rate": 5e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7610855102539062, + "num_tokens": 586770678.0, + "step": 22667 + }, + { + "epoch": 2.489347682846475, + "grad_norm": 1.9883582592010498, + "learning_rate": 5e-06, + "loss": 0.6675, + "mean_token_accuracy": 0.7808513045310974, + "num_tokens": 586797997.0, + "step": 22668 + }, + { + "epoch": 2.4894575005490887, + "grad_norm": 2.072547674179077, + "learning_rate": 5e-06, + "loss": 0.6951, + "mean_token_accuracy": 0.7696715593338013, + "num_tokens": 586821776.0, + "step": 22669 + }, + { + "epoch": 2.4895673182517024, + "grad_norm": 2.029334306716919, + "learning_rate": 5e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7659851908683777, + "num_tokens": 586848310.0, + "step": 22670 + }, + { + "epoch": 2.4896771359543157, + "grad_norm": 2.005903482437134, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7628141045570374, + "num_tokens": 586873428.0, + "step": 22671 + }, + { + "epoch": 2.4897869536569295, + "grad_norm": 1.9517741203308105, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7490024566650391, + "num_tokens": 586904180.0, + "step": 22672 + }, + { + "epoch": 2.4898967713595432, + "grad_norm": 1.977487325668335, + "learning_rate": 5e-06, + "loss": 0.6158, + "mean_token_accuracy": 0.792226254940033, + "num_tokens": 586931113.0, + "step": 22673 + }, + { + "epoch": 2.490006589062157, + "grad_norm": 2.1448326110839844, + "learning_rate": 5e-06, + "loss": 0.658, + "mean_token_accuracy": 0.775196373462677, + "num_tokens": 586953827.0, + "step": 22674 + }, + { + "epoch": 2.4901164067647703, + "grad_norm": 1.9119131565093994, + "learning_rate": 5e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.7738551497459412, + "num_tokens": 586983198.0, + "step": 22675 + }, + { + "epoch": 2.490226224467384, + "grad_norm": 2.2124946117401123, + "learning_rate": 5e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7762393355369568, + "num_tokens": 587006730.0, + "step": 22676 + }, + { + "epoch": 2.490336042169998, + "grad_norm": 2.089625835418701, + "learning_rate": 5e-06, + "loss": 0.6656, + "mean_token_accuracy": 0.773663341999054, + "num_tokens": 587031525.0, + "step": 22677 + }, + { + "epoch": 2.4904458598726116, + "grad_norm": 1.8923590183258057, + "learning_rate": 5e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7681221961975098, + "num_tokens": 587060892.0, + "step": 22678 + }, + { + "epoch": 2.490555677575225, + "grad_norm": 1.9494636058807373, + "learning_rate": 5e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.767892599105835, + "num_tokens": 587091069.0, + "step": 22679 + }, + { + "epoch": 2.4906654952778386, + "grad_norm": 2.251781463623047, + "learning_rate": 5e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7698999643325806, + "num_tokens": 587113910.0, + "step": 22680 + }, + { + "epoch": 2.4907753129804524, + "grad_norm": 2.2852985858917236, + "learning_rate": 5e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7556229829788208, + "num_tokens": 587138050.0, + "step": 22681 + }, + { + "epoch": 2.490885130683066, + "grad_norm": 2.00091552734375, + "learning_rate": 5e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7657865881919861, + "num_tokens": 587164359.0, + "step": 22682 + }, + { + "epoch": 2.49099494838568, + "grad_norm": 2.0833334922790527, + "learning_rate": 5e-06, + "loss": 0.6054, + "mean_token_accuracy": 0.7955105304718018, + "num_tokens": 587188125.0, + "step": 22683 + }, + { + "epoch": 2.4911047660882932, + "grad_norm": 1.9123201370239258, + "learning_rate": 5e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7713184356689453, + "num_tokens": 587214882.0, + "step": 22684 + }, + { + "epoch": 2.491214583790907, + "grad_norm": 2.2890796661376953, + "learning_rate": 5e-06, + "loss": 0.6633, + "mean_token_accuracy": 0.7826194763183594, + "num_tokens": 587237187.0, + "step": 22685 + }, + { + "epoch": 2.4913244014935207, + "grad_norm": 2.0552127361297607, + "learning_rate": 5e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.757727861404419, + "num_tokens": 587263697.0, + "step": 22686 + }, + { + "epoch": 2.4914342191961345, + "grad_norm": 1.9714387655258179, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.7784804105758667, + "num_tokens": 587290447.0, + "step": 22687 + }, + { + "epoch": 2.4915440368987483, + "grad_norm": 2.15126371383667, + "learning_rate": 5e-06, + "loss": 0.5866, + "mean_token_accuracy": 0.7999836206436157, + "num_tokens": 587311308.0, + "step": 22688 + }, + { + "epoch": 2.4916538546013616, + "grad_norm": 1.9707810878753662, + "learning_rate": 5e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7526495456695557, + "num_tokens": 587341936.0, + "step": 22689 + }, + { + "epoch": 2.4917636723039753, + "grad_norm": 2.1556365489959717, + "learning_rate": 5e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7735244631767273, + "num_tokens": 587364771.0, + "step": 22690 + }, + { + "epoch": 2.491873490006589, + "grad_norm": 2.152618408203125, + "learning_rate": 5e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7645038962364197, + "num_tokens": 587391234.0, + "step": 22691 + }, + { + "epoch": 2.491983307709203, + "grad_norm": 2.305501699447632, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7704154253005981, + "num_tokens": 587412696.0, + "step": 22692 + }, + { + "epoch": 2.4920931254118166, + "grad_norm": 2.0585179328918457, + "learning_rate": 5e-06, + "loss": 0.8121, + "mean_token_accuracy": 0.7386228442192078, + "num_tokens": 587441169.0, + "step": 22693 + }, + { + "epoch": 2.49220294311443, + "grad_norm": 2.09055495262146, + "learning_rate": 5e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.748842716217041, + "num_tokens": 587469209.0, + "step": 22694 + }, + { + "epoch": 2.4923127608170437, + "grad_norm": 2.034041166305542, + "learning_rate": 5e-06, + "loss": 0.662, + "mean_token_accuracy": 0.7822138071060181, + "num_tokens": 587493454.0, + "step": 22695 + }, + { + "epoch": 2.4924225785196574, + "grad_norm": 2.239534616470337, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7606835961341858, + "num_tokens": 587517260.0, + "step": 22696 + }, + { + "epoch": 2.492532396222271, + "grad_norm": 2.1826419830322266, + "learning_rate": 5e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7509874105453491, + "num_tokens": 587541067.0, + "step": 22697 + }, + { + "epoch": 2.4926422139248845, + "grad_norm": 2.2775652408599854, + "learning_rate": 5e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7755717039108276, + "num_tokens": 587563949.0, + "step": 22698 + }, + { + "epoch": 2.4927520316274983, + "grad_norm": 2.2285778522491455, + "learning_rate": 5e-06, + "loss": 0.6148, + "mean_token_accuracy": 0.797240138053894, + "num_tokens": 587584901.0, + "step": 22699 + }, + { + "epoch": 2.492861849330112, + "grad_norm": 1.9812642335891724, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7694966793060303, + "num_tokens": 587614381.0, + "step": 22700 + }, + { + "epoch": 2.4929716670327258, + "grad_norm": 2.2857320308685303, + "learning_rate": 5e-06, + "loss": 0.6498, + "mean_token_accuracy": 0.7861538529396057, + "num_tokens": 587636992.0, + "step": 22701 + }, + { + "epoch": 2.4930814847353395, + "grad_norm": 2.3108649253845215, + "learning_rate": 5e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.7798134088516235, + "num_tokens": 587657763.0, + "step": 22702 + }, + { + "epoch": 2.493191302437953, + "grad_norm": 2.0795791149139404, + "learning_rate": 5e-06, + "loss": 0.6828, + "mean_token_accuracy": 0.7756426334381104, + "num_tokens": 587684485.0, + "step": 22703 + }, + { + "epoch": 2.4933011201405666, + "grad_norm": 2.0185036659240723, + "learning_rate": 5e-06, + "loss": 0.6843, + "mean_token_accuracy": 0.7756392955780029, + "num_tokens": 587712642.0, + "step": 22704 + }, + { + "epoch": 2.4934109378431804, + "grad_norm": 2.158076047897339, + "learning_rate": 5e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.759239912033081, + "num_tokens": 587738809.0, + "step": 22705 + }, + { + "epoch": 2.493520755545794, + "grad_norm": 2.536525249481201, + "learning_rate": 5e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7757220268249512, + "num_tokens": 587757631.0, + "step": 22706 + }, + { + "epoch": 2.4936305732484074, + "grad_norm": 2.199854612350464, + "learning_rate": 5e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.776698887348175, + "num_tokens": 587781558.0, + "step": 22707 + }, + { + "epoch": 2.493740390951021, + "grad_norm": 2.348285436630249, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7730086445808411, + "num_tokens": 587804164.0, + "step": 22708 + }, + { + "epoch": 2.493850208653635, + "grad_norm": 2.0770692825317383, + "learning_rate": 5e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7611675262451172, + "num_tokens": 587831105.0, + "step": 22709 + }, + { + "epoch": 2.4939600263562487, + "grad_norm": 2.337256908416748, + "learning_rate": 5e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.765577495098114, + "num_tokens": 587853943.0, + "step": 22710 + }, + { + "epoch": 2.4940698440588625, + "grad_norm": 2.1836440563201904, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7712002992630005, + "num_tokens": 587875344.0, + "step": 22711 + }, + { + "epoch": 2.4941796617614758, + "grad_norm": 1.909488320350647, + "learning_rate": 5e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7637393474578857, + "num_tokens": 587904756.0, + "step": 22712 + }, + { + "epoch": 2.4942894794640895, + "grad_norm": 2.3880910873413086, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.7744388580322266, + "num_tokens": 587924829.0, + "step": 22713 + }, + { + "epoch": 2.4943992971667033, + "grad_norm": 2.105205774307251, + "learning_rate": 5e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7675410509109497, + "num_tokens": 587949235.0, + "step": 22714 + }, + { + "epoch": 2.494509114869317, + "grad_norm": 1.6861940622329712, + "learning_rate": 5e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7581369280815125, + "num_tokens": 587984369.0, + "step": 22715 + }, + { + "epoch": 2.494618932571931, + "grad_norm": 1.9370217323303223, + "learning_rate": 5e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7596120238304138, + "num_tokens": 588012318.0, + "step": 22716 + }, + { + "epoch": 2.494728750274544, + "grad_norm": 1.8359524011611938, + "learning_rate": 5e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7481182813644409, + "num_tokens": 588044313.0, + "step": 22717 + }, + { + "epoch": 2.494838567977158, + "grad_norm": 2.1583878993988037, + "learning_rate": 5e-06, + "loss": 0.6717, + "mean_token_accuracy": 0.7709572315216064, + "num_tokens": 588069594.0, + "step": 22718 + }, + { + "epoch": 2.4949483856797716, + "grad_norm": 2.2077646255493164, + "learning_rate": 5e-06, + "loss": 0.7763, + "mean_token_accuracy": 0.7527286410331726, + "num_tokens": 588095140.0, + "step": 22719 + }, + { + "epoch": 2.4950582033823854, + "grad_norm": 2.1129260063171387, + "learning_rate": 5e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7629848718643188, + "num_tokens": 588123386.0, + "step": 22720 + }, + { + "epoch": 2.495168021084999, + "grad_norm": 2.3360342979431152, + "learning_rate": 5e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7697657346725464, + "num_tokens": 588145333.0, + "step": 22721 + }, + { + "epoch": 2.4952778387876124, + "grad_norm": 2.0202858448028564, + "learning_rate": 5e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.7708189487457275, + "num_tokens": 588173144.0, + "step": 22722 + }, + { + "epoch": 2.495387656490226, + "grad_norm": 2.2490711212158203, + "learning_rate": 5e-06, + "loss": 0.7203, + "mean_token_accuracy": 0.7658028602600098, + "num_tokens": 588194729.0, + "step": 22723 + }, + { + "epoch": 2.49549747419284, + "grad_norm": 2.3159637451171875, + "learning_rate": 5e-06, + "loss": 0.5992, + "mean_token_accuracy": 0.7948041558265686, + "num_tokens": 588212634.0, + "step": 22724 + }, + { + "epoch": 2.4956072918954537, + "grad_norm": 2.2218494415283203, + "learning_rate": 5e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.7541007399559021, + "num_tokens": 588237268.0, + "step": 22725 + }, + { + "epoch": 2.495717109598067, + "grad_norm": 2.268730401992798, + "learning_rate": 5e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.7832207679748535, + "num_tokens": 588258584.0, + "step": 22726 + }, + { + "epoch": 2.495826927300681, + "grad_norm": 1.8339396715164185, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7588493227958679, + "num_tokens": 588291105.0, + "step": 22727 + }, + { + "epoch": 2.4959367450032945, + "grad_norm": 2.1102287769317627, + "learning_rate": 5e-06, + "loss": 0.6511, + "mean_token_accuracy": 0.7826153039932251, + "num_tokens": 588314257.0, + "step": 22728 + }, + { + "epoch": 2.4960465627059083, + "grad_norm": 2.1476387977600098, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7673979997634888, + "num_tokens": 588337749.0, + "step": 22729 + }, + { + "epoch": 2.4961563804085216, + "grad_norm": 2.0294852256774902, + "learning_rate": 5e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7670037746429443, + "num_tokens": 588366308.0, + "step": 22730 + }, + { + "epoch": 2.4962661981111354, + "grad_norm": 1.9663041830062866, + "learning_rate": 5e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7642048597335815, + "num_tokens": 588394831.0, + "step": 22731 + }, + { + "epoch": 2.496376015813749, + "grad_norm": 2.3235597610473633, + "learning_rate": 5e-06, + "loss": 0.6149, + "mean_token_accuracy": 0.7934465408325195, + "num_tokens": 588414629.0, + "step": 22732 + }, + { + "epoch": 2.496485833516363, + "grad_norm": 1.8444253206253052, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7567194700241089, + "num_tokens": 588446205.0, + "step": 22733 + }, + { + "epoch": 2.4965956512189766, + "grad_norm": 2.275275468826294, + "learning_rate": 5e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7487688064575195, + "num_tokens": 588470818.0, + "step": 22734 + }, + { + "epoch": 2.49670546892159, + "grad_norm": 2.2589962482452393, + "learning_rate": 5e-06, + "loss": 0.6574, + "mean_token_accuracy": 0.7814963459968567, + "num_tokens": 588494038.0, + "step": 22735 + }, + { + "epoch": 2.4968152866242037, + "grad_norm": 2.2877354621887207, + "learning_rate": 5e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.774631917476654, + "num_tokens": 588515628.0, + "step": 22736 + }, + { + "epoch": 2.4969251043268175, + "grad_norm": 2.048086166381836, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7657201886177063, + "num_tokens": 588544068.0, + "step": 22737 + }, + { + "epoch": 2.4970349220294312, + "grad_norm": 1.9939320087432861, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7788198590278625, + "num_tokens": 588571427.0, + "step": 22738 + }, + { + "epoch": 2.497144739732045, + "grad_norm": 2.222071886062622, + "learning_rate": 5e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.7777681946754456, + "num_tokens": 588593974.0, + "step": 22739 + }, + { + "epoch": 2.4972545574346583, + "grad_norm": 2.0527548789978027, + "learning_rate": 5e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7579138875007629, + "num_tokens": 588619804.0, + "step": 22740 + }, + { + "epoch": 2.497364375137272, + "grad_norm": 1.9805899858474731, + "learning_rate": 5e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.753385603427887, + "num_tokens": 588648095.0, + "step": 22741 + }, + { + "epoch": 2.497474192839886, + "grad_norm": 2.077986240386963, + "learning_rate": 5e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7768455743789673, + "num_tokens": 588676727.0, + "step": 22742 + }, + { + "epoch": 2.4975840105424996, + "grad_norm": 2.2406013011932373, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7770972847938538, + "num_tokens": 588701136.0, + "step": 22743 + }, + { + "epoch": 2.4976938282451133, + "grad_norm": 2.0403029918670654, + "learning_rate": 5e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7588032484054565, + "num_tokens": 588728129.0, + "step": 22744 + }, + { + "epoch": 2.4978036459477266, + "grad_norm": 2.379441976547241, + "learning_rate": 5e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.7808663845062256, + "num_tokens": 588748344.0, + "step": 22745 + }, + { + "epoch": 2.4979134636503404, + "grad_norm": 2.443711280822754, + "learning_rate": 5e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7748602032661438, + "num_tokens": 588768812.0, + "step": 22746 + }, + { + "epoch": 2.498023281352954, + "grad_norm": 2.115720272064209, + "learning_rate": 5e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7793724536895752, + "num_tokens": 588792953.0, + "step": 22747 + }, + { + "epoch": 2.498133099055568, + "grad_norm": 2.223072052001953, + "learning_rate": 5e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7716197967529297, + "num_tokens": 588814321.0, + "step": 22748 + }, + { + "epoch": 2.4982429167581817, + "grad_norm": 2.2417802810668945, + "learning_rate": 5e-06, + "loss": 0.6195, + "mean_token_accuracy": 0.7888707518577576, + "num_tokens": 588837475.0, + "step": 22749 + }, + { + "epoch": 2.498352734460795, + "grad_norm": 2.251556396484375, + "learning_rate": 5e-06, + "loss": 0.652, + "mean_token_accuracy": 0.7827812433242798, + "num_tokens": 588858988.0, + "step": 22750 + }, + { + "epoch": 2.4984625521634087, + "grad_norm": 2.3073644638061523, + "learning_rate": 5e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7509432435035706, + "num_tokens": 588886337.0, + "step": 22751 + }, + { + "epoch": 2.4985723698660225, + "grad_norm": 2.110255479812622, + "learning_rate": 5e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.7879127264022827, + "num_tokens": 588912408.0, + "step": 22752 + }, + { + "epoch": 2.4986821875686362, + "grad_norm": 2.311552047729492, + "learning_rate": 5e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7604858875274658, + "num_tokens": 588933715.0, + "step": 22753 + }, + { + "epoch": 2.4987920052712496, + "grad_norm": 2.1433029174804688, + "learning_rate": 5e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7612678408622742, + "num_tokens": 588957934.0, + "step": 22754 + }, + { + "epoch": 2.4989018229738633, + "grad_norm": 2.123735189437866, + "learning_rate": 5e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7373210787773132, + "num_tokens": 588983747.0, + "step": 22755 + }, + { + "epoch": 2.499011640676477, + "grad_norm": 2.2860443592071533, + "learning_rate": 5e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7585642337799072, + "num_tokens": 589009208.0, + "step": 22756 + }, + { + "epoch": 2.499121458379091, + "grad_norm": 2.059368371963501, + "learning_rate": 5e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7619244456291199, + "num_tokens": 589035586.0, + "step": 22757 + }, + { + "epoch": 2.499231276081704, + "grad_norm": 2.045746088027954, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7655839323997498, + "num_tokens": 589059809.0, + "step": 22758 + }, + { + "epoch": 2.499341093784318, + "grad_norm": 2.136063814163208, + "learning_rate": 5e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7560405731201172, + "num_tokens": 589085878.0, + "step": 22759 + }, + { + "epoch": 2.4994509114869317, + "grad_norm": 1.8598183393478394, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.733165442943573, + "num_tokens": 589119115.0, + "step": 22760 + }, + { + "epoch": 2.4995607291895454, + "grad_norm": 2.1627275943756104, + "learning_rate": 5e-06, + "loss": 0.6782, + "mean_token_accuracy": 0.7691174745559692, + "num_tokens": 589144285.0, + "step": 22761 + }, + { + "epoch": 2.499670546892159, + "grad_norm": 2.208414077758789, + "learning_rate": 5e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7552371025085449, + "num_tokens": 589169146.0, + "step": 22762 + }, + { + "epoch": 2.4997803645947725, + "grad_norm": 1.9355729818344116, + "learning_rate": 5e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7293616533279419, + "num_tokens": 589198326.0, + "step": 22763 + }, + { + "epoch": 2.4998901822973862, + "grad_norm": 1.7923063039779663, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7631675601005554, + "num_tokens": 589230598.0, + "step": 22764 + }, + { + "epoch": 2.5, + "grad_norm": 2.1563150882720947, + "learning_rate": 5e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.7706398963928223, + "num_tokens": 589254300.0, + "step": 22765 + }, + { + "epoch": 2.5001098177026138, + "grad_norm": 2.2940211296081543, + "learning_rate": 5e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.7812231779098511, + "num_tokens": 589276205.0, + "step": 22766 + }, + { + "epoch": 2.5002196354052275, + "grad_norm": 2.0731441974639893, + "learning_rate": 5e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.7849838733673096, + "num_tokens": 589300404.0, + "step": 22767 + }, + { + "epoch": 2.500329453107841, + "grad_norm": 2.0606932640075684, + "learning_rate": 5e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7502307891845703, + "num_tokens": 589330052.0, + "step": 22768 + }, + { + "epoch": 2.5004392708104546, + "grad_norm": 2.1987226009368896, + "learning_rate": 5e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.7864983081817627, + "num_tokens": 589352485.0, + "step": 22769 + }, + { + "epoch": 2.5005490885130683, + "grad_norm": 2.0971086025238037, + "learning_rate": 5e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.7775874137878418, + "num_tokens": 589377190.0, + "step": 22770 + }, + { + "epoch": 2.500658906215682, + "grad_norm": 1.965491533279419, + "learning_rate": 5e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.7777689695358276, + "num_tokens": 589403284.0, + "step": 22771 + }, + { + "epoch": 2.500768723918296, + "grad_norm": 1.946491003036499, + "learning_rate": 5e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7698458433151245, + "num_tokens": 589431714.0, + "step": 22772 + }, + { + "epoch": 2.500878541620909, + "grad_norm": 1.920723557472229, + "learning_rate": 5e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.7735040187835693, + "num_tokens": 589461316.0, + "step": 22773 + }, + { + "epoch": 2.500988359323523, + "grad_norm": 2.046092987060547, + "learning_rate": 5e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7612574100494385, + "num_tokens": 589488562.0, + "step": 22774 + }, + { + "epoch": 2.5010981770261367, + "grad_norm": 1.8980727195739746, + "learning_rate": 5e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7705899477005005, + "num_tokens": 589518647.0, + "step": 22775 + }, + { + "epoch": 2.5012079947287504, + "grad_norm": 2.0898234844207764, + "learning_rate": 5e-06, + "loss": 0.5652, + "mean_token_accuracy": 0.8031246066093445, + "num_tokens": 589540619.0, + "step": 22776 + }, + { + "epoch": 2.501317812431364, + "grad_norm": 2.702451705932617, + "learning_rate": 5e-06, + "loss": 0.637, + "mean_token_accuracy": 0.7841598391532898, + "num_tokens": 589556433.0, + "step": 22777 + }, + { + "epoch": 2.5014276301339775, + "grad_norm": 2.4339373111724854, + "learning_rate": 5e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.7787123918533325, + "num_tokens": 589576146.0, + "step": 22778 + }, + { + "epoch": 2.5015374478365913, + "grad_norm": 2.2329463958740234, + "learning_rate": 5e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.7876036167144775, + "num_tokens": 589599158.0, + "step": 22779 + }, + { + "epoch": 2.501647265539205, + "grad_norm": 2.0132384300231934, + "learning_rate": 5e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.768709659576416, + "num_tokens": 589626432.0, + "step": 22780 + }, + { + "epoch": 2.5017570832418183, + "grad_norm": 1.9078187942504883, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7703772783279419, + "num_tokens": 589654714.0, + "step": 22781 + }, + { + "epoch": 2.501866900944432, + "grad_norm": 2.0674712657928467, + "learning_rate": 5e-06, + "loss": 0.6738, + "mean_token_accuracy": 0.7732288837432861, + "num_tokens": 589682032.0, + "step": 22782 + }, + { + "epoch": 2.501976718647046, + "grad_norm": 2.0202476978302, + "learning_rate": 5e-06, + "loss": 0.699, + "mean_token_accuracy": 0.7695472240447998, + "num_tokens": 589711179.0, + "step": 22783 + }, + { + "epoch": 2.5020865363496596, + "grad_norm": 2.0459394454956055, + "learning_rate": 5e-06, + "loss": 0.6682, + "mean_token_accuracy": 0.7802705764770508, + "num_tokens": 589737433.0, + "step": 22784 + }, + { + "epoch": 2.5021963540522734, + "grad_norm": 2.0490763187408447, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7666950821876526, + "num_tokens": 589764264.0, + "step": 22785 + }, + { + "epoch": 2.5023061717548867, + "grad_norm": 2.0082128047943115, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7702546119689941, + "num_tokens": 589791335.0, + "step": 22786 + }, + { + "epoch": 2.5024159894575004, + "grad_norm": 1.9447880983352661, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7522130608558655, + "num_tokens": 589819587.0, + "step": 22787 + }, + { + "epoch": 2.502525807160114, + "grad_norm": 2.03292179107666, + "learning_rate": 5e-06, + "loss": 0.6356, + "mean_token_accuracy": 0.782484769821167, + "num_tokens": 589843377.0, + "step": 22788 + }, + { + "epoch": 2.502635624862728, + "grad_norm": 2.087493419647217, + "learning_rate": 5e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7704774141311646, + "num_tokens": 589867664.0, + "step": 22789 + }, + { + "epoch": 2.5027454425653417, + "grad_norm": 1.956819772720337, + "learning_rate": 5e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7609392404556274, + "num_tokens": 589894762.0, + "step": 22790 + }, + { + "epoch": 2.502855260267955, + "grad_norm": 2.255842447280884, + "learning_rate": 5e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7712627649307251, + "num_tokens": 589917043.0, + "step": 22791 + }, + { + "epoch": 2.5029650779705688, + "grad_norm": 2.208832263946533, + "learning_rate": 5e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7412674427032471, + "num_tokens": 589941866.0, + "step": 22792 + }, + { + "epoch": 2.5030748956731825, + "grad_norm": 2.1271538734436035, + "learning_rate": 5e-06, + "loss": 0.6504, + "mean_token_accuracy": 0.7829216122627258, + "num_tokens": 589964618.0, + "step": 22793 + }, + { + "epoch": 2.5031847133757963, + "grad_norm": 2.096355438232422, + "learning_rate": 5e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.7835302948951721, + "num_tokens": 589987510.0, + "step": 22794 + }, + { + "epoch": 2.50329453107841, + "grad_norm": 1.9588463306427002, + "learning_rate": 5e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7581087350845337, + "num_tokens": 590016232.0, + "step": 22795 + }, + { + "epoch": 2.5034043487810234, + "grad_norm": 2.14778470993042, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7710270881652832, + "num_tokens": 590041646.0, + "step": 22796 + }, + { + "epoch": 2.503514166483637, + "grad_norm": 1.914570689201355, + "learning_rate": 5e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7649343609809875, + "num_tokens": 590070038.0, + "step": 22797 + }, + { + "epoch": 2.503623984186251, + "grad_norm": 2.1160426139831543, + "learning_rate": 5e-06, + "loss": 0.6776, + "mean_token_accuracy": 0.7761232852935791, + "num_tokens": 590096020.0, + "step": 22798 + }, + { + "epoch": 2.5037338018888646, + "grad_norm": 2.1768507957458496, + "learning_rate": 5e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7572060823440552, + "num_tokens": 590120754.0, + "step": 22799 + }, + { + "epoch": 2.5038436195914784, + "grad_norm": 2.116568088531494, + "learning_rate": 5e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.739647388458252, + "num_tokens": 590148034.0, + "step": 22800 + }, + { + "epoch": 2.5039534372940917, + "grad_norm": 1.9999831914901733, + "learning_rate": 5e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7720726132392883, + "num_tokens": 590176396.0, + "step": 22801 + }, + { + "epoch": 2.5040632549967055, + "grad_norm": 2.01247501373291, + "learning_rate": 5e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7629340887069702, + "num_tokens": 590203550.0, + "step": 22802 + }, + { + "epoch": 2.504173072699319, + "grad_norm": 1.9681322574615479, + "learning_rate": 5e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7467511892318726, + "num_tokens": 590233236.0, + "step": 22803 + }, + { + "epoch": 2.5042828904019325, + "grad_norm": 2.04300594329834, + "learning_rate": 5e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7683791518211365, + "num_tokens": 590257996.0, + "step": 22804 + }, + { + "epoch": 2.5043927081045467, + "grad_norm": 2.0918004512786865, + "learning_rate": 5e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.747924268245697, + "num_tokens": 590285519.0, + "step": 22805 + }, + { + "epoch": 2.50450252580716, + "grad_norm": 1.8858680725097656, + "learning_rate": 5e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7665242552757263, + "num_tokens": 590314635.0, + "step": 22806 + }, + { + "epoch": 2.504612343509774, + "grad_norm": 2.0705759525299072, + "learning_rate": 5e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.764131486415863, + "num_tokens": 590341782.0, + "step": 22807 + }, + { + "epoch": 2.5047221612123876, + "grad_norm": 2.285770893096924, + "learning_rate": 5e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7653548717498779, + "num_tokens": 590364133.0, + "step": 22808 + }, + { + "epoch": 2.504831978915001, + "grad_norm": 2.041403293609619, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7744377255439758, + "num_tokens": 590392081.0, + "step": 22809 + }, + { + "epoch": 2.5049417966176146, + "grad_norm": 2.3065409660339355, + "learning_rate": 5e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7606368064880371, + "num_tokens": 590416619.0, + "step": 22810 + }, + { + "epoch": 2.5050516143202284, + "grad_norm": 2.4720864295959473, + "learning_rate": 5e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.7788360118865967, + "num_tokens": 590435813.0, + "step": 22811 + }, + { + "epoch": 2.505161432022842, + "grad_norm": 2.30720853805542, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7672246694564819, + "num_tokens": 590457946.0, + "step": 22812 + }, + { + "epoch": 2.505271249725456, + "grad_norm": 2.2790768146514893, + "learning_rate": 5e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.7464383840560913, + "num_tokens": 590487092.0, + "step": 22813 + }, + { + "epoch": 2.505381067428069, + "grad_norm": 2.056283473968506, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7432091236114502, + "num_tokens": 590516611.0, + "step": 22814 + }, + { + "epoch": 2.505490885130683, + "grad_norm": 2.0686864852905273, + "learning_rate": 5e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7602423429489136, + "num_tokens": 590541974.0, + "step": 22815 + }, + { + "epoch": 2.5056007028332967, + "grad_norm": 2.084920883178711, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7666304707527161, + "num_tokens": 590566226.0, + "step": 22816 + }, + { + "epoch": 2.5057105205359105, + "grad_norm": 2.3908684253692627, + "learning_rate": 5e-06, + "loss": 0.6217, + "mean_token_accuracy": 0.790277361869812, + "num_tokens": 590584141.0, + "step": 22817 + }, + { + "epoch": 2.5058203382385242, + "grad_norm": 2.0838611125946045, + "learning_rate": 5e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.770804762840271, + "num_tokens": 590608725.0, + "step": 22818 + }, + { + "epoch": 2.5059301559411375, + "grad_norm": 2.1058335304260254, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7628800868988037, + "num_tokens": 590636189.0, + "step": 22819 + }, + { + "epoch": 2.5060399736437513, + "grad_norm": 2.0849969387054443, + "learning_rate": 5e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.77828049659729, + "num_tokens": 590663333.0, + "step": 22820 + }, + { + "epoch": 2.506149791346365, + "grad_norm": 2.3304824829101562, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7535960674285889, + "num_tokens": 590687716.0, + "step": 22821 + }, + { + "epoch": 2.506259609048979, + "grad_norm": 2.1701202392578125, + "learning_rate": 5e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7583945393562317, + "num_tokens": 590714000.0, + "step": 22822 + }, + { + "epoch": 2.5063694267515926, + "grad_norm": 2.267627477645874, + "learning_rate": 5e-06, + "loss": 0.6417, + "mean_token_accuracy": 0.7868285179138184, + "num_tokens": 590737304.0, + "step": 22823 + }, + { + "epoch": 2.506479244454206, + "grad_norm": 1.9524645805358887, + "learning_rate": 5e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7751257419586182, + "num_tokens": 590764013.0, + "step": 22824 + }, + { + "epoch": 2.5065890621568196, + "grad_norm": 1.8438425064086914, + "learning_rate": 5e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7687052488327026, + "num_tokens": 590793668.0, + "step": 22825 + }, + { + "epoch": 2.5066988798594334, + "grad_norm": 2.085031032562256, + "learning_rate": 5e-06, + "loss": 0.6105, + "mean_token_accuracy": 0.795661211013794, + "num_tokens": 590815402.0, + "step": 22826 + }, + { + "epoch": 2.506808697562047, + "grad_norm": 2.0891265869140625, + "learning_rate": 5e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.7777787446975708, + "num_tokens": 590838936.0, + "step": 22827 + }, + { + "epoch": 2.506918515264661, + "grad_norm": 2.220278024673462, + "learning_rate": 5e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7684307098388672, + "num_tokens": 590863420.0, + "step": 22828 + }, + { + "epoch": 2.5070283329672742, + "grad_norm": 1.6706421375274658, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7678228616714478, + "num_tokens": 590902045.0, + "step": 22829 + }, + { + "epoch": 2.507138150669888, + "grad_norm": 2.0259385108947754, + "learning_rate": 5e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7755367755889893, + "num_tokens": 590929058.0, + "step": 22830 + }, + { + "epoch": 2.5072479683725017, + "grad_norm": 1.8482036590576172, + "learning_rate": 5e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7477736473083496, + "num_tokens": 590960185.0, + "step": 22831 + }, + { + "epoch": 2.507357786075115, + "grad_norm": 2.6471261978149414, + "learning_rate": 5e-06, + "loss": 0.6191, + "mean_token_accuracy": 0.7931065559387207, + "num_tokens": 590977289.0, + "step": 22832 + }, + { + "epoch": 2.507467603777729, + "grad_norm": 2.239173173904419, + "learning_rate": 5e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.772567093372345, + "num_tokens": 591000900.0, + "step": 22833 + }, + { + "epoch": 2.5075774214803426, + "grad_norm": 2.1556196212768555, + "learning_rate": 5e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7690997123718262, + "num_tokens": 591025254.0, + "step": 22834 + }, + { + "epoch": 2.5076872391829563, + "grad_norm": 2.023247241973877, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7456027269363403, + "num_tokens": 591053238.0, + "step": 22835 + }, + { + "epoch": 2.50779705688557, + "grad_norm": 2.0304229259490967, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7687559127807617, + "num_tokens": 591081214.0, + "step": 22836 + }, + { + "epoch": 2.5079068745881834, + "grad_norm": 2.014253616333008, + "learning_rate": 5e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7491319179534912, + "num_tokens": 591108611.0, + "step": 22837 + }, + { + "epoch": 2.508016692290797, + "grad_norm": 2.1906800270080566, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.767683744430542, + "num_tokens": 591134548.0, + "step": 22838 + }, + { + "epoch": 2.508126509993411, + "grad_norm": 1.8945655822753906, + "learning_rate": 5e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7552783489227295, + "num_tokens": 591164310.0, + "step": 22839 + }, + { + "epoch": 2.5082363276960247, + "grad_norm": 2.014906406402588, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7494441866874695, + "num_tokens": 591196262.0, + "step": 22840 + }, + { + "epoch": 2.5083461453986384, + "grad_norm": 2.521031141281128, + "learning_rate": 5e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.759013295173645, + "num_tokens": 591216902.0, + "step": 22841 + }, + { + "epoch": 2.5084559631012517, + "grad_norm": 2.3410098552703857, + "learning_rate": 5e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7485569715499878, + "num_tokens": 591242070.0, + "step": 22842 + }, + { + "epoch": 2.5085657808038655, + "grad_norm": 2.3281612396240234, + "learning_rate": 5e-06, + "loss": 0.6834, + "mean_token_accuracy": 0.7728843688964844, + "num_tokens": 591265227.0, + "step": 22843 + }, + { + "epoch": 2.5086755985064793, + "grad_norm": 2.272416353225708, + "learning_rate": 5e-06, + "loss": 0.6516, + "mean_token_accuracy": 0.7761482000350952, + "num_tokens": 591286434.0, + "step": 22844 + }, + { + "epoch": 2.508785416209093, + "grad_norm": 1.824672818183899, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.744749903678894, + "num_tokens": 591318120.0, + "step": 22845 + }, + { + "epoch": 2.5088952339117068, + "grad_norm": 2.2745728492736816, + "learning_rate": 5e-06, + "loss": 0.6671, + "mean_token_accuracy": 0.7834806442260742, + "num_tokens": 591339720.0, + "step": 22846 + }, + { + "epoch": 2.50900505161432, + "grad_norm": 1.8546525239944458, + "learning_rate": 5e-06, + "loss": 0.6807, + "mean_token_accuracy": 0.779274582862854, + "num_tokens": 591371153.0, + "step": 22847 + }, + { + "epoch": 2.509114869316934, + "grad_norm": 2.4529225826263428, + "learning_rate": 5e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.7861521244049072, + "num_tokens": 591390030.0, + "step": 22848 + }, + { + "epoch": 2.5092246870195476, + "grad_norm": 2.021697521209717, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7715508937835693, + "num_tokens": 591419681.0, + "step": 22849 + }, + { + "epoch": 2.5093345047221614, + "grad_norm": 2.151488780975342, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7607919573783875, + "num_tokens": 591444369.0, + "step": 22850 + }, + { + "epoch": 2.509444322424775, + "grad_norm": 2.1789391040802, + "learning_rate": 5e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7445874214172363, + "num_tokens": 591470216.0, + "step": 22851 + }, + { + "epoch": 2.5095541401273884, + "grad_norm": 2.0602474212646484, + "learning_rate": 5e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.7798590064048767, + "num_tokens": 591494676.0, + "step": 22852 + }, + { + "epoch": 2.509663957830002, + "grad_norm": 2.4100215435028076, + "learning_rate": 5e-06, + "loss": 0.674, + "mean_token_accuracy": 0.7738658785820007, + "num_tokens": 591514079.0, + "step": 22853 + }, + { + "epoch": 2.509773775532616, + "grad_norm": 2.3863532543182373, + "learning_rate": 5e-06, + "loss": 0.7653, + "mean_token_accuracy": 0.7463298439979553, + "num_tokens": 591535812.0, + "step": 22854 + }, + { + "epoch": 2.5098835932352292, + "grad_norm": 2.3165345191955566, + "learning_rate": 5e-06, + "loss": 0.676, + "mean_token_accuracy": 0.775626003742218, + "num_tokens": 591559553.0, + "step": 22855 + }, + { + "epoch": 2.5099934109378434, + "grad_norm": 2.0700199604034424, + "learning_rate": 5e-06, + "loss": 0.6847, + "mean_token_accuracy": 0.7738507986068726, + "num_tokens": 591584624.0, + "step": 22856 + }, + { + "epoch": 2.5101032286404568, + "grad_norm": 2.1480584144592285, + "learning_rate": 5e-06, + "loss": 0.641, + "mean_token_accuracy": 0.7862219214439392, + "num_tokens": 591606524.0, + "step": 22857 + }, + { + "epoch": 2.5102130463430705, + "grad_norm": 1.9624884128570557, + "learning_rate": 5e-06, + "loss": 0.6698, + "mean_token_accuracy": 0.775467038154602, + "num_tokens": 591634189.0, + "step": 22858 + }, + { + "epoch": 2.5103228640456843, + "grad_norm": 2.0926096439361572, + "learning_rate": 5e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7611019015312195, + "num_tokens": 591658551.0, + "step": 22859 + }, + { + "epoch": 2.5104326817482976, + "grad_norm": 2.2714571952819824, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7737510204315186, + "num_tokens": 591679785.0, + "step": 22860 + }, + { + "epoch": 2.5105424994509113, + "grad_norm": 2.229767322540283, + "learning_rate": 5e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7581818699836731, + "num_tokens": 591703866.0, + "step": 22861 + }, + { + "epoch": 2.510652317153525, + "grad_norm": 2.135941743850708, + "learning_rate": 5e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7634860277175903, + "num_tokens": 591729860.0, + "step": 22862 + }, + { + "epoch": 2.510762134856139, + "grad_norm": 2.0651402473449707, + "learning_rate": 5e-06, + "loss": 0.6493, + "mean_token_accuracy": 0.7823570966720581, + "num_tokens": 591753659.0, + "step": 22863 + }, + { + "epoch": 2.5108719525587526, + "grad_norm": 1.7981032133102417, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7697295546531677, + "num_tokens": 591788509.0, + "step": 22864 + }, + { + "epoch": 2.510981770261366, + "grad_norm": 1.9626542329788208, + "learning_rate": 5e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.7586094737052917, + "num_tokens": 591817438.0, + "step": 22865 + }, + { + "epoch": 2.5110915879639797, + "grad_norm": 2.0179269313812256, + "learning_rate": 5e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7597362995147705, + "num_tokens": 591844873.0, + "step": 22866 + }, + { + "epoch": 2.5112014056665934, + "grad_norm": 2.0257296562194824, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7557681798934937, + "num_tokens": 591870421.0, + "step": 22867 + }, + { + "epoch": 2.511311223369207, + "grad_norm": 1.8914167881011963, + "learning_rate": 5e-06, + "loss": 0.6076, + "mean_token_accuracy": 0.7912771701812744, + "num_tokens": 591898209.0, + "step": 22868 + }, + { + "epoch": 2.511421041071821, + "grad_norm": 2.038296937942505, + "learning_rate": 5e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7588872313499451, + "num_tokens": 591926471.0, + "step": 22869 + }, + { + "epoch": 2.5115308587744343, + "grad_norm": 2.275496482849121, + "learning_rate": 5e-06, + "loss": 0.677, + "mean_token_accuracy": 0.7820828557014465, + "num_tokens": 591947839.0, + "step": 22870 + }, + { + "epoch": 2.511640676477048, + "grad_norm": 2.002962112426758, + "learning_rate": 5e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7546828389167786, + "num_tokens": 591975773.0, + "step": 22871 + }, + { + "epoch": 2.511750494179662, + "grad_norm": 2.2624568939208984, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7560639381408691, + "num_tokens": 592000336.0, + "step": 22872 + }, + { + "epoch": 2.5118603118822755, + "grad_norm": 2.28977632522583, + "learning_rate": 5e-06, + "loss": 0.5983, + "mean_token_accuracy": 0.7956048846244812, + "num_tokens": 592020692.0, + "step": 22873 + }, + { + "epoch": 2.5119701295848893, + "grad_norm": 1.9215455055236816, + "learning_rate": 5e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7643052935600281, + "num_tokens": 592052836.0, + "step": 22874 + }, + { + "epoch": 2.5120799472875026, + "grad_norm": 2.101055383682251, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7650708556175232, + "num_tokens": 592077429.0, + "step": 22875 + }, + { + "epoch": 2.5121897649901164, + "grad_norm": 1.9406929016113281, + "learning_rate": 5e-06, + "loss": 0.6299, + "mean_token_accuracy": 0.791914165019989, + "num_tokens": 592105217.0, + "step": 22876 + }, + { + "epoch": 2.51229958269273, + "grad_norm": 1.7127139568328857, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7589032649993896, + "num_tokens": 592140727.0, + "step": 22877 + }, + { + "epoch": 2.512409400395344, + "grad_norm": 1.988078236579895, + "learning_rate": 5e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7285898923873901, + "num_tokens": 592169707.0, + "step": 22878 + }, + { + "epoch": 2.5125192180979576, + "grad_norm": 2.381391763687134, + "learning_rate": 5e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.780158519744873, + "num_tokens": 592190766.0, + "step": 22879 + }, + { + "epoch": 2.512629035800571, + "grad_norm": 1.8761389255523682, + "learning_rate": 5e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.740630030632019, + "num_tokens": 592223181.0, + "step": 22880 + }, + { + "epoch": 2.5127388535031847, + "grad_norm": 1.9791113138198853, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7684085369110107, + "num_tokens": 592253921.0, + "step": 22881 + }, + { + "epoch": 2.5128486712057985, + "grad_norm": 2.1833667755126953, + "learning_rate": 5e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.7789853811264038, + "num_tokens": 592275031.0, + "step": 22882 + }, + { + "epoch": 2.512958488908412, + "grad_norm": 2.145251512527466, + "learning_rate": 5e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7500602006912231, + "num_tokens": 592302103.0, + "step": 22883 + }, + { + "epoch": 2.5130683066110255, + "grad_norm": 1.9423680305480957, + "learning_rate": 5e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7469202280044556, + "num_tokens": 592331256.0, + "step": 22884 + }, + { + "epoch": 2.5131781243136393, + "grad_norm": 2.2552874088287354, + "learning_rate": 5e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7686868906021118, + "num_tokens": 592355845.0, + "step": 22885 + }, + { + "epoch": 2.513287942016253, + "grad_norm": 2.207679033279419, + "learning_rate": 5e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7565523386001587, + "num_tokens": 592380758.0, + "step": 22886 + }, + { + "epoch": 2.513397759718867, + "grad_norm": 2.020538330078125, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7631346583366394, + "num_tokens": 592406575.0, + "step": 22887 + }, + { + "epoch": 2.51350757742148, + "grad_norm": 1.838938593864441, + "learning_rate": 5e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.7439122796058655, + "num_tokens": 592437656.0, + "step": 22888 + }, + { + "epoch": 2.513617395124094, + "grad_norm": 2.0264861583709717, + "learning_rate": 5e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7590787410736084, + "num_tokens": 592463968.0, + "step": 22889 + }, + { + "epoch": 2.5137272128267076, + "grad_norm": 2.008007049560547, + "learning_rate": 5e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7580525875091553, + "num_tokens": 592493298.0, + "step": 22890 + }, + { + "epoch": 2.5138370305293214, + "grad_norm": 1.9072542190551758, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7595967054367065, + "num_tokens": 592523064.0, + "step": 22891 + }, + { + "epoch": 2.513946848231935, + "grad_norm": 2.340148448944092, + "learning_rate": 5e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.771705150604248, + "num_tokens": 592544970.0, + "step": 22892 + }, + { + "epoch": 2.5140566659345485, + "grad_norm": 2.003145217895508, + "learning_rate": 5e-06, + "loss": 0.6454, + "mean_token_accuracy": 0.7870554327964783, + "num_tokens": 592570971.0, + "step": 22893 + }, + { + "epoch": 2.514166483637162, + "grad_norm": 2.6262733936309814, + "learning_rate": 5e-06, + "loss": 0.5707, + "mean_token_accuracy": 0.8083620667457581, + "num_tokens": 592587461.0, + "step": 22894 + }, + { + "epoch": 2.514276301339776, + "grad_norm": 2.198007345199585, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7547173500061035, + "num_tokens": 592610619.0, + "step": 22895 + }, + { + "epoch": 2.5143861190423897, + "grad_norm": 1.7499748468399048, + "learning_rate": 5e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7466263771057129, + "num_tokens": 592644962.0, + "step": 22896 + }, + { + "epoch": 2.5144959367450035, + "grad_norm": 2.597381830215454, + "learning_rate": 5e-06, + "loss": 0.6488, + "mean_token_accuracy": 0.7873373031616211, + "num_tokens": 592661515.0, + "step": 22897 + }, + { + "epoch": 2.514605754447617, + "grad_norm": 1.9200903177261353, + "learning_rate": 5e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.7807937264442444, + "num_tokens": 592690001.0, + "step": 22898 + }, + { + "epoch": 2.5147155721502306, + "grad_norm": 2.0126142501831055, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.778276264667511, + "num_tokens": 592715469.0, + "step": 22899 + }, + { + "epoch": 2.5148253898528443, + "grad_norm": 2.3666789531707764, + "learning_rate": 5e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7700029015541077, + "num_tokens": 592737064.0, + "step": 22900 + }, + { + "epoch": 2.514935207555458, + "grad_norm": 1.8743044137954712, + "learning_rate": 5e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.750987708568573, + "num_tokens": 592769105.0, + "step": 22901 + }, + { + "epoch": 2.515045025258072, + "grad_norm": 2.343287706375122, + "learning_rate": 5e-06, + "loss": 0.6399, + "mean_token_accuracy": 0.7857118248939514, + "num_tokens": 592789727.0, + "step": 22902 + }, + { + "epoch": 2.515154842960685, + "grad_norm": 1.8708723783493042, + "learning_rate": 5e-06, + "loss": 0.6927, + "mean_token_accuracy": 0.7678272724151611, + "num_tokens": 592820041.0, + "step": 22903 + }, + { + "epoch": 2.515264660663299, + "grad_norm": 1.982815146446228, + "learning_rate": 5e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.7767027020454407, + "num_tokens": 592848370.0, + "step": 22904 + }, + { + "epoch": 2.5153744783659127, + "grad_norm": 2.2184741497039795, + "learning_rate": 5e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.7535858750343323, + "num_tokens": 592871668.0, + "step": 22905 + }, + { + "epoch": 2.515484296068526, + "grad_norm": 2.0618443489074707, + "learning_rate": 5e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.7730263471603394, + "num_tokens": 592897479.0, + "step": 22906 + }, + { + "epoch": 2.51559411377114, + "grad_norm": 2.233429431915283, + "learning_rate": 5e-06, + "loss": 0.6416, + "mean_token_accuracy": 0.7926107048988342, + "num_tokens": 592918807.0, + "step": 22907 + }, + { + "epoch": 2.5157039314737535, + "grad_norm": 2.1809284687042236, + "learning_rate": 5e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.7903022766113281, + "num_tokens": 592941671.0, + "step": 22908 + }, + { + "epoch": 2.5158137491763672, + "grad_norm": 2.059947967529297, + "learning_rate": 5e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.7906012535095215, + "num_tokens": 592967240.0, + "step": 22909 + }, + { + "epoch": 2.515923566878981, + "grad_norm": 2.0647451877593994, + "learning_rate": 5e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.780183732509613, + "num_tokens": 592993286.0, + "step": 22910 + }, + { + "epoch": 2.5160333845815943, + "grad_norm": 2.3065288066864014, + "learning_rate": 5e-06, + "loss": 0.675, + "mean_token_accuracy": 0.782424807548523, + "num_tokens": 593014633.0, + "step": 22911 + }, + { + "epoch": 2.516143202284208, + "grad_norm": 1.8052688837051392, + "learning_rate": 5e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7629268169403076, + "num_tokens": 593045929.0, + "step": 22912 + }, + { + "epoch": 2.516253019986822, + "grad_norm": 2.148305892944336, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7550977468490601, + "num_tokens": 593071601.0, + "step": 22913 + }, + { + "epoch": 2.5163628376894356, + "grad_norm": 2.0444536209106445, + "learning_rate": 5e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.7499529719352722, + "num_tokens": 593099799.0, + "step": 22914 + }, + { + "epoch": 2.5164726553920493, + "grad_norm": 2.420928478240967, + "learning_rate": 5e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7694482803344727, + "num_tokens": 593121115.0, + "step": 22915 + }, + { + "epoch": 2.5165824730946627, + "grad_norm": 2.0310630798339844, + "learning_rate": 5e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.7531880140304565, + "num_tokens": 593149561.0, + "step": 22916 + }, + { + "epoch": 2.5166922907972764, + "grad_norm": 2.2962985038757324, + "learning_rate": 5e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.7547619938850403, + "num_tokens": 593172415.0, + "step": 22917 + }, + { + "epoch": 2.51680210849989, + "grad_norm": 2.033287286758423, + "learning_rate": 5e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.766671895980835, + "num_tokens": 593200076.0, + "step": 22918 + }, + { + "epoch": 2.516911926202504, + "grad_norm": 1.8979194164276123, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.776594340801239, + "num_tokens": 593230484.0, + "step": 22919 + }, + { + "epoch": 2.5170217439051177, + "grad_norm": 2.334951877593994, + "learning_rate": 5e-06, + "loss": 0.6798, + "mean_token_accuracy": 0.7723071575164795, + "num_tokens": 593251977.0, + "step": 22920 + }, + { + "epoch": 2.517131561607731, + "grad_norm": 2.2830464839935303, + "learning_rate": 5e-06, + "loss": 0.735, + "mean_token_accuracy": 0.760316014289856, + "num_tokens": 593273523.0, + "step": 22921 + }, + { + "epoch": 2.5172413793103448, + "grad_norm": 2.518817663192749, + "learning_rate": 5e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.766484797000885, + "num_tokens": 593293411.0, + "step": 22922 + }, + { + "epoch": 2.5173511970129585, + "grad_norm": 2.027869701385498, + "learning_rate": 5e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7696162462234497, + "num_tokens": 593323189.0, + "step": 22923 + }, + { + "epoch": 2.5174610147155723, + "grad_norm": 2.0479609966278076, + "learning_rate": 5e-06, + "loss": 0.6754, + "mean_token_accuracy": 0.7758361101150513, + "num_tokens": 593347869.0, + "step": 22924 + }, + { + "epoch": 2.517570832418186, + "grad_norm": 1.945622205734253, + "learning_rate": 5e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.7756671905517578, + "num_tokens": 593378740.0, + "step": 22925 + }, + { + "epoch": 2.5176806501207993, + "grad_norm": 1.8217799663543701, + "learning_rate": 5e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7413516044616699, + "num_tokens": 593411477.0, + "step": 22926 + }, + { + "epoch": 2.517790467823413, + "grad_norm": 2.178372859954834, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7588598728179932, + "num_tokens": 593437079.0, + "step": 22927 + }, + { + "epoch": 2.517900285526027, + "grad_norm": 1.94161057472229, + "learning_rate": 5e-06, + "loss": 0.6477, + "mean_token_accuracy": 0.7847380638122559, + "num_tokens": 593462455.0, + "step": 22928 + }, + { + "epoch": 2.5180101032286406, + "grad_norm": 2.1029231548309326, + "learning_rate": 5e-06, + "loss": 0.5798, + "mean_token_accuracy": 0.8035930395126343, + "num_tokens": 593485758.0, + "step": 22929 + }, + { + "epoch": 2.5181199209312544, + "grad_norm": 1.9069268703460693, + "learning_rate": 5e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7530604600906372, + "num_tokens": 593513976.0, + "step": 22930 + }, + { + "epoch": 2.5182297386338677, + "grad_norm": 1.949684739112854, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7723557353019714, + "num_tokens": 593539604.0, + "step": 22931 + }, + { + "epoch": 2.5183395563364814, + "grad_norm": 2.2290334701538086, + "learning_rate": 5e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.7761485576629639, + "num_tokens": 593561304.0, + "step": 22932 + }, + { + "epoch": 2.518449374039095, + "grad_norm": 1.8914587497711182, + "learning_rate": 5e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7740185260772705, + "num_tokens": 593588764.0, + "step": 22933 + }, + { + "epoch": 2.5185591917417085, + "grad_norm": 1.9752192497253418, + "learning_rate": 5e-06, + "loss": 0.6855, + "mean_token_accuracy": 0.7909131050109863, + "num_tokens": 593615840.0, + "step": 22934 + }, + { + "epoch": 2.5186690094443223, + "grad_norm": 2.1281285285949707, + "learning_rate": 5e-06, + "loss": 0.6424, + "mean_token_accuracy": 0.7895407676696777, + "num_tokens": 593638381.0, + "step": 22935 + }, + { + "epoch": 2.518778827146936, + "grad_norm": 2.2995188236236572, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7756008505821228, + "num_tokens": 593659822.0, + "step": 22936 + }, + { + "epoch": 2.5188886448495498, + "grad_norm": 1.8715786933898926, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7572166919708252, + "num_tokens": 593690538.0, + "step": 22937 + }, + { + "epoch": 2.5189984625521635, + "grad_norm": 1.9146106243133545, + "learning_rate": 5e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7711412906646729, + "num_tokens": 593721106.0, + "step": 22938 + }, + { + "epoch": 2.519108280254777, + "grad_norm": 2.2661259174346924, + "learning_rate": 5e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7573118805885315, + "num_tokens": 593743645.0, + "step": 22939 + }, + { + "epoch": 2.5192180979573906, + "grad_norm": 2.102370500564575, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7526928186416626, + "num_tokens": 593769192.0, + "step": 22940 + }, + { + "epoch": 2.5193279156600044, + "grad_norm": 2.028686285018921, + "learning_rate": 5e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.742990255355835, + "num_tokens": 593797957.0, + "step": 22941 + }, + { + "epoch": 2.519437733362618, + "grad_norm": 2.1325926780700684, + "learning_rate": 5e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7488213777542114, + "num_tokens": 593824065.0, + "step": 22942 + }, + { + "epoch": 2.519547551065232, + "grad_norm": 2.156237840652466, + "learning_rate": 5e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7662539482116699, + "num_tokens": 593850270.0, + "step": 22943 + }, + { + "epoch": 2.519657368767845, + "grad_norm": 2.169620990753174, + "learning_rate": 5e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7523722648620605, + "num_tokens": 593877108.0, + "step": 22944 + }, + { + "epoch": 2.519767186470459, + "grad_norm": 2.3726868629455566, + "learning_rate": 5e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7795971632003784, + "num_tokens": 593897960.0, + "step": 22945 + }, + { + "epoch": 2.5198770041730727, + "grad_norm": 2.073493003845215, + "learning_rate": 5e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7371927499771118, + "num_tokens": 593925687.0, + "step": 22946 + }, + { + "epoch": 2.5199868218756865, + "grad_norm": 2.318424701690674, + "learning_rate": 5e-06, + "loss": 0.7263, + "mean_token_accuracy": 0.7669588327407837, + "num_tokens": 593950278.0, + "step": 22947 + }, + { + "epoch": 2.5200966395783, + "grad_norm": 1.995031714439392, + "learning_rate": 5e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.7646050453186035, + "num_tokens": 593978778.0, + "step": 22948 + }, + { + "epoch": 2.5202064572809135, + "grad_norm": 1.9528805017471313, + "learning_rate": 5e-06, + "loss": 0.6331, + "mean_token_accuracy": 0.7992621064186096, + "num_tokens": 594003322.0, + "step": 22949 + }, + { + "epoch": 2.5203162749835273, + "grad_norm": 2.2992019653320312, + "learning_rate": 5e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7711563110351562, + "num_tokens": 594024399.0, + "step": 22950 + }, + { + "epoch": 2.520426092686141, + "grad_norm": 2.051422119140625, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7506921887397766, + "num_tokens": 594050081.0, + "step": 22951 + }, + { + "epoch": 2.520535910388755, + "grad_norm": 2.1844136714935303, + "learning_rate": 5e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7686981558799744, + "num_tokens": 594074971.0, + "step": 22952 + }, + { + "epoch": 2.5206457280913686, + "grad_norm": 1.7968825101852417, + "learning_rate": 5e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7616980075836182, + "num_tokens": 594108621.0, + "step": 22953 + }, + { + "epoch": 2.520755545793982, + "grad_norm": 2.157623767852783, + "learning_rate": 5e-06, + "loss": 0.6571, + "mean_token_accuracy": 0.7824569940567017, + "num_tokens": 594132332.0, + "step": 22954 + }, + { + "epoch": 2.5208653634965956, + "grad_norm": 1.9293792247772217, + "learning_rate": 5e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7502285838127136, + "num_tokens": 594161162.0, + "step": 22955 + }, + { + "epoch": 2.5209751811992094, + "grad_norm": 2.0900485515594482, + "learning_rate": 5e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.7664130926132202, + "num_tokens": 594188100.0, + "step": 22956 + }, + { + "epoch": 2.521084998901823, + "grad_norm": 2.027461051940918, + "learning_rate": 5e-06, + "loss": 0.6821, + "mean_token_accuracy": 0.7695589661598206, + "num_tokens": 594214389.0, + "step": 22957 + }, + { + "epoch": 2.521194816604437, + "grad_norm": 2.1207759380340576, + "learning_rate": 5e-06, + "loss": 0.6158, + "mean_token_accuracy": 0.8014956712722778, + "num_tokens": 594240256.0, + "step": 22958 + }, + { + "epoch": 2.52130463430705, + "grad_norm": 1.7960290908813477, + "learning_rate": 5e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7459555864334106, + "num_tokens": 594273404.0, + "step": 22959 + }, + { + "epoch": 2.521414452009664, + "grad_norm": 1.9043877124786377, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7606325149536133, + "num_tokens": 594302692.0, + "step": 22960 + }, + { + "epoch": 2.5215242697122777, + "grad_norm": 2.1271791458129883, + "learning_rate": 5e-06, + "loss": 0.7547, + "mean_token_accuracy": 0.7517697811126709, + "num_tokens": 594328864.0, + "step": 22961 + }, + { + "epoch": 2.521634087414891, + "grad_norm": 2.09855055809021, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7716239094734192, + "num_tokens": 594353955.0, + "step": 22962 + }, + { + "epoch": 2.521743905117505, + "grad_norm": 2.215883255004883, + "learning_rate": 5e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7708429098129272, + "num_tokens": 594381298.0, + "step": 22963 + }, + { + "epoch": 2.5218537228201185, + "grad_norm": 2.069739818572998, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7710875272750854, + "num_tokens": 594405646.0, + "step": 22964 + }, + { + "epoch": 2.5219635405227323, + "grad_norm": 2.334303617477417, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7659702301025391, + "num_tokens": 594428903.0, + "step": 22965 + }, + { + "epoch": 2.522073358225346, + "grad_norm": 1.9993809461593628, + "learning_rate": 5e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7593809962272644, + "num_tokens": 594457737.0, + "step": 22966 + }, + { + "epoch": 2.5221831759279594, + "grad_norm": 2.565288543701172, + "learning_rate": 5e-06, + "loss": 0.635, + "mean_token_accuracy": 0.7878440618515015, + "num_tokens": 594476108.0, + "step": 22967 + }, + { + "epoch": 2.522292993630573, + "grad_norm": 2.099599838256836, + "learning_rate": 5e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.75641930103302, + "num_tokens": 594503113.0, + "step": 22968 + }, + { + "epoch": 2.522402811333187, + "grad_norm": 1.8811335563659668, + "learning_rate": 5e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7517651915550232, + "num_tokens": 594532700.0, + "step": 22969 + }, + { + "epoch": 2.5225126290358006, + "grad_norm": 1.912346363067627, + "learning_rate": 5e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.781484842300415, + "num_tokens": 594562090.0, + "step": 22970 + }, + { + "epoch": 2.5226224467384144, + "grad_norm": 2.085599184036255, + "learning_rate": 5e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.7654181718826294, + "num_tokens": 594587609.0, + "step": 22971 + }, + { + "epoch": 2.5227322644410277, + "grad_norm": 2.1694555282592773, + "learning_rate": 5e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.7901818752288818, + "num_tokens": 594611064.0, + "step": 22972 + }, + { + "epoch": 2.5228420821436415, + "grad_norm": 1.891976237297058, + "learning_rate": 5e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7654460072517395, + "num_tokens": 594641522.0, + "step": 22973 + }, + { + "epoch": 2.5229518998462552, + "grad_norm": 2.3067054748535156, + "learning_rate": 5e-06, + "loss": 0.7003, + "mean_token_accuracy": 0.7735295295715332, + "num_tokens": 594661867.0, + "step": 22974 + }, + { + "epoch": 2.523061717548869, + "grad_norm": 1.9923523664474487, + "learning_rate": 5e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.76172935962677, + "num_tokens": 594691069.0, + "step": 22975 + }, + { + "epoch": 2.5231715352514827, + "grad_norm": 2.0332283973693848, + "learning_rate": 5e-06, + "loss": 0.6606, + "mean_token_accuracy": 0.785162091255188, + "num_tokens": 594717752.0, + "step": 22976 + }, + { + "epoch": 2.523281352954096, + "grad_norm": 2.0454418659210205, + "learning_rate": 5e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.783977746963501, + "num_tokens": 594745678.0, + "step": 22977 + }, + { + "epoch": 2.52339117065671, + "grad_norm": 2.2734591960906982, + "learning_rate": 5e-06, + "loss": 0.661, + "mean_token_accuracy": 0.7759923934936523, + "num_tokens": 594765890.0, + "step": 22978 + }, + { + "epoch": 2.5235009883593236, + "grad_norm": 2.231856107711792, + "learning_rate": 5e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7556078433990479, + "num_tokens": 594791481.0, + "step": 22979 + }, + { + "epoch": 2.5236108060619373, + "grad_norm": 1.9211801290512085, + "learning_rate": 5e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7625001668930054, + "num_tokens": 594822888.0, + "step": 22980 + }, + { + "epoch": 2.523720623764551, + "grad_norm": 1.9631385803222656, + "learning_rate": 5e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7330629825592041, + "num_tokens": 594854114.0, + "step": 22981 + }, + { + "epoch": 2.5238304414671644, + "grad_norm": 2.213840961456299, + "learning_rate": 5e-06, + "loss": 0.6413, + "mean_token_accuracy": 0.7827898859977722, + "num_tokens": 594877602.0, + "step": 22982 + }, + { + "epoch": 2.523940259169778, + "grad_norm": 1.876033902168274, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7567553520202637, + "num_tokens": 594907443.0, + "step": 22983 + }, + { + "epoch": 2.524050076872392, + "grad_norm": 2.0593297481536865, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7451298236846924, + "num_tokens": 594936291.0, + "step": 22984 + }, + { + "epoch": 2.5241598945750052, + "grad_norm": 2.0241410732269287, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.764324963092804, + "num_tokens": 594963973.0, + "step": 22985 + }, + { + "epoch": 2.5242697122776194, + "grad_norm": 2.2461259365081787, + "learning_rate": 5e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7642988562583923, + "num_tokens": 594987323.0, + "step": 22986 + }, + { + "epoch": 2.5243795299802327, + "grad_norm": 2.0538835525512695, + "learning_rate": 5e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.7755924463272095, + "num_tokens": 595013139.0, + "step": 22987 + }, + { + "epoch": 2.5244893476828465, + "grad_norm": 2.2905213832855225, + "learning_rate": 5e-06, + "loss": 0.6939, + "mean_token_accuracy": 0.7740731239318848, + "num_tokens": 595034485.0, + "step": 22988 + }, + { + "epoch": 2.5245991653854603, + "grad_norm": 1.904555320739746, + "learning_rate": 5e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7637407779693604, + "num_tokens": 595064217.0, + "step": 22989 + }, + { + "epoch": 2.5247089830880736, + "grad_norm": 2.0579051971435547, + "learning_rate": 5e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7487272024154663, + "num_tokens": 595094016.0, + "step": 22990 + }, + { + "epoch": 2.5248188007906873, + "grad_norm": 2.199244499206543, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7542659044265747, + "num_tokens": 595118537.0, + "step": 22991 + }, + { + "epoch": 2.524928618493301, + "grad_norm": 2.154953718185425, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7661272287368774, + "num_tokens": 595143260.0, + "step": 22992 + }, + { + "epoch": 2.525038436195915, + "grad_norm": 2.237422466278076, + "learning_rate": 5e-06, + "loss": 0.6763, + "mean_token_accuracy": 0.78167724609375, + "num_tokens": 595164790.0, + "step": 22993 + }, + { + "epoch": 2.5251482538985286, + "grad_norm": 2.2246475219726562, + "learning_rate": 5e-06, + "loss": 0.6311, + "mean_token_accuracy": 0.7872141599655151, + "num_tokens": 595188615.0, + "step": 22994 + }, + { + "epoch": 2.525258071601142, + "grad_norm": 2.0243618488311768, + "learning_rate": 5e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7579994201660156, + "num_tokens": 595219784.0, + "step": 22995 + }, + { + "epoch": 2.5253678893037557, + "grad_norm": 2.0966670513153076, + "learning_rate": 5e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7655954360961914, + "num_tokens": 595244374.0, + "step": 22996 + }, + { + "epoch": 2.5254777070063694, + "grad_norm": 2.063610076904297, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7602012157440186, + "num_tokens": 595269732.0, + "step": 22997 + }, + { + "epoch": 2.525587524708983, + "grad_norm": 1.8460216522216797, + "learning_rate": 5e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7535300254821777, + "num_tokens": 595300735.0, + "step": 22998 + }, + { + "epoch": 2.525697342411597, + "grad_norm": 2.191075086593628, + "learning_rate": 5e-06, + "loss": 0.7617, + "mean_token_accuracy": 0.7519668340682983, + "num_tokens": 595325304.0, + "step": 22999 + }, + { + "epoch": 2.5258071601142102, + "grad_norm": 2.2453770637512207, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7553945779800415, + "num_tokens": 595349897.0, + "step": 23000 + }, + { + "epoch": 2.525916977816824, + "grad_norm": 1.8441283702850342, + "learning_rate": 5e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7540483474731445, + "num_tokens": 595382477.0, + "step": 23001 + }, + { + "epoch": 2.5260267955194378, + "grad_norm": 2.191196918487549, + "learning_rate": 5e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7537773847579956, + "num_tokens": 595407705.0, + "step": 23002 + }, + { + "epoch": 2.5261366132220515, + "grad_norm": 2.1801767349243164, + "learning_rate": 5e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7642855644226074, + "num_tokens": 595430455.0, + "step": 23003 + }, + { + "epoch": 2.5262464309246653, + "grad_norm": 2.116300106048584, + "learning_rate": 5e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7523449063301086, + "num_tokens": 595456322.0, + "step": 23004 + }, + { + "epoch": 2.5263562486272786, + "grad_norm": 2.289086103439331, + "learning_rate": 5e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7583284378051758, + "num_tokens": 595480423.0, + "step": 23005 + }, + { + "epoch": 2.5264660663298923, + "grad_norm": 2.2621190547943115, + "learning_rate": 5e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7501264810562134, + "num_tokens": 595504464.0, + "step": 23006 + }, + { + "epoch": 2.526575884032506, + "grad_norm": 2.3470189571380615, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7615377902984619, + "num_tokens": 595526337.0, + "step": 23007 + }, + { + "epoch": 2.52668570173512, + "grad_norm": 2.2114827632904053, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7573641538619995, + "num_tokens": 595551645.0, + "step": 23008 + }, + { + "epoch": 2.5267955194377336, + "grad_norm": 2.041924476623535, + "learning_rate": 5e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.7807576656341553, + "num_tokens": 595579105.0, + "step": 23009 + }, + { + "epoch": 2.526905337140347, + "grad_norm": 2.3118245601654053, + "learning_rate": 5e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.7682051062583923, + "num_tokens": 595600223.0, + "step": 23010 + }, + { + "epoch": 2.5270151548429607, + "grad_norm": 2.3531289100646973, + "learning_rate": 5e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.7790534496307373, + "num_tokens": 595623626.0, + "step": 23011 + }, + { + "epoch": 2.5271249725455744, + "grad_norm": 2.0163533687591553, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7628205418586731, + "num_tokens": 595649868.0, + "step": 23012 + }, + { + "epoch": 2.5272347902481878, + "grad_norm": 2.1670918464660645, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7633205652236938, + "num_tokens": 595676841.0, + "step": 23013 + }, + { + "epoch": 2.5273446079508015, + "grad_norm": 1.9513341188430786, + "learning_rate": 5e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7552975416183472, + "num_tokens": 595705797.0, + "step": 23014 + }, + { + "epoch": 2.5274544256534153, + "grad_norm": 2.161141872406006, + "learning_rate": 5e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.7632906436920166, + "num_tokens": 595729929.0, + "step": 23015 + }, + { + "epoch": 2.527564243356029, + "grad_norm": 2.3077845573425293, + "learning_rate": 5e-06, + "loss": 0.6351, + "mean_token_accuracy": 0.7860534191131592, + "num_tokens": 595751656.0, + "step": 23016 + }, + { + "epoch": 2.527674061058643, + "grad_norm": 2.1235711574554443, + "learning_rate": 5e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7638877034187317, + "num_tokens": 595775851.0, + "step": 23017 + }, + { + "epoch": 2.527783878761256, + "grad_norm": 2.0228919982910156, + "learning_rate": 5e-06, + "loss": 0.647, + "mean_token_accuracy": 0.7872446179389954, + "num_tokens": 595801547.0, + "step": 23018 + }, + { + "epoch": 2.52789369646387, + "grad_norm": 2.107630491256714, + "learning_rate": 5e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.7814885973930359, + "num_tokens": 595825630.0, + "step": 23019 + }, + { + "epoch": 2.5280035141664836, + "grad_norm": 2.2216358184814453, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7695722579956055, + "num_tokens": 595850094.0, + "step": 23020 + }, + { + "epoch": 2.5281133318690974, + "grad_norm": 2.220078468322754, + "learning_rate": 5e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7638894319534302, + "num_tokens": 595873381.0, + "step": 23021 + }, + { + "epoch": 2.528223149571711, + "grad_norm": 2.049825668334961, + "learning_rate": 5e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.7622490525245667, + "num_tokens": 595898338.0, + "step": 23022 + }, + { + "epoch": 2.5283329672743244, + "grad_norm": 2.1481306552886963, + "learning_rate": 5e-06, + "loss": 0.65, + "mean_token_accuracy": 0.7887311577796936, + "num_tokens": 595921312.0, + "step": 23023 + }, + { + "epoch": 2.528442784976938, + "grad_norm": 2.063180685043335, + "learning_rate": 5e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.7744759321212769, + "num_tokens": 595946552.0, + "step": 23024 + }, + { + "epoch": 2.528552602679552, + "grad_norm": 2.2677218914031982, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7681203484535217, + "num_tokens": 595968939.0, + "step": 23025 + }, + { + "epoch": 2.5286624203821657, + "grad_norm": 2.1549675464630127, + "learning_rate": 5e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7769603729248047, + "num_tokens": 595992953.0, + "step": 23026 + }, + { + "epoch": 2.5287722380847795, + "grad_norm": 2.1160309314727783, + "learning_rate": 5e-06, + "loss": 0.6556, + "mean_token_accuracy": 0.7856265902519226, + "num_tokens": 596020049.0, + "step": 23027 + }, + { + "epoch": 2.528882055787393, + "grad_norm": 2.3058066368103027, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7485562562942505, + "num_tokens": 596044152.0, + "step": 23028 + }, + { + "epoch": 2.5289918734900065, + "grad_norm": 1.9849389791488647, + "learning_rate": 5e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7610903978347778, + "num_tokens": 596072537.0, + "step": 23029 + }, + { + "epoch": 2.5291016911926203, + "grad_norm": 2.1106579303741455, + "learning_rate": 5e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7564367651939392, + "num_tokens": 596099076.0, + "step": 23030 + }, + { + "epoch": 2.529211508895234, + "grad_norm": 2.1154377460479736, + "learning_rate": 5e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7502471208572388, + "num_tokens": 596125960.0, + "step": 23031 + }, + { + "epoch": 2.529321326597848, + "grad_norm": 2.2198407649993896, + "learning_rate": 5e-06, + "loss": 0.6133, + "mean_token_accuracy": 0.8005068302154541, + "num_tokens": 596149174.0, + "step": 23032 + }, + { + "epoch": 2.529431144300461, + "grad_norm": 2.2792441844940186, + "learning_rate": 5e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.7834362983703613, + "num_tokens": 596171618.0, + "step": 23033 + }, + { + "epoch": 2.529540962003075, + "grad_norm": 1.821621060371399, + "learning_rate": 5e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7790063619613647, + "num_tokens": 596201884.0, + "step": 23034 + }, + { + "epoch": 2.5296507797056886, + "grad_norm": 2.1384706497192383, + "learning_rate": 5e-06, + "loss": 0.6843, + "mean_token_accuracy": 0.77016282081604, + "num_tokens": 596224714.0, + "step": 23035 + }, + { + "epoch": 2.529760597408302, + "grad_norm": 2.1512227058410645, + "learning_rate": 5e-06, + "loss": 0.6712, + "mean_token_accuracy": 0.7806607484817505, + "num_tokens": 596248719.0, + "step": 23036 + }, + { + "epoch": 2.529870415110916, + "grad_norm": 2.025391101837158, + "learning_rate": 5e-06, + "loss": 0.6146, + "mean_token_accuracy": 0.7864808440208435, + "num_tokens": 596273990.0, + "step": 23037 + }, + { + "epoch": 2.5299802328135295, + "grad_norm": 2.369720458984375, + "learning_rate": 5e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.7854089736938477, + "num_tokens": 596294821.0, + "step": 23038 + }, + { + "epoch": 2.530090050516143, + "grad_norm": 2.455592393875122, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7561269998550415, + "num_tokens": 596317944.0, + "step": 23039 + }, + { + "epoch": 2.530199868218757, + "grad_norm": 1.8939038515090942, + "learning_rate": 5e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7505494356155396, + "num_tokens": 596350883.0, + "step": 23040 + }, + { + "epoch": 2.5303096859213703, + "grad_norm": 1.9332523345947266, + "learning_rate": 5e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7243295907974243, + "num_tokens": 596385085.0, + "step": 23041 + }, + { + "epoch": 2.530419503623984, + "grad_norm": 1.9983181953430176, + "learning_rate": 5e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7721405625343323, + "num_tokens": 596414079.0, + "step": 23042 + }, + { + "epoch": 2.530529321326598, + "grad_norm": 2.075577735900879, + "learning_rate": 5e-06, + "loss": 0.6497, + "mean_token_accuracy": 0.7889429926872253, + "num_tokens": 596437897.0, + "step": 23043 + }, + { + "epoch": 2.5306391390292116, + "grad_norm": 1.7160688638687134, + "learning_rate": 5e-06, + "loss": 0.6517, + "mean_token_accuracy": 0.7871477007865906, + "num_tokens": 596470283.0, + "step": 23044 + }, + { + "epoch": 2.5307489567318253, + "grad_norm": 1.909121036529541, + "learning_rate": 5e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.777193009853363, + "num_tokens": 596497642.0, + "step": 23045 + }, + { + "epoch": 2.5308587744344386, + "grad_norm": 1.720458745956421, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7601943016052246, + "num_tokens": 596532707.0, + "step": 23046 + }, + { + "epoch": 2.5309685921370524, + "grad_norm": 1.8354792594909668, + "learning_rate": 5e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7549140453338623, + "num_tokens": 596564006.0, + "step": 23047 + }, + { + "epoch": 2.531078409839666, + "grad_norm": 1.9851953983306885, + "learning_rate": 5e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7602288722991943, + "num_tokens": 596592506.0, + "step": 23048 + }, + { + "epoch": 2.53118822754228, + "grad_norm": 2.04211163520813, + "learning_rate": 5e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7468284964561462, + "num_tokens": 596619094.0, + "step": 23049 + }, + { + "epoch": 2.5312980452448937, + "grad_norm": 2.0042543411254883, + "learning_rate": 5e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7374909520149231, + "num_tokens": 596647233.0, + "step": 23050 + }, + { + "epoch": 2.531407862947507, + "grad_norm": 1.9659770727157593, + "learning_rate": 5e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7656249403953552, + "num_tokens": 596678206.0, + "step": 23051 + }, + { + "epoch": 2.5315176806501207, + "grad_norm": 1.6840585470199585, + "learning_rate": 5e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7512892484664917, + "num_tokens": 596715081.0, + "step": 23052 + }, + { + "epoch": 2.5316274983527345, + "grad_norm": 2.0974080562591553, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7750664353370667, + "num_tokens": 596739357.0, + "step": 23053 + }, + { + "epoch": 2.5317373160553482, + "grad_norm": 2.185025215148926, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7593932151794434, + "num_tokens": 596763713.0, + "step": 23054 + }, + { + "epoch": 2.531847133757962, + "grad_norm": 1.8905766010284424, + "learning_rate": 5e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.7481204271316528, + "num_tokens": 596793227.0, + "step": 23055 + }, + { + "epoch": 2.5319569514605753, + "grad_norm": 1.9265129566192627, + "learning_rate": 5e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.74889075756073, + "num_tokens": 596823600.0, + "step": 23056 + }, + { + "epoch": 2.532066769163189, + "grad_norm": 2.289710283279419, + "learning_rate": 5e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7664651870727539, + "num_tokens": 596844657.0, + "step": 23057 + }, + { + "epoch": 2.532176586865803, + "grad_norm": 2.1620893478393555, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7427762746810913, + "num_tokens": 596871677.0, + "step": 23058 + }, + { + "epoch": 2.5322864045684166, + "grad_norm": 2.4477193355560303, + "learning_rate": 5e-06, + "loss": 0.673, + "mean_token_accuracy": 0.7738995552062988, + "num_tokens": 596891636.0, + "step": 23059 + }, + { + "epoch": 2.5323962222710303, + "grad_norm": 2.1537792682647705, + "learning_rate": 5e-06, + "loss": 0.6362, + "mean_token_accuracy": 0.7895510196685791, + "num_tokens": 596916863.0, + "step": 23060 + }, + { + "epoch": 2.5325060399736437, + "grad_norm": 1.9609789848327637, + "learning_rate": 5e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.774660050868988, + "num_tokens": 596942717.0, + "step": 23061 + }, + { + "epoch": 2.5326158576762574, + "grad_norm": 1.9707915782928467, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.763206958770752, + "num_tokens": 596971172.0, + "step": 23062 + }, + { + "epoch": 2.532725675378871, + "grad_norm": 1.8685367107391357, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.743058443069458, + "num_tokens": 597003919.0, + "step": 23063 + }, + { + "epoch": 2.5328354930814845, + "grad_norm": 2.0025224685668945, + "learning_rate": 5e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7568936944007874, + "num_tokens": 597035110.0, + "step": 23064 + }, + { + "epoch": 2.5329453107840982, + "grad_norm": 2.0046603679656982, + "learning_rate": 5e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7557868361473083, + "num_tokens": 597063712.0, + "step": 23065 + }, + { + "epoch": 2.533055128486712, + "grad_norm": 2.197981357574463, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7784243822097778, + "num_tokens": 597086557.0, + "step": 23066 + }, + { + "epoch": 2.5331649461893258, + "grad_norm": 2.5711493492126465, + "learning_rate": 5e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.7827575206756592, + "num_tokens": 597104811.0, + "step": 23067 + }, + { + "epoch": 2.5332747638919395, + "grad_norm": 1.8666409254074097, + "learning_rate": 5e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7677986025810242, + "num_tokens": 597134994.0, + "step": 23068 + }, + { + "epoch": 2.533384581594553, + "grad_norm": 2.0545878410339355, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7577463984489441, + "num_tokens": 597163773.0, + "step": 23069 + }, + { + "epoch": 2.5334943992971666, + "grad_norm": 2.4298489093780518, + "learning_rate": 5e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.7696935534477234, + "num_tokens": 597184147.0, + "step": 23070 + }, + { + "epoch": 2.5336042169997803, + "grad_norm": 2.0349414348602295, + "learning_rate": 5e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.73856121301651, + "num_tokens": 597212069.0, + "step": 23071 + }, + { + "epoch": 2.533714034702394, + "grad_norm": 1.9853655099868774, + "learning_rate": 5e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7466533184051514, + "num_tokens": 597239421.0, + "step": 23072 + }, + { + "epoch": 2.533823852405008, + "grad_norm": 1.8306790590286255, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7449764013290405, + "num_tokens": 597269026.0, + "step": 23073 + }, + { + "epoch": 2.533933670107621, + "grad_norm": 2.1119275093078613, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7468677759170532, + "num_tokens": 597293643.0, + "step": 23074 + }, + { + "epoch": 2.534043487810235, + "grad_norm": 1.8810621500015259, + "learning_rate": 5e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7469279170036316, + "num_tokens": 597325276.0, + "step": 23075 + }, + { + "epoch": 2.5341533055128487, + "grad_norm": 2.421126365661621, + "learning_rate": 5e-06, + "loss": 0.6717, + "mean_token_accuracy": 0.783237874507904, + "num_tokens": 597344858.0, + "step": 23076 + }, + { + "epoch": 2.5342631232154624, + "grad_norm": 2.1071383953094482, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7468470335006714, + "num_tokens": 597373226.0, + "step": 23077 + }, + { + "epoch": 2.534372940918076, + "grad_norm": 2.2588326930999756, + "learning_rate": 5e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7555198669433594, + "num_tokens": 597397223.0, + "step": 23078 + }, + { + "epoch": 2.5344827586206895, + "grad_norm": 1.9394596815109253, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7482402324676514, + "num_tokens": 597426198.0, + "step": 23079 + }, + { + "epoch": 2.5345925763233033, + "grad_norm": 2.0398099422454834, + "learning_rate": 5e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7580180168151855, + "num_tokens": 597454211.0, + "step": 23080 + }, + { + "epoch": 2.534702394025917, + "grad_norm": 1.9596751928329468, + "learning_rate": 5e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7732536792755127, + "num_tokens": 597482009.0, + "step": 23081 + }, + { + "epoch": 2.5348122117285308, + "grad_norm": 2.5128161907196045, + "learning_rate": 5e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7734307646751404, + "num_tokens": 597500242.0, + "step": 23082 + }, + { + "epoch": 2.5349220294311445, + "grad_norm": 2.042517900466919, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7719530463218689, + "num_tokens": 597525070.0, + "step": 23083 + }, + { + "epoch": 2.535031847133758, + "grad_norm": 2.229907512664795, + "learning_rate": 5e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.7843443155288696, + "num_tokens": 597548651.0, + "step": 23084 + }, + { + "epoch": 2.5351416648363716, + "grad_norm": 2.099998712539673, + "learning_rate": 5e-06, + "loss": 0.6771, + "mean_token_accuracy": 0.7692021131515503, + "num_tokens": 597573275.0, + "step": 23085 + }, + { + "epoch": 2.5352514825389854, + "grad_norm": 2.1387195587158203, + "learning_rate": 5e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7633576989173889, + "num_tokens": 597600624.0, + "step": 23086 + }, + { + "epoch": 2.5353613002415987, + "grad_norm": 1.9947197437286377, + "learning_rate": 5e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.7770254611968994, + "num_tokens": 597627521.0, + "step": 23087 + }, + { + "epoch": 2.535471117944213, + "grad_norm": 2.045607328414917, + "learning_rate": 5e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7601912617683411, + "num_tokens": 597655127.0, + "step": 23088 + }, + { + "epoch": 2.535580935646826, + "grad_norm": 2.051410675048828, + "learning_rate": 5e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7505068182945251, + "num_tokens": 597680969.0, + "step": 23089 + }, + { + "epoch": 2.53569075334944, + "grad_norm": 1.9886679649353027, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7478090524673462, + "num_tokens": 597710669.0, + "step": 23090 + }, + { + "epoch": 2.5358005710520537, + "grad_norm": 1.9538191556930542, + "learning_rate": 5e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7302603125572205, + "num_tokens": 597741525.0, + "step": 23091 + }, + { + "epoch": 2.535910388754667, + "grad_norm": 2.1877245903015137, + "learning_rate": 5e-06, + "loss": 0.6717, + "mean_token_accuracy": 0.772337794303894, + "num_tokens": 597764481.0, + "step": 23092 + }, + { + "epoch": 2.5360202064572808, + "grad_norm": 2.0115890502929688, + "learning_rate": 5e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7635204195976257, + "num_tokens": 597792209.0, + "step": 23093 + }, + { + "epoch": 2.5361300241598945, + "grad_norm": 2.3006632328033447, + "learning_rate": 5e-06, + "loss": 0.6688, + "mean_token_accuracy": 0.78629469871521, + "num_tokens": 597813672.0, + "step": 23094 + }, + { + "epoch": 2.5362398418625083, + "grad_norm": 2.3042800426483154, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7681056261062622, + "num_tokens": 597838015.0, + "step": 23095 + }, + { + "epoch": 2.536349659565122, + "grad_norm": 1.9903534650802612, + "learning_rate": 5e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.762654721736908, + "num_tokens": 597864931.0, + "step": 23096 + }, + { + "epoch": 2.5364594772677354, + "grad_norm": 2.093937635421753, + "learning_rate": 5e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7493794560432434, + "num_tokens": 597892192.0, + "step": 23097 + }, + { + "epoch": 2.536569294970349, + "grad_norm": 1.9575573205947876, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.752760112285614, + "num_tokens": 597922252.0, + "step": 23098 + }, + { + "epoch": 2.536679112672963, + "grad_norm": 2.256401777267456, + "learning_rate": 5e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7691929340362549, + "num_tokens": 597944920.0, + "step": 23099 + }, + { + "epoch": 2.5367889303755766, + "grad_norm": 2.4732134342193604, + "learning_rate": 5e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.7743948698043823, + "num_tokens": 597963678.0, + "step": 23100 + }, + { + "epoch": 2.5368987480781904, + "grad_norm": 2.0671913623809814, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7811251282691956, + "num_tokens": 597987341.0, + "step": 23101 + }, + { + "epoch": 2.5370085657808037, + "grad_norm": 2.3032071590423584, + "learning_rate": 5e-06, + "loss": 0.6743, + "mean_token_accuracy": 0.7782942056655884, + "num_tokens": 598009176.0, + "step": 23102 + }, + { + "epoch": 2.5371183834834174, + "grad_norm": 1.910244107246399, + "learning_rate": 5e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7530116438865662, + "num_tokens": 598039387.0, + "step": 23103 + }, + { + "epoch": 2.537228201186031, + "grad_norm": 2.2099804878234863, + "learning_rate": 5e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7752203941345215, + "num_tokens": 598065495.0, + "step": 23104 + }, + { + "epoch": 2.537338018888645, + "grad_norm": 2.3685660362243652, + "learning_rate": 5e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7716232538223267, + "num_tokens": 598086095.0, + "step": 23105 + }, + { + "epoch": 2.5374478365912587, + "grad_norm": 2.011906385421753, + "learning_rate": 5e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7692837715148926, + "num_tokens": 598112977.0, + "step": 23106 + }, + { + "epoch": 2.537557654293872, + "grad_norm": 2.018643856048584, + "learning_rate": 5e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7596889734268188, + "num_tokens": 598138737.0, + "step": 23107 + }, + { + "epoch": 2.537667471996486, + "grad_norm": 1.7142130136489868, + "learning_rate": 5e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7593022584915161, + "num_tokens": 598172235.0, + "step": 23108 + }, + { + "epoch": 2.5377772896990995, + "grad_norm": 2.3070452213287354, + "learning_rate": 5e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7635642886161804, + "num_tokens": 598194812.0, + "step": 23109 + }, + { + "epoch": 2.5378871074017133, + "grad_norm": 1.8629060983657837, + "learning_rate": 5e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7548316121101379, + "num_tokens": 598225198.0, + "step": 23110 + }, + { + "epoch": 2.537996925104327, + "grad_norm": 2.216752529144287, + "learning_rate": 5e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7600323557853699, + "num_tokens": 598249938.0, + "step": 23111 + }, + { + "epoch": 2.5381067428069404, + "grad_norm": 2.2782623767852783, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7730029821395874, + "num_tokens": 598272024.0, + "step": 23112 + }, + { + "epoch": 2.538216560509554, + "grad_norm": 2.1672098636627197, + "learning_rate": 5e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7693172693252563, + "num_tokens": 598296895.0, + "step": 23113 + }, + { + "epoch": 2.538326378212168, + "grad_norm": 2.0974104404449463, + "learning_rate": 5e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7565244436264038, + "num_tokens": 598322266.0, + "step": 23114 + }, + { + "epoch": 2.538436195914781, + "grad_norm": 1.8917661905288696, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7636682987213135, + "num_tokens": 598350326.0, + "step": 23115 + }, + { + "epoch": 2.538546013617395, + "grad_norm": 2.250579833984375, + "learning_rate": 5e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.7748735547065735, + "num_tokens": 598372381.0, + "step": 23116 + }, + { + "epoch": 2.5386558313200087, + "grad_norm": 2.1309404373168945, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7662217617034912, + "num_tokens": 598398323.0, + "step": 23117 + }, + { + "epoch": 2.5387656490226225, + "grad_norm": 1.8972355127334595, + "learning_rate": 5e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7653203010559082, + "num_tokens": 598426745.0, + "step": 23118 + }, + { + "epoch": 2.5388754667252362, + "grad_norm": 2.0791783332824707, + "learning_rate": 5e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7674243450164795, + "num_tokens": 598451745.0, + "step": 23119 + }, + { + "epoch": 2.5389852844278495, + "grad_norm": 2.32029128074646, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7660026550292969, + "num_tokens": 598473623.0, + "step": 23120 + }, + { + "epoch": 2.5390951021304633, + "grad_norm": 2.2619214057922363, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7685699462890625, + "num_tokens": 598497584.0, + "step": 23121 + }, + { + "epoch": 2.539204919833077, + "grad_norm": 1.9378585815429688, + "learning_rate": 5e-06, + "loss": 0.697, + "mean_token_accuracy": 0.7737070918083191, + "num_tokens": 598527166.0, + "step": 23122 + }, + { + "epoch": 2.539314737535691, + "grad_norm": 1.8925087451934814, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.744364321231842, + "num_tokens": 598556645.0, + "step": 23123 + }, + { + "epoch": 2.5394245552383046, + "grad_norm": 2.09133243560791, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7682047486305237, + "num_tokens": 598580378.0, + "step": 23124 + }, + { + "epoch": 2.539534372940918, + "grad_norm": 2.4561221599578857, + "learning_rate": 5e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7602370381355286, + "num_tokens": 598601709.0, + "step": 23125 + }, + { + "epoch": 2.5396441906435316, + "grad_norm": 2.54771089553833, + "learning_rate": 5e-06, + "loss": 0.5985, + "mean_token_accuracy": 0.7940207719802856, + "num_tokens": 598619688.0, + "step": 23126 + }, + { + "epoch": 2.5397540083461454, + "grad_norm": 2.2267067432403564, + "learning_rate": 5e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7480252981185913, + "num_tokens": 598644726.0, + "step": 23127 + }, + { + "epoch": 2.539863826048759, + "grad_norm": 2.2366132736206055, + "learning_rate": 5e-06, + "loss": 0.6458, + "mean_token_accuracy": 0.7823972702026367, + "num_tokens": 598665287.0, + "step": 23128 + }, + { + "epoch": 2.539973643751373, + "grad_norm": 1.8813176155090332, + "learning_rate": 5e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7418971657752991, + "num_tokens": 598695582.0, + "step": 23129 + }, + { + "epoch": 2.5400834614539862, + "grad_norm": 1.991235613822937, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7627586722373962, + "num_tokens": 598724018.0, + "step": 23130 + }, + { + "epoch": 2.5401932791566, + "grad_norm": 1.9999804496765137, + "learning_rate": 5e-06, + "loss": 0.6707, + "mean_token_accuracy": 0.7834205627441406, + "num_tokens": 598750029.0, + "step": 23131 + }, + { + "epoch": 2.5403030968592137, + "grad_norm": 2.1623387336730957, + "learning_rate": 5e-06, + "loss": 0.6587, + "mean_token_accuracy": 0.778913140296936, + "num_tokens": 598772586.0, + "step": 23132 + }, + { + "epoch": 2.5404129145618275, + "grad_norm": 2.210113763809204, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7685623168945312, + "num_tokens": 598795598.0, + "step": 23133 + }, + { + "epoch": 2.5405227322644413, + "grad_norm": 2.182591676712036, + "learning_rate": 5e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7699365615844727, + "num_tokens": 598819758.0, + "step": 23134 + }, + { + "epoch": 2.5406325499670546, + "grad_norm": 2.0970981121063232, + "learning_rate": 5e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7724095582962036, + "num_tokens": 598844528.0, + "step": 23135 + }, + { + "epoch": 2.5407423676696683, + "grad_norm": 2.2918813228607178, + "learning_rate": 5e-06, + "loss": 0.6772, + "mean_token_accuracy": 0.7712961435317993, + "num_tokens": 598868745.0, + "step": 23136 + }, + { + "epoch": 2.540852185372282, + "grad_norm": 2.1828739643096924, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.754042387008667, + "num_tokens": 598895717.0, + "step": 23137 + }, + { + "epoch": 2.540962003074896, + "grad_norm": 2.257801055908203, + "learning_rate": 5e-06, + "loss": 0.651, + "mean_token_accuracy": 0.7825661897659302, + "num_tokens": 598919230.0, + "step": 23138 + }, + { + "epoch": 2.5410718207775096, + "grad_norm": 2.2945683002471924, + "learning_rate": 5e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7655690908432007, + "num_tokens": 598940706.0, + "step": 23139 + }, + { + "epoch": 2.541181638480123, + "grad_norm": 2.133572816848755, + "learning_rate": 5e-06, + "loss": 0.7511, + "mean_token_accuracy": 0.7545416355133057, + "num_tokens": 598969120.0, + "step": 23140 + }, + { + "epoch": 2.5412914561827367, + "grad_norm": 2.140831708908081, + "learning_rate": 5e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.7833970785140991, + "num_tokens": 598992032.0, + "step": 23141 + }, + { + "epoch": 2.5414012738853504, + "grad_norm": 2.023306369781494, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7688726186752319, + "num_tokens": 599018971.0, + "step": 23142 + }, + { + "epoch": 2.5415110915879637, + "grad_norm": 1.8949875831604004, + "learning_rate": 5e-06, + "loss": 0.6318, + "mean_token_accuracy": 0.7918300628662109, + "num_tokens": 599047463.0, + "step": 23143 + }, + { + "epoch": 2.5416209092905775, + "grad_norm": 1.8311325311660767, + "learning_rate": 5e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7691167593002319, + "num_tokens": 599078639.0, + "step": 23144 + }, + { + "epoch": 2.5417307269931912, + "grad_norm": 2.356654644012451, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7740408778190613, + "num_tokens": 599098105.0, + "step": 23145 + }, + { + "epoch": 2.541840544695805, + "grad_norm": 2.3151779174804688, + "learning_rate": 5e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7572346329689026, + "num_tokens": 599123753.0, + "step": 23146 + }, + { + "epoch": 2.5419503623984188, + "grad_norm": 1.9446910619735718, + "learning_rate": 5e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7579407095909119, + "num_tokens": 599152305.0, + "step": 23147 + }, + { + "epoch": 2.542060180101032, + "grad_norm": 2.2571613788604736, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.7723758816719055, + "num_tokens": 599174712.0, + "step": 23148 + }, + { + "epoch": 2.542169997803646, + "grad_norm": 1.9809478521347046, + "learning_rate": 5e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7598010301589966, + "num_tokens": 599199226.0, + "step": 23149 + }, + { + "epoch": 2.5422798155062596, + "grad_norm": 2.2560739517211914, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7691437005996704, + "num_tokens": 599222553.0, + "step": 23150 + }, + { + "epoch": 2.5423896332088733, + "grad_norm": 1.7917184829711914, + "learning_rate": 5e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.766150176525116, + "num_tokens": 599257411.0, + "step": 23151 + }, + { + "epoch": 2.542499450911487, + "grad_norm": 2.209689140319824, + "learning_rate": 5e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.755574107170105, + "num_tokens": 599282308.0, + "step": 23152 + }, + { + "epoch": 2.5426092686141004, + "grad_norm": 2.017517566680908, + "learning_rate": 5e-06, + "loss": 0.6656, + "mean_token_accuracy": 0.7834367752075195, + "num_tokens": 599308014.0, + "step": 23153 + }, + { + "epoch": 2.542719086316714, + "grad_norm": 2.0630087852478027, + "learning_rate": 5e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.7650983929634094, + "num_tokens": 599333397.0, + "step": 23154 + }, + { + "epoch": 2.542828904019328, + "grad_norm": 1.9191113710403442, + "learning_rate": 5e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7510043978691101, + "num_tokens": 599361444.0, + "step": 23155 + }, + { + "epoch": 2.5429387217219417, + "grad_norm": 2.187509298324585, + "learning_rate": 5e-06, + "loss": 0.6447, + "mean_token_accuracy": 0.7829256057739258, + "num_tokens": 599385217.0, + "step": 23156 + }, + { + "epoch": 2.5430485394245554, + "grad_norm": 2.050593852996826, + "learning_rate": 5e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7589341402053833, + "num_tokens": 599412176.0, + "step": 23157 + }, + { + "epoch": 2.5431583571271688, + "grad_norm": 2.509310245513916, + "learning_rate": 5e-06, + "loss": 0.7203, + "mean_token_accuracy": 0.7675104141235352, + "num_tokens": 599431355.0, + "step": 23158 + }, + { + "epoch": 2.5432681748297825, + "grad_norm": 2.5769097805023193, + "learning_rate": 5e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.7837684154510498, + "num_tokens": 599449758.0, + "step": 23159 + }, + { + "epoch": 2.5433779925323963, + "grad_norm": 1.995419979095459, + "learning_rate": 5e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7626534104347229, + "num_tokens": 599477531.0, + "step": 23160 + }, + { + "epoch": 2.54348781023501, + "grad_norm": 2.068260908126831, + "learning_rate": 5e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7546092867851257, + "num_tokens": 599506007.0, + "step": 23161 + }, + { + "epoch": 2.543597627937624, + "grad_norm": 2.01802396774292, + "learning_rate": 5e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7458968162536621, + "num_tokens": 599533634.0, + "step": 23162 + }, + { + "epoch": 2.543707445640237, + "grad_norm": 1.984887957572937, + "learning_rate": 5e-06, + "loss": 0.706, + "mean_token_accuracy": 0.766708493232727, + "num_tokens": 599558693.0, + "step": 23163 + }, + { + "epoch": 2.543817263342851, + "grad_norm": 1.974554181098938, + "learning_rate": 5e-06, + "loss": 0.6558, + "mean_token_accuracy": 0.7818228006362915, + "num_tokens": 599584613.0, + "step": 23164 + }, + { + "epoch": 2.5439270810454646, + "grad_norm": 2.1659693717956543, + "learning_rate": 5e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7669876217842102, + "num_tokens": 599607701.0, + "step": 23165 + }, + { + "epoch": 2.544036898748078, + "grad_norm": 2.1400437355041504, + "learning_rate": 5e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7225465774536133, + "num_tokens": 599638348.0, + "step": 23166 + }, + { + "epoch": 2.544146716450692, + "grad_norm": 2.454763412475586, + "learning_rate": 5e-06, + "loss": 0.6711, + "mean_token_accuracy": 0.7769666910171509, + "num_tokens": 599657172.0, + "step": 23167 + }, + { + "epoch": 2.5442565341533054, + "grad_norm": 1.8955652713775635, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7484742403030396, + "num_tokens": 599688880.0, + "step": 23168 + }, + { + "epoch": 2.544366351855919, + "grad_norm": 2.2177860736846924, + "learning_rate": 5e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7608746886253357, + "num_tokens": 599716932.0, + "step": 23169 + }, + { + "epoch": 2.544476169558533, + "grad_norm": 1.9644124507904053, + "learning_rate": 5e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.7833787202835083, + "num_tokens": 599743244.0, + "step": 23170 + }, + { + "epoch": 2.5445859872611463, + "grad_norm": 2.0900065898895264, + "learning_rate": 5e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7764700651168823, + "num_tokens": 599765500.0, + "step": 23171 + }, + { + "epoch": 2.54469580496376, + "grad_norm": 2.1141459941864014, + "learning_rate": 5e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7566182017326355, + "num_tokens": 599791601.0, + "step": 23172 + }, + { + "epoch": 2.544805622666374, + "grad_norm": 2.287652015686035, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.779019832611084, + "num_tokens": 599814477.0, + "step": 23173 + }, + { + "epoch": 2.5449154403689875, + "grad_norm": 2.1936392784118652, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7757801413536072, + "num_tokens": 599839404.0, + "step": 23174 + }, + { + "epoch": 2.5450252580716013, + "grad_norm": 2.0292675495147705, + "learning_rate": 5e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.769335150718689, + "num_tokens": 599867678.0, + "step": 23175 + }, + { + "epoch": 2.5451350757742146, + "grad_norm": 2.3320462703704834, + "learning_rate": 5e-06, + "loss": 0.6683, + "mean_token_accuracy": 0.7769759893417358, + "num_tokens": 599888539.0, + "step": 23176 + }, + { + "epoch": 2.5452448934768284, + "grad_norm": 2.303070545196533, + "learning_rate": 5e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7724237442016602, + "num_tokens": 599909640.0, + "step": 23177 + }, + { + "epoch": 2.545354711179442, + "grad_norm": 2.276951789855957, + "learning_rate": 5e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.7805073261260986, + "num_tokens": 599934142.0, + "step": 23178 + }, + { + "epoch": 2.545464528882056, + "grad_norm": 2.0950796604156494, + "learning_rate": 5e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.7781659960746765, + "num_tokens": 599958006.0, + "step": 23179 + }, + { + "epoch": 2.5455743465846696, + "grad_norm": 2.070220947265625, + "learning_rate": 5e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7462186217308044, + "num_tokens": 599986791.0, + "step": 23180 + }, + { + "epoch": 2.545684164287283, + "grad_norm": 2.106443166732788, + "learning_rate": 5e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7464667558670044, + "num_tokens": 600015505.0, + "step": 23181 + }, + { + "epoch": 2.5457939819898967, + "grad_norm": 2.065300464630127, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7604798078536987, + "num_tokens": 600040986.0, + "step": 23182 + }, + { + "epoch": 2.5459037996925105, + "grad_norm": 2.140935182571411, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7740896940231323, + "num_tokens": 600065076.0, + "step": 23183 + }, + { + "epoch": 2.546013617395124, + "grad_norm": 2.197650671005249, + "learning_rate": 5e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.7744120955467224, + "num_tokens": 600088476.0, + "step": 23184 + }, + { + "epoch": 2.546123435097738, + "grad_norm": 1.8919769525527954, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7746644020080566, + "num_tokens": 600118911.0, + "step": 23185 + }, + { + "epoch": 2.5462332528003513, + "grad_norm": 2.014256715774536, + "learning_rate": 5e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7591097950935364, + "num_tokens": 600147758.0, + "step": 23186 + }, + { + "epoch": 2.546343070502965, + "grad_norm": 2.034147024154663, + "learning_rate": 5e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7639793753623962, + "num_tokens": 600174453.0, + "step": 23187 + }, + { + "epoch": 2.546452888205579, + "grad_norm": 2.1811723709106445, + "learning_rate": 5e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.7680206298828125, + "num_tokens": 600199021.0, + "step": 23188 + }, + { + "epoch": 2.5465627059081926, + "grad_norm": 2.109351634979248, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7469415664672852, + "num_tokens": 600227813.0, + "step": 23189 + }, + { + "epoch": 2.5466725236108063, + "grad_norm": 2.1039061546325684, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7544519901275635, + "num_tokens": 600253720.0, + "step": 23190 + }, + { + "epoch": 2.5467823413134196, + "grad_norm": 2.241791009902954, + "learning_rate": 5e-06, + "loss": 0.6912, + "mean_token_accuracy": 0.7710745930671692, + "num_tokens": 600275186.0, + "step": 23191 + }, + { + "epoch": 2.5468921590160334, + "grad_norm": 1.894811987876892, + "learning_rate": 5e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7633330821990967, + "num_tokens": 600307263.0, + "step": 23192 + }, + { + "epoch": 2.547001976718647, + "grad_norm": 2.036492109298706, + "learning_rate": 5e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7579740285873413, + "num_tokens": 600333266.0, + "step": 23193 + }, + { + "epoch": 2.5471117944212605, + "grad_norm": 1.9933595657348633, + "learning_rate": 5e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7670297622680664, + "num_tokens": 600361717.0, + "step": 23194 + }, + { + "epoch": 2.547221612123874, + "grad_norm": 2.049818992614746, + "learning_rate": 5e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7561947107315063, + "num_tokens": 600390090.0, + "step": 23195 + }, + { + "epoch": 2.547331429826488, + "grad_norm": 1.8242921829223633, + "learning_rate": 5e-06, + "loss": 0.6524, + "mean_token_accuracy": 0.7823001146316528, + "num_tokens": 600419392.0, + "step": 23196 + }, + { + "epoch": 2.5474412475291017, + "grad_norm": 1.8605314493179321, + "learning_rate": 5e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.7514032125473022, + "num_tokens": 600450797.0, + "step": 23197 + }, + { + "epoch": 2.5475510652317155, + "grad_norm": 1.9463742971420288, + "learning_rate": 5e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7670314311981201, + "num_tokens": 600477794.0, + "step": 23198 + }, + { + "epoch": 2.547660882934329, + "grad_norm": 2.2326767444610596, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7640787959098816, + "num_tokens": 600500586.0, + "step": 23199 + }, + { + "epoch": 2.5477707006369426, + "grad_norm": 1.8922315835952759, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7425963878631592, + "num_tokens": 600530667.0, + "step": 23200 + }, + { + "epoch": 2.5478805183395563, + "grad_norm": 2.1805801391601562, + "learning_rate": 5e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7591461539268494, + "num_tokens": 600554148.0, + "step": 23201 + }, + { + "epoch": 2.54799033604217, + "grad_norm": 2.0085318088531494, + "learning_rate": 5e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7457436323165894, + "num_tokens": 600582780.0, + "step": 23202 + }, + { + "epoch": 2.548100153744784, + "grad_norm": 2.3054823875427246, + "learning_rate": 5e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7694061994552612, + "num_tokens": 600603822.0, + "step": 23203 + }, + { + "epoch": 2.548209971447397, + "grad_norm": 2.0678701400756836, + "learning_rate": 5e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7643270492553711, + "num_tokens": 600627384.0, + "step": 23204 + }, + { + "epoch": 2.548319789150011, + "grad_norm": 2.190166711807251, + "learning_rate": 5e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7677110433578491, + "num_tokens": 600649368.0, + "step": 23205 + }, + { + "epoch": 2.5484296068526247, + "grad_norm": 2.1181344985961914, + "learning_rate": 5e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.7602535486221313, + "num_tokens": 600673393.0, + "step": 23206 + }, + { + "epoch": 2.5485394245552384, + "grad_norm": 2.088423013687134, + "learning_rate": 5e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7572603225708008, + "num_tokens": 600698853.0, + "step": 23207 + }, + { + "epoch": 2.548649242257852, + "grad_norm": 2.187481641769409, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7607746124267578, + "num_tokens": 600722726.0, + "step": 23208 + }, + { + "epoch": 2.5487590599604655, + "grad_norm": 1.8740649223327637, + "learning_rate": 5e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7283130884170532, + "num_tokens": 600755567.0, + "step": 23209 + }, + { + "epoch": 2.5488688776630792, + "grad_norm": 2.055210828781128, + "learning_rate": 5e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.767522931098938, + "num_tokens": 600783760.0, + "step": 23210 + }, + { + "epoch": 2.548978695365693, + "grad_norm": 2.36765193939209, + "learning_rate": 5e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7623218297958374, + "num_tokens": 600804872.0, + "step": 23211 + }, + { + "epoch": 2.5490885130683067, + "grad_norm": 2.1380064487457275, + "learning_rate": 5e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7623053789138794, + "num_tokens": 600832030.0, + "step": 23212 + }, + { + "epoch": 2.5491983307709205, + "grad_norm": 2.166893243789673, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.755874514579773, + "num_tokens": 600858536.0, + "step": 23213 + }, + { + "epoch": 2.549308148473534, + "grad_norm": 2.167503833770752, + "learning_rate": 5e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7707343697547913, + "num_tokens": 600886621.0, + "step": 23214 + }, + { + "epoch": 2.5494179661761476, + "grad_norm": 1.8261297941207886, + "learning_rate": 5e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.747410774230957, + "num_tokens": 600922302.0, + "step": 23215 + }, + { + "epoch": 2.5495277838787613, + "grad_norm": 2.056530714035034, + "learning_rate": 5e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.7958155274391174, + "num_tokens": 600945167.0, + "step": 23216 + }, + { + "epoch": 2.5496376015813746, + "grad_norm": 2.0416250228881836, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7747886180877686, + "num_tokens": 600972542.0, + "step": 23217 + }, + { + "epoch": 2.549747419283989, + "grad_norm": 2.100623846054077, + "learning_rate": 5e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7684400081634521, + "num_tokens": 600996254.0, + "step": 23218 + }, + { + "epoch": 2.549857236986602, + "grad_norm": 1.9586708545684814, + "learning_rate": 5e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7643985748291016, + "num_tokens": 601023374.0, + "step": 23219 + }, + { + "epoch": 2.549967054689216, + "grad_norm": 1.8696430921554565, + "learning_rate": 5e-06, + "loss": 0.7501, + "mean_token_accuracy": 0.7524859309196472, + "num_tokens": 601054839.0, + "step": 23220 + }, + { + "epoch": 2.5500768723918297, + "grad_norm": 2.2045955657958984, + "learning_rate": 5e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7507497072219849, + "num_tokens": 601079487.0, + "step": 23221 + }, + { + "epoch": 2.550186690094443, + "grad_norm": 2.2381625175476074, + "learning_rate": 5e-06, + "loss": 0.607, + "mean_token_accuracy": 0.7941068410873413, + "num_tokens": 601100925.0, + "step": 23222 + }, + { + "epoch": 2.5502965077970567, + "grad_norm": 2.202497959136963, + "learning_rate": 5e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7461662292480469, + "num_tokens": 601125092.0, + "step": 23223 + }, + { + "epoch": 2.5504063254996705, + "grad_norm": 2.4679250717163086, + "learning_rate": 5e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.7813148498535156, + "num_tokens": 601143226.0, + "step": 23224 + }, + { + "epoch": 2.5505161432022843, + "grad_norm": 2.202498435974121, + "learning_rate": 5e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.7773622274398804, + "num_tokens": 601165505.0, + "step": 23225 + }, + { + "epoch": 2.550625960904898, + "grad_norm": 2.272649049758911, + "learning_rate": 5e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7870388031005859, + "num_tokens": 601189021.0, + "step": 23226 + }, + { + "epoch": 2.5507357786075113, + "grad_norm": 2.364194393157959, + "learning_rate": 5e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7498258352279663, + "num_tokens": 601211485.0, + "step": 23227 + }, + { + "epoch": 2.550845596310125, + "grad_norm": 1.9467705488204956, + "learning_rate": 5e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7667117714881897, + "num_tokens": 601239039.0, + "step": 23228 + }, + { + "epoch": 2.550955414012739, + "grad_norm": 2.125162124633789, + "learning_rate": 5e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7645480632781982, + "num_tokens": 601265441.0, + "step": 23229 + }, + { + "epoch": 2.5510652317153526, + "grad_norm": 2.2199504375457764, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7574988603591919, + "num_tokens": 601291405.0, + "step": 23230 + }, + { + "epoch": 2.5511750494179664, + "grad_norm": 2.527928352355957, + "learning_rate": 5e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.7742760181427002, + "num_tokens": 601311801.0, + "step": 23231 + }, + { + "epoch": 2.5512848671205797, + "grad_norm": 2.140242099761963, + "learning_rate": 5e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7499991655349731, + "num_tokens": 601337837.0, + "step": 23232 + }, + { + "epoch": 2.5513946848231934, + "grad_norm": 2.0710089206695557, + "learning_rate": 5e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7709093689918518, + "num_tokens": 601363903.0, + "step": 23233 + }, + { + "epoch": 2.551504502525807, + "grad_norm": 2.2262260913848877, + "learning_rate": 5e-06, + "loss": 0.6798, + "mean_token_accuracy": 0.7772120237350464, + "num_tokens": 601387540.0, + "step": 23234 + }, + { + "epoch": 2.551614320228421, + "grad_norm": 2.2908124923706055, + "learning_rate": 5e-06, + "loss": 0.6361, + "mean_token_accuracy": 0.7856390476226807, + "num_tokens": 601409704.0, + "step": 23235 + }, + { + "epoch": 2.5517241379310347, + "grad_norm": 2.2659964561462402, + "learning_rate": 5e-06, + "loss": 0.6444, + "mean_token_accuracy": 0.7921635508537292, + "num_tokens": 601430617.0, + "step": 23236 + }, + { + "epoch": 2.551833955633648, + "grad_norm": 1.944773554801941, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7730709910392761, + "num_tokens": 601457608.0, + "step": 23237 + }, + { + "epoch": 2.5519437733362618, + "grad_norm": 2.0421624183654785, + "learning_rate": 5e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7566778063774109, + "num_tokens": 601482913.0, + "step": 23238 + }, + { + "epoch": 2.5520535910388755, + "grad_norm": 2.3602294921875, + "learning_rate": 5e-06, + "loss": 0.6474, + "mean_token_accuracy": 0.7806397080421448, + "num_tokens": 601504214.0, + "step": 23239 + }, + { + "epoch": 2.5521634087414893, + "grad_norm": 2.388899087905884, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7684288024902344, + "num_tokens": 601526633.0, + "step": 23240 + }, + { + "epoch": 2.552273226444103, + "grad_norm": 1.8810081481933594, + "learning_rate": 5e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7513850927352905, + "num_tokens": 601557290.0, + "step": 23241 + }, + { + "epoch": 2.5523830441467164, + "grad_norm": 1.889643907546997, + "learning_rate": 5e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7568690776824951, + "num_tokens": 601586344.0, + "step": 23242 + }, + { + "epoch": 2.55249286184933, + "grad_norm": 2.0523715019226074, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7570526599884033, + "num_tokens": 601614446.0, + "step": 23243 + }, + { + "epoch": 2.552602679551944, + "grad_norm": 2.142965793609619, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7573675513267517, + "num_tokens": 601639432.0, + "step": 23244 + }, + { + "epoch": 2.552712497254557, + "grad_norm": 1.9657623767852783, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.753909170627594, + "num_tokens": 601668285.0, + "step": 23245 + }, + { + "epoch": 2.552822314957171, + "grad_norm": 2.0855934619903564, + "learning_rate": 5e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7360982298851013, + "num_tokens": 601695367.0, + "step": 23246 + }, + { + "epoch": 2.5529321326597847, + "grad_norm": 2.2262377738952637, + "learning_rate": 5e-06, + "loss": 0.746, + "mean_token_accuracy": 0.757811427116394, + "num_tokens": 601717554.0, + "step": 23247 + }, + { + "epoch": 2.5530419503623984, + "grad_norm": 2.0581114292144775, + "learning_rate": 5e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.783942699432373, + "num_tokens": 601740308.0, + "step": 23248 + }, + { + "epoch": 2.553151768065012, + "grad_norm": 2.155339479446411, + "learning_rate": 5e-06, + "loss": 0.6985, + "mean_token_accuracy": 0.7702492475509644, + "num_tokens": 601763859.0, + "step": 23249 + }, + { + "epoch": 2.5532615857676255, + "grad_norm": 2.036835193634033, + "learning_rate": 5e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7628997564315796, + "num_tokens": 601792516.0, + "step": 23250 + }, + { + "epoch": 2.5533714034702393, + "grad_norm": 1.9594783782958984, + "learning_rate": 5e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7645368576049805, + "num_tokens": 601821503.0, + "step": 23251 + }, + { + "epoch": 2.553481221172853, + "grad_norm": 2.2594656944274902, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7792339324951172, + "num_tokens": 601842122.0, + "step": 23252 + }, + { + "epoch": 2.553591038875467, + "grad_norm": 2.192023754119873, + "learning_rate": 5e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7774209976196289, + "num_tokens": 601864068.0, + "step": 23253 + }, + { + "epoch": 2.5537008565780805, + "grad_norm": 1.845960259437561, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7644484043121338, + "num_tokens": 601892813.0, + "step": 23254 + }, + { + "epoch": 2.553810674280694, + "grad_norm": 1.9863557815551758, + "learning_rate": 5e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7441409230232239, + "num_tokens": 601923259.0, + "step": 23255 + }, + { + "epoch": 2.5539204919833076, + "grad_norm": 2.147792100906372, + "learning_rate": 5e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.773848295211792, + "num_tokens": 601946985.0, + "step": 23256 + }, + { + "epoch": 2.5540303096859214, + "grad_norm": 2.4236204624176025, + "learning_rate": 5e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.7706433534622192, + "num_tokens": 601969100.0, + "step": 23257 + }, + { + "epoch": 2.554140127388535, + "grad_norm": 2.0756099224090576, + "learning_rate": 5e-06, + "loss": 0.7291, + "mean_token_accuracy": 0.7561050057411194, + "num_tokens": 601995527.0, + "step": 23258 + }, + { + "epoch": 2.554249945091149, + "grad_norm": 2.2508866786956787, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7627381086349487, + "num_tokens": 602017849.0, + "step": 23259 + }, + { + "epoch": 2.554359762793762, + "grad_norm": 2.2625138759613037, + "learning_rate": 5e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.745890736579895, + "num_tokens": 602041463.0, + "step": 23260 + }, + { + "epoch": 2.554469580496376, + "grad_norm": 1.7764936685562134, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7593172788619995, + "num_tokens": 602076086.0, + "step": 23261 + }, + { + "epoch": 2.5545793981989897, + "grad_norm": 1.8707613945007324, + "learning_rate": 5e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7510351538658142, + "num_tokens": 602107410.0, + "step": 23262 + }, + { + "epoch": 2.5546892159016035, + "grad_norm": 1.993558406829834, + "learning_rate": 5e-06, + "loss": 0.6684, + "mean_token_accuracy": 0.7808624505996704, + "num_tokens": 602134810.0, + "step": 23263 + }, + { + "epoch": 2.5547990336042172, + "grad_norm": 2.167701005935669, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7708761692047119, + "num_tokens": 602160765.0, + "step": 23264 + }, + { + "epoch": 2.5549088513068305, + "grad_norm": 2.1244513988494873, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7611676454544067, + "num_tokens": 602186413.0, + "step": 23265 + }, + { + "epoch": 2.5550186690094443, + "grad_norm": 2.127797842025757, + "learning_rate": 5e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7679083347320557, + "num_tokens": 602213465.0, + "step": 23266 + }, + { + "epoch": 2.555128486712058, + "grad_norm": 1.7747225761413574, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7426444888114929, + "num_tokens": 602248603.0, + "step": 23267 + }, + { + "epoch": 2.5552383044146714, + "grad_norm": 2.0212152004241943, + "learning_rate": 5e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7589051127433777, + "num_tokens": 602276682.0, + "step": 23268 + }, + { + "epoch": 2.5553481221172856, + "grad_norm": 2.136826753616333, + "learning_rate": 5e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7564325332641602, + "num_tokens": 602302563.0, + "step": 23269 + }, + { + "epoch": 2.555457939819899, + "grad_norm": 1.8779232501983643, + "learning_rate": 5e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.7756969928741455, + "num_tokens": 602332222.0, + "step": 23270 + }, + { + "epoch": 2.5555677575225126, + "grad_norm": 1.956371784210205, + "learning_rate": 5e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7593258619308472, + "num_tokens": 602359318.0, + "step": 23271 + }, + { + "epoch": 2.5556775752251264, + "grad_norm": 2.2490317821502686, + "learning_rate": 5e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7523048520088196, + "num_tokens": 602383879.0, + "step": 23272 + }, + { + "epoch": 2.5557873929277397, + "grad_norm": 2.1463961601257324, + "learning_rate": 5e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.7820982336997986, + "num_tokens": 602408436.0, + "step": 23273 + }, + { + "epoch": 2.5558972106303535, + "grad_norm": 2.218003749847412, + "learning_rate": 5e-06, + "loss": 0.6621, + "mean_token_accuracy": 0.7782398462295532, + "num_tokens": 602430447.0, + "step": 23274 + }, + { + "epoch": 2.5560070283329672, + "grad_norm": 2.259800434112549, + "learning_rate": 5e-06, + "loss": 0.7809, + "mean_token_accuracy": 0.7474571466445923, + "num_tokens": 602453094.0, + "step": 23275 + }, + { + "epoch": 2.556116846035581, + "grad_norm": 1.9788801670074463, + "learning_rate": 5e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.7823237180709839, + "num_tokens": 602478686.0, + "step": 23276 + }, + { + "epoch": 2.5562266637381947, + "grad_norm": 2.1298885345458984, + "learning_rate": 5e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7645065784454346, + "num_tokens": 602505618.0, + "step": 23277 + }, + { + "epoch": 2.556336481440808, + "grad_norm": 1.760985255241394, + "learning_rate": 5e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7722798585891724, + "num_tokens": 602538478.0, + "step": 23278 + }, + { + "epoch": 2.556446299143422, + "grad_norm": 2.226261854171753, + "learning_rate": 5e-06, + "loss": 0.6198, + "mean_token_accuracy": 0.7849836349487305, + "num_tokens": 602561667.0, + "step": 23279 + }, + { + "epoch": 2.5565561168460356, + "grad_norm": 2.0201363563537598, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7534132599830627, + "num_tokens": 602589211.0, + "step": 23280 + }, + { + "epoch": 2.5566659345486493, + "grad_norm": 2.072453260421753, + "learning_rate": 5e-06, + "loss": 0.687, + "mean_token_accuracy": 0.7689307928085327, + "num_tokens": 602615947.0, + "step": 23281 + }, + { + "epoch": 2.556775752251263, + "grad_norm": 2.1104366779327393, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7747848629951477, + "num_tokens": 602642744.0, + "step": 23282 + }, + { + "epoch": 2.5568855699538764, + "grad_norm": 2.16540265083313, + "learning_rate": 5e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7649346590042114, + "num_tokens": 602668454.0, + "step": 23283 + }, + { + "epoch": 2.55699538765649, + "grad_norm": 2.245252847671509, + "learning_rate": 5e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.764994740486145, + "num_tokens": 602691436.0, + "step": 23284 + }, + { + "epoch": 2.557105205359104, + "grad_norm": 1.9950817823410034, + "learning_rate": 5e-06, + "loss": 0.6968, + "mean_token_accuracy": 0.767233669757843, + "num_tokens": 602720668.0, + "step": 23285 + }, + { + "epoch": 2.5572150230617177, + "grad_norm": 1.8990575075149536, + "learning_rate": 5e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.773817241191864, + "num_tokens": 602750185.0, + "step": 23286 + }, + { + "epoch": 2.5573248407643314, + "grad_norm": 2.1976640224456787, + "learning_rate": 5e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7766509652137756, + "num_tokens": 602772560.0, + "step": 23287 + }, + { + "epoch": 2.5574346584669447, + "grad_norm": 2.114314556121826, + "learning_rate": 5e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7656548619270325, + "num_tokens": 602795709.0, + "step": 23288 + }, + { + "epoch": 2.5575444761695585, + "grad_norm": 2.3483312129974365, + "learning_rate": 5e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7773356437683105, + "num_tokens": 602817435.0, + "step": 23289 + }, + { + "epoch": 2.5576542938721722, + "grad_norm": 2.157170057296753, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7632980942726135, + "num_tokens": 602842684.0, + "step": 23290 + }, + { + "epoch": 2.557764111574786, + "grad_norm": 2.042126178741455, + "learning_rate": 5e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7469915151596069, + "num_tokens": 602870925.0, + "step": 23291 + }, + { + "epoch": 2.5578739292773998, + "grad_norm": 2.426835060119629, + "learning_rate": 5e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7703388929367065, + "num_tokens": 602890057.0, + "step": 23292 + }, + { + "epoch": 2.557983746980013, + "grad_norm": 2.0565743446350098, + "learning_rate": 5e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7675479650497437, + "num_tokens": 602913809.0, + "step": 23293 + }, + { + "epoch": 2.558093564682627, + "grad_norm": 2.1820363998413086, + "learning_rate": 5e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.7861143946647644, + "num_tokens": 602939561.0, + "step": 23294 + }, + { + "epoch": 2.5582033823852406, + "grad_norm": 1.989759087562561, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7635315656661987, + "num_tokens": 602968876.0, + "step": 23295 + }, + { + "epoch": 2.558313200087854, + "grad_norm": 2.3842670917510986, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7700965404510498, + "num_tokens": 602990373.0, + "step": 23296 + }, + { + "epoch": 2.5584230177904677, + "grad_norm": 2.045311212539673, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.765278697013855, + "num_tokens": 603014477.0, + "step": 23297 + }, + { + "epoch": 2.5585328354930814, + "grad_norm": 2.2902634143829346, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.769040584564209, + "num_tokens": 603036227.0, + "step": 23298 + }, + { + "epoch": 2.558642653195695, + "grad_norm": 2.3378448486328125, + "learning_rate": 5e-06, + "loss": 0.6083, + "mean_token_accuracy": 0.7924400568008423, + "num_tokens": 603056469.0, + "step": 23299 + }, + { + "epoch": 2.558752470898309, + "grad_norm": 2.153470039367676, + "learning_rate": 5e-06, + "loss": 0.6682, + "mean_token_accuracy": 0.7788881063461304, + "num_tokens": 603078350.0, + "step": 23300 + }, + { + "epoch": 2.5588622886009222, + "grad_norm": 2.2202301025390625, + "learning_rate": 5e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7557533979415894, + "num_tokens": 603104108.0, + "step": 23301 + }, + { + "epoch": 2.558972106303536, + "grad_norm": 2.0896198749542236, + "learning_rate": 5e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7622291445732117, + "num_tokens": 603129350.0, + "step": 23302 + }, + { + "epoch": 2.5590819240061498, + "grad_norm": 2.069261312484741, + "learning_rate": 5e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7581205368041992, + "num_tokens": 603155126.0, + "step": 23303 + }, + { + "epoch": 2.5591917417087635, + "grad_norm": 2.359607458114624, + "learning_rate": 5e-06, + "loss": 0.6427, + "mean_token_accuracy": 0.7855613827705383, + "num_tokens": 603175070.0, + "step": 23304 + }, + { + "epoch": 2.5593015594113773, + "grad_norm": 1.9691870212554932, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7529500722885132, + "num_tokens": 603204318.0, + "step": 23305 + }, + { + "epoch": 2.5594113771139906, + "grad_norm": 1.8286559581756592, + "learning_rate": 5e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.7873215079307556, + "num_tokens": 603237991.0, + "step": 23306 + }, + { + "epoch": 2.5595211948166043, + "grad_norm": 2.042494535446167, + "learning_rate": 5e-06, + "loss": 0.6507, + "mean_token_accuracy": 0.7744522094726562, + "num_tokens": 603262024.0, + "step": 23307 + }, + { + "epoch": 2.559631012519218, + "grad_norm": 2.2028348445892334, + "learning_rate": 5e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.7976808547973633, + "num_tokens": 603283652.0, + "step": 23308 + }, + { + "epoch": 2.559740830221832, + "grad_norm": 2.1686275005340576, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7592394351959229, + "num_tokens": 603308376.0, + "step": 23309 + }, + { + "epoch": 2.5598506479244456, + "grad_norm": 2.197221517562866, + "learning_rate": 5e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7540948987007141, + "num_tokens": 603332830.0, + "step": 23310 + }, + { + "epoch": 2.559960465627059, + "grad_norm": 2.045172691345215, + "learning_rate": 5e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7356289029121399, + "num_tokens": 603362956.0, + "step": 23311 + }, + { + "epoch": 2.5600702833296727, + "grad_norm": 1.9824122190475464, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7729630470275879, + "num_tokens": 603390008.0, + "step": 23312 + }, + { + "epoch": 2.5601801010322864, + "grad_norm": 2.0853753089904785, + "learning_rate": 5e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.7830272316932678, + "num_tokens": 603414285.0, + "step": 23313 + }, + { + "epoch": 2.5602899187349, + "grad_norm": 2.143148899078369, + "learning_rate": 5e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7591663002967834, + "num_tokens": 603437938.0, + "step": 23314 + }, + { + "epoch": 2.560399736437514, + "grad_norm": 2.0326688289642334, + "learning_rate": 5e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7741091847419739, + "num_tokens": 603466710.0, + "step": 23315 + }, + { + "epoch": 2.5605095541401273, + "grad_norm": 2.0302083492279053, + "learning_rate": 5e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8016892671585083, + "num_tokens": 603490331.0, + "step": 23316 + }, + { + "epoch": 2.560619371842741, + "grad_norm": 1.9823328256607056, + "learning_rate": 5e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7536320686340332, + "num_tokens": 603518261.0, + "step": 23317 + }, + { + "epoch": 2.560729189545355, + "grad_norm": 2.0495264530181885, + "learning_rate": 5e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7451764345169067, + "num_tokens": 603549144.0, + "step": 23318 + }, + { + "epoch": 2.5608390072479685, + "grad_norm": 2.2757246494293213, + "learning_rate": 5e-06, + "loss": 0.6897, + "mean_token_accuracy": 0.7644155025482178, + "num_tokens": 603570462.0, + "step": 23319 + }, + { + "epoch": 2.5609488249505823, + "grad_norm": 2.2828450202941895, + "learning_rate": 5e-06, + "loss": 0.6852, + "mean_token_accuracy": 0.7765842080116272, + "num_tokens": 603591806.0, + "step": 23320 + }, + { + "epoch": 2.5610586426531956, + "grad_norm": 2.085212230682373, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7552235126495361, + "num_tokens": 603617225.0, + "step": 23321 + }, + { + "epoch": 2.5611684603558094, + "grad_norm": 2.295409679412842, + "learning_rate": 5e-06, + "loss": 0.6406, + "mean_token_accuracy": 0.78788822889328, + "num_tokens": 603638117.0, + "step": 23322 + }, + { + "epoch": 2.561278278058423, + "grad_norm": 1.964035153388977, + "learning_rate": 5e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7723191976547241, + "num_tokens": 603665961.0, + "step": 23323 + }, + { + "epoch": 2.5613880957610364, + "grad_norm": 2.0512025356292725, + "learning_rate": 5e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7508238554000854, + "num_tokens": 603693271.0, + "step": 23324 + }, + { + "epoch": 2.56149791346365, + "grad_norm": 2.109689474105835, + "learning_rate": 5e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7617056369781494, + "num_tokens": 603718530.0, + "step": 23325 + }, + { + "epoch": 2.561607731166264, + "grad_norm": 2.019538640975952, + "learning_rate": 5e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7663068771362305, + "num_tokens": 603745011.0, + "step": 23326 + }, + { + "epoch": 2.5617175488688777, + "grad_norm": 1.8520333766937256, + "learning_rate": 5e-06, + "loss": 0.7065, + "mean_token_accuracy": 0.7686932682991028, + "num_tokens": 603773300.0, + "step": 23327 + }, + { + "epoch": 2.5618273665714915, + "grad_norm": 1.94528067111969, + "learning_rate": 5e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7520145177841187, + "num_tokens": 603802286.0, + "step": 23328 + }, + { + "epoch": 2.5619371842741048, + "grad_norm": 2.2063982486724854, + "learning_rate": 5e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.7891802787780762, + "num_tokens": 603824000.0, + "step": 23329 + }, + { + "epoch": 2.5620470019767185, + "grad_norm": 2.4703707695007324, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7769151926040649, + "num_tokens": 603843612.0, + "step": 23330 + }, + { + "epoch": 2.5621568196793323, + "grad_norm": 2.2580716609954834, + "learning_rate": 5e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7722895741462708, + "num_tokens": 603864252.0, + "step": 23331 + }, + { + "epoch": 2.562266637381946, + "grad_norm": 1.9648617506027222, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7599542140960693, + "num_tokens": 603896442.0, + "step": 23332 + }, + { + "epoch": 2.56237645508456, + "grad_norm": 2.533928155899048, + "learning_rate": 5e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7697525024414062, + "num_tokens": 603915691.0, + "step": 23333 + }, + { + "epoch": 2.562486272787173, + "grad_norm": 2.2747223377227783, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7761303186416626, + "num_tokens": 603937434.0, + "step": 23334 + }, + { + "epoch": 2.562596090489787, + "grad_norm": 1.8496365547180176, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7573369741439819, + "num_tokens": 603971758.0, + "step": 23335 + }, + { + "epoch": 2.5627059081924006, + "grad_norm": 1.9685291051864624, + "learning_rate": 5e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7485350370407104, + "num_tokens": 603999659.0, + "step": 23336 + }, + { + "epoch": 2.5628157258950144, + "grad_norm": 2.1677603721618652, + "learning_rate": 5e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7497357130050659, + "num_tokens": 604025222.0, + "step": 23337 + }, + { + "epoch": 2.562925543597628, + "grad_norm": 2.0048742294311523, + "learning_rate": 5e-06, + "loss": 0.6777, + "mean_token_accuracy": 0.7725523114204407, + "num_tokens": 604052691.0, + "step": 23338 + }, + { + "epoch": 2.5630353613002415, + "grad_norm": 2.0693817138671875, + "learning_rate": 5e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.778084397315979, + "num_tokens": 604076654.0, + "step": 23339 + }, + { + "epoch": 2.563145179002855, + "grad_norm": 2.004772186279297, + "learning_rate": 5e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7643094062805176, + "num_tokens": 604103487.0, + "step": 23340 + }, + { + "epoch": 2.563254996705469, + "grad_norm": 1.8667466640472412, + "learning_rate": 5e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7725028991699219, + "num_tokens": 604134052.0, + "step": 23341 + }, + { + "epoch": 2.5633648144080827, + "grad_norm": 2.145109176635742, + "learning_rate": 5e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.764431357383728, + "num_tokens": 604159598.0, + "step": 23342 + }, + { + "epoch": 2.5634746321106965, + "grad_norm": 2.478057384490967, + "learning_rate": 5e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.7874630689620972, + "num_tokens": 604177860.0, + "step": 23343 + }, + { + "epoch": 2.56358444981331, + "grad_norm": 2.4661054611206055, + "learning_rate": 5e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7610196471214294, + "num_tokens": 604198857.0, + "step": 23344 + }, + { + "epoch": 2.5636942675159236, + "grad_norm": 2.0344183444976807, + "learning_rate": 5e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7738083600997925, + "num_tokens": 604223815.0, + "step": 23345 + }, + { + "epoch": 2.5638040852185373, + "grad_norm": 1.9774088859558105, + "learning_rate": 5e-06, + "loss": 0.6733, + "mean_token_accuracy": 0.7764188051223755, + "num_tokens": 604254254.0, + "step": 23346 + }, + { + "epoch": 2.5639139029211506, + "grad_norm": 2.167351484298706, + "learning_rate": 5e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.7780799269676208, + "num_tokens": 604279906.0, + "step": 23347 + }, + { + "epoch": 2.564023720623765, + "grad_norm": 2.0706136226654053, + "learning_rate": 5e-06, + "loss": 0.6777, + "mean_token_accuracy": 0.7753556966781616, + "num_tokens": 604306806.0, + "step": 23348 + }, + { + "epoch": 2.564133538326378, + "grad_norm": 2.101655960083008, + "learning_rate": 5e-06, + "loss": 0.6959, + "mean_token_accuracy": 0.7765848636627197, + "num_tokens": 604332842.0, + "step": 23349 + }, + { + "epoch": 2.564243356028992, + "grad_norm": 2.038306713104248, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7513272166252136, + "num_tokens": 604362294.0, + "step": 23350 + }, + { + "epoch": 2.5643531737316057, + "grad_norm": 1.9657962322235107, + "learning_rate": 5e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.7669976949691772, + "num_tokens": 604390856.0, + "step": 23351 + }, + { + "epoch": 2.564462991434219, + "grad_norm": 2.1368229389190674, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.762525200843811, + "num_tokens": 604414844.0, + "step": 23352 + }, + { + "epoch": 2.5645728091368327, + "grad_norm": 2.3852319717407227, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7630399465560913, + "num_tokens": 604435949.0, + "step": 23353 + }, + { + "epoch": 2.5646826268394465, + "grad_norm": 2.258431911468506, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7605737447738647, + "num_tokens": 604461310.0, + "step": 23354 + }, + { + "epoch": 2.5647924445420602, + "grad_norm": 2.136322021484375, + "learning_rate": 5e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.757226824760437, + "num_tokens": 604485554.0, + "step": 23355 + }, + { + "epoch": 2.564902262244674, + "grad_norm": 2.036952257156372, + "learning_rate": 5e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7533327341079712, + "num_tokens": 604516283.0, + "step": 23356 + }, + { + "epoch": 2.5650120799472873, + "grad_norm": 2.1811656951904297, + "learning_rate": 5e-06, + "loss": 0.624, + "mean_token_accuracy": 0.7899760007858276, + "num_tokens": 604539761.0, + "step": 23357 + }, + { + "epoch": 2.565121897649901, + "grad_norm": 2.0973823070526123, + "learning_rate": 5e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7686150074005127, + "num_tokens": 604563803.0, + "step": 23358 + }, + { + "epoch": 2.565231715352515, + "grad_norm": 2.1590726375579834, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7627933621406555, + "num_tokens": 604586194.0, + "step": 23359 + }, + { + "epoch": 2.5653415330551286, + "grad_norm": 2.166701078414917, + "learning_rate": 5e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.776144802570343, + "num_tokens": 604609959.0, + "step": 23360 + }, + { + "epoch": 2.5654513507577423, + "grad_norm": 2.156127691268921, + "learning_rate": 5e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.7906635999679565, + "num_tokens": 604631898.0, + "step": 23361 + }, + { + "epoch": 2.5655611684603556, + "grad_norm": 2.3524622917175293, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7717710733413696, + "num_tokens": 604656540.0, + "step": 23362 + }, + { + "epoch": 2.5656709861629694, + "grad_norm": 1.822403907775879, + "learning_rate": 5e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7534791827201843, + "num_tokens": 604689380.0, + "step": 23363 + }, + { + "epoch": 2.565780803865583, + "grad_norm": 2.1645634174346924, + "learning_rate": 5e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7579470872879028, + "num_tokens": 604717980.0, + "step": 23364 + }, + { + "epoch": 2.565890621568197, + "grad_norm": 2.1914710998535156, + "learning_rate": 5e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7540460824966431, + "num_tokens": 604741505.0, + "step": 23365 + }, + { + "epoch": 2.5660004392708107, + "grad_norm": 1.9090452194213867, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7424211502075195, + "num_tokens": 604770362.0, + "step": 23366 + }, + { + "epoch": 2.566110256973424, + "grad_norm": 2.0343778133392334, + "learning_rate": 5e-06, + "loss": 0.6754, + "mean_token_accuracy": 0.7799888253211975, + "num_tokens": 604795156.0, + "step": 23367 + }, + { + "epoch": 2.5662200746760377, + "grad_norm": 1.9255536794662476, + "learning_rate": 5e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.752139687538147, + "num_tokens": 604823432.0, + "step": 23368 + }, + { + "epoch": 2.5663298923786515, + "grad_norm": 2.3350136280059814, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7749576568603516, + "num_tokens": 604844648.0, + "step": 23369 + }, + { + "epoch": 2.5664397100812653, + "grad_norm": 2.260899305343628, + "learning_rate": 5e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7716437578201294, + "num_tokens": 604867629.0, + "step": 23370 + }, + { + "epoch": 2.566549527783879, + "grad_norm": 1.924067735671997, + "learning_rate": 5e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7661831378936768, + "num_tokens": 604896531.0, + "step": 23371 + }, + { + "epoch": 2.5666593454864923, + "grad_norm": 1.9861587285995483, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7453576326370239, + "num_tokens": 604924546.0, + "step": 23372 + }, + { + "epoch": 2.566769163189106, + "grad_norm": 2.1951184272766113, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7602075934410095, + "num_tokens": 604946651.0, + "step": 23373 + }, + { + "epoch": 2.56687898089172, + "grad_norm": 1.9969350099563599, + "learning_rate": 5e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7573889493942261, + "num_tokens": 604973806.0, + "step": 23374 + }, + { + "epoch": 2.566988798594333, + "grad_norm": 2.0059943199157715, + "learning_rate": 5e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7578148245811462, + "num_tokens": 605000592.0, + "step": 23375 + }, + { + "epoch": 2.567098616296947, + "grad_norm": 2.2426674365997314, + "learning_rate": 5e-06, + "loss": 0.6922, + "mean_token_accuracy": 0.7700722217559814, + "num_tokens": 605021593.0, + "step": 23376 + }, + { + "epoch": 2.5672084339995607, + "grad_norm": 2.1164512634277344, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7418599128723145, + "num_tokens": 605050062.0, + "step": 23377 + }, + { + "epoch": 2.5673182517021744, + "grad_norm": 1.913926601409912, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.756320595741272, + "num_tokens": 605081969.0, + "step": 23378 + }, + { + "epoch": 2.567428069404788, + "grad_norm": 2.5359745025634766, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7738645076751709, + "num_tokens": 605100994.0, + "step": 23379 + }, + { + "epoch": 2.5675378871074015, + "grad_norm": 1.8886300325393677, + "learning_rate": 5e-06, + "loss": 0.6981, + "mean_token_accuracy": 0.765681803226471, + "num_tokens": 605130793.0, + "step": 23380 + }, + { + "epoch": 2.5676477048100153, + "grad_norm": 2.3456153869628906, + "learning_rate": 5e-06, + "loss": 0.6197, + "mean_token_accuracy": 0.7960606813430786, + "num_tokens": 605151304.0, + "step": 23381 + }, + { + "epoch": 2.567757522512629, + "grad_norm": 2.219151735305786, + "learning_rate": 5e-06, + "loss": 0.6624, + "mean_token_accuracy": 0.7759395837783813, + "num_tokens": 605173613.0, + "step": 23382 + }, + { + "epoch": 2.5678673402152428, + "grad_norm": 1.9562981128692627, + "learning_rate": 5e-06, + "loss": 0.7303, + "mean_token_accuracy": 0.762052595615387, + "num_tokens": 605203815.0, + "step": 23383 + }, + { + "epoch": 2.5679771579178565, + "grad_norm": 2.1265130043029785, + "learning_rate": 5e-06, + "loss": 0.656, + "mean_token_accuracy": 0.7772384285926819, + "num_tokens": 605228683.0, + "step": 23384 + }, + { + "epoch": 2.56808697562047, + "grad_norm": 2.301722764968872, + "learning_rate": 5e-06, + "loss": 0.6541, + "mean_token_accuracy": 0.7810823321342468, + "num_tokens": 605249166.0, + "step": 23385 + }, + { + "epoch": 2.5681967933230836, + "grad_norm": 1.9044643640518188, + "learning_rate": 5e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7763041257858276, + "num_tokens": 605274518.0, + "step": 23386 + }, + { + "epoch": 2.5683066110256974, + "grad_norm": 2.1387510299682617, + "learning_rate": 5e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7646627426147461, + "num_tokens": 605298965.0, + "step": 23387 + }, + { + "epoch": 2.568416428728311, + "grad_norm": 1.9790047407150269, + "learning_rate": 5e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7581465840339661, + "num_tokens": 605327204.0, + "step": 23388 + }, + { + "epoch": 2.568526246430925, + "grad_norm": 1.9326064586639404, + "learning_rate": 5e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7726759910583496, + "num_tokens": 605355286.0, + "step": 23389 + }, + { + "epoch": 2.568636064133538, + "grad_norm": 2.188502311706543, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7626366019248962, + "num_tokens": 605380285.0, + "step": 23390 + }, + { + "epoch": 2.568745881836152, + "grad_norm": 1.998291254043579, + "learning_rate": 5e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7715603113174438, + "num_tokens": 605407968.0, + "step": 23391 + }, + { + "epoch": 2.5688556995387657, + "grad_norm": 2.020580768585205, + "learning_rate": 5e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7588882446289062, + "num_tokens": 605434981.0, + "step": 23392 + }, + { + "epoch": 2.5689655172413794, + "grad_norm": 1.9633928537368774, + "learning_rate": 5e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7568855881690979, + "num_tokens": 605462517.0, + "step": 23393 + }, + { + "epoch": 2.569075334943993, + "grad_norm": 2.133776903152466, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7494815587997437, + "num_tokens": 605487715.0, + "step": 23394 + }, + { + "epoch": 2.5691851526466065, + "grad_norm": 1.9064995050430298, + "learning_rate": 5e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.758948802947998, + "num_tokens": 605518411.0, + "step": 23395 + }, + { + "epoch": 2.5692949703492203, + "grad_norm": 1.963703989982605, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7469456195831299, + "num_tokens": 605547880.0, + "step": 23396 + }, + { + "epoch": 2.569404788051834, + "grad_norm": 2.380519390106201, + "learning_rate": 5e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.780234694480896, + "num_tokens": 605568373.0, + "step": 23397 + }, + { + "epoch": 2.5695146057544473, + "grad_norm": 2.178057909011841, + "learning_rate": 5e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7745531797409058, + "num_tokens": 605591056.0, + "step": 23398 + }, + { + "epoch": 2.5696244234570615, + "grad_norm": 2.0276126861572266, + "learning_rate": 5e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7531063556671143, + "num_tokens": 605615889.0, + "step": 23399 + }, + { + "epoch": 2.569734241159675, + "grad_norm": 2.0698604583740234, + "learning_rate": 5e-06, + "loss": 0.6672, + "mean_token_accuracy": 0.7808127403259277, + "num_tokens": 605640346.0, + "step": 23400 + }, + { + "epoch": 2.5698440588622886, + "grad_norm": 2.1474783420562744, + "learning_rate": 5e-06, + "loss": 0.6756, + "mean_token_accuracy": 0.776203453540802, + "num_tokens": 605665289.0, + "step": 23401 + }, + { + "epoch": 2.5699538765649024, + "grad_norm": 2.3433001041412354, + "learning_rate": 5e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7607283592224121, + "num_tokens": 605686008.0, + "step": 23402 + }, + { + "epoch": 2.5700636942675157, + "grad_norm": 2.0452568531036377, + "learning_rate": 5e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7711083889007568, + "num_tokens": 605710360.0, + "step": 23403 + }, + { + "epoch": 2.5701735119701294, + "grad_norm": 1.89584481716156, + "learning_rate": 5e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7592471837997437, + "num_tokens": 605740468.0, + "step": 23404 + }, + { + "epoch": 2.570283329672743, + "grad_norm": 2.0377211570739746, + "learning_rate": 5e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7733803987503052, + "num_tokens": 605765099.0, + "step": 23405 + }, + { + "epoch": 2.570393147375357, + "grad_norm": 1.9530447721481323, + "learning_rate": 5e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7631003856658936, + "num_tokens": 605791058.0, + "step": 23406 + }, + { + "epoch": 2.5705029650779707, + "grad_norm": 2.069366693496704, + "learning_rate": 5e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7622162103652954, + "num_tokens": 605816326.0, + "step": 23407 + }, + { + "epoch": 2.570612782780584, + "grad_norm": 2.594233989715576, + "learning_rate": 5e-06, + "loss": 0.6222, + "mean_token_accuracy": 0.7893490791320801, + "num_tokens": 605834089.0, + "step": 23408 + }, + { + "epoch": 2.570722600483198, + "grad_norm": 2.040804624557495, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7486509084701538, + "num_tokens": 605861640.0, + "step": 23409 + }, + { + "epoch": 2.5708324181858115, + "grad_norm": 2.139141321182251, + "learning_rate": 5e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7483130097389221, + "num_tokens": 605889168.0, + "step": 23410 + }, + { + "epoch": 2.5709422358884253, + "grad_norm": 2.163623094558716, + "learning_rate": 5e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7725960612297058, + "num_tokens": 605915448.0, + "step": 23411 + }, + { + "epoch": 2.571052053591039, + "grad_norm": 1.935568928718567, + "learning_rate": 5e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7498350143432617, + "num_tokens": 605947642.0, + "step": 23412 + }, + { + "epoch": 2.5711618712936524, + "grad_norm": 2.4583616256713867, + "learning_rate": 5e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7624473571777344, + "num_tokens": 605966410.0, + "step": 23413 + }, + { + "epoch": 2.571271688996266, + "grad_norm": 1.8170784711837769, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7592315673828125, + "num_tokens": 605996576.0, + "step": 23414 + }, + { + "epoch": 2.57138150669888, + "grad_norm": 1.965846061706543, + "learning_rate": 5e-06, + "loss": 0.6333, + "mean_token_accuracy": 0.7877668738365173, + "num_tokens": 606023774.0, + "step": 23415 + }, + { + "epoch": 2.5714913244014936, + "grad_norm": 2.097895622253418, + "learning_rate": 5e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7497236728668213, + "num_tokens": 606049969.0, + "step": 23416 + }, + { + "epoch": 2.5716011421041074, + "grad_norm": 2.151247978210449, + "learning_rate": 5e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7615678906440735, + "num_tokens": 606074289.0, + "step": 23417 + }, + { + "epoch": 2.5717109598067207, + "grad_norm": 1.8820586204528809, + "learning_rate": 5e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7713456749916077, + "num_tokens": 606105700.0, + "step": 23418 + }, + { + "epoch": 2.5718207775093345, + "grad_norm": 1.837774634361267, + "learning_rate": 5e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7464446425437927, + "num_tokens": 606137724.0, + "step": 23419 + }, + { + "epoch": 2.5719305952119482, + "grad_norm": 2.1471073627471924, + "learning_rate": 5e-06, + "loss": 0.6436, + "mean_token_accuracy": 0.7871378660202026, + "num_tokens": 606160237.0, + "step": 23420 + }, + { + "epoch": 2.572040412914562, + "grad_norm": 1.8994356393814087, + "learning_rate": 5e-06, + "loss": 0.6126, + "mean_token_accuracy": 0.7944361567497253, + "num_tokens": 606186326.0, + "step": 23421 + }, + { + "epoch": 2.5721502306171757, + "grad_norm": 2.115543842315674, + "learning_rate": 5e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.7722294330596924, + "num_tokens": 606209968.0, + "step": 23422 + }, + { + "epoch": 2.572260048319789, + "grad_norm": 2.255143880844116, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7507667541503906, + "num_tokens": 606234381.0, + "step": 23423 + }, + { + "epoch": 2.572369866022403, + "grad_norm": 1.8767279386520386, + "learning_rate": 5e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7430339455604553, + "num_tokens": 606268519.0, + "step": 23424 + }, + { + "epoch": 2.5724796837250166, + "grad_norm": 1.8187334537506104, + "learning_rate": 5e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7779407501220703, + "num_tokens": 606299647.0, + "step": 23425 + }, + { + "epoch": 2.57258950142763, + "grad_norm": 2.007110118865967, + "learning_rate": 5e-06, + "loss": 0.7164, + "mean_token_accuracy": 0.7710698843002319, + "num_tokens": 606323908.0, + "step": 23426 + }, + { + "epoch": 2.5726993191302436, + "grad_norm": 2.1933791637420654, + "learning_rate": 5e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.7744828462600708, + "num_tokens": 606346596.0, + "step": 23427 + }, + { + "epoch": 2.5728091368328574, + "grad_norm": 1.9751157760620117, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7677712440490723, + "num_tokens": 606373924.0, + "step": 23428 + }, + { + "epoch": 2.572918954535471, + "grad_norm": 2.2476327419281006, + "learning_rate": 5e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7518196702003479, + "num_tokens": 606397375.0, + "step": 23429 + }, + { + "epoch": 2.573028772238085, + "grad_norm": 2.1670777797698975, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7668724060058594, + "num_tokens": 606420504.0, + "step": 23430 + }, + { + "epoch": 2.573138589940698, + "grad_norm": 2.1788148880004883, + "learning_rate": 5e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7688475847244263, + "num_tokens": 606443548.0, + "step": 23431 + }, + { + "epoch": 2.573248407643312, + "grad_norm": 1.8882681131362915, + "learning_rate": 5e-06, + "loss": 0.7604, + "mean_token_accuracy": 0.7539762258529663, + "num_tokens": 606474336.0, + "step": 23432 + }, + { + "epoch": 2.5733582253459257, + "grad_norm": 2.3530728816986084, + "learning_rate": 5e-06, + "loss": 0.67, + "mean_token_accuracy": 0.771946907043457, + "num_tokens": 606494851.0, + "step": 23433 + }, + { + "epoch": 2.5734680430485395, + "grad_norm": 2.1197550296783447, + "learning_rate": 5e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.757416844367981, + "num_tokens": 606519363.0, + "step": 23434 + }, + { + "epoch": 2.5735778607511532, + "grad_norm": 2.1527857780456543, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7682350277900696, + "num_tokens": 606540777.0, + "step": 23435 + }, + { + "epoch": 2.5736876784537666, + "grad_norm": 2.1204886436462402, + "learning_rate": 5e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7598294019699097, + "num_tokens": 606564356.0, + "step": 23436 + }, + { + "epoch": 2.5737974961563803, + "grad_norm": 2.162202835083008, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7466834783554077, + "num_tokens": 606590387.0, + "step": 23437 + }, + { + "epoch": 2.573907313858994, + "grad_norm": 2.006115198135376, + "learning_rate": 5e-06, + "loss": 0.7319, + "mean_token_accuracy": 0.757821798324585, + "num_tokens": 606620072.0, + "step": 23438 + }, + { + "epoch": 2.574017131561608, + "grad_norm": 2.1707401275634766, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7695907354354858, + "num_tokens": 606643268.0, + "step": 23439 + }, + { + "epoch": 2.5741269492642216, + "grad_norm": 2.3986995220184326, + "learning_rate": 5e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.7594605088233948, + "num_tokens": 606663079.0, + "step": 23440 + }, + { + "epoch": 2.574236766966835, + "grad_norm": 2.064605236053467, + "learning_rate": 5e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7594029903411865, + "num_tokens": 606688757.0, + "step": 23441 + }, + { + "epoch": 2.5743465846694487, + "grad_norm": 2.175177574157715, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7474896907806396, + "num_tokens": 606711870.0, + "step": 23442 + }, + { + "epoch": 2.5744564023720624, + "grad_norm": 2.285961627960205, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.768796443939209, + "num_tokens": 606732735.0, + "step": 23443 + }, + { + "epoch": 2.574566220074676, + "grad_norm": 1.9814136028289795, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7630500197410583, + "num_tokens": 606761889.0, + "step": 23444 + }, + { + "epoch": 2.57467603777729, + "grad_norm": 2.7283730506896973, + "learning_rate": 5e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.758695662021637, + "num_tokens": 606780366.0, + "step": 23445 + }, + { + "epoch": 2.5747858554799032, + "grad_norm": 2.224094867706299, + "learning_rate": 5e-06, + "loss": 0.6729, + "mean_token_accuracy": 0.7772372364997864, + "num_tokens": 606802966.0, + "step": 23446 + }, + { + "epoch": 2.574895673182517, + "grad_norm": 2.053122043609619, + "learning_rate": 5e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7602152824401855, + "num_tokens": 606829003.0, + "step": 23447 + }, + { + "epoch": 2.5750054908851308, + "grad_norm": 2.005394458770752, + "learning_rate": 5e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.7737274169921875, + "num_tokens": 606854508.0, + "step": 23448 + }, + { + "epoch": 2.575115308587744, + "grad_norm": 1.9923226833343506, + "learning_rate": 5e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7474031448364258, + "num_tokens": 606884286.0, + "step": 23449 + }, + { + "epoch": 2.5752251262903583, + "grad_norm": 2.1565442085266113, + "learning_rate": 5e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.7810955047607422, + "num_tokens": 606908284.0, + "step": 23450 + }, + { + "epoch": 2.5753349439929716, + "grad_norm": 2.3208858966827393, + "learning_rate": 5e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.755072832107544, + "num_tokens": 606930169.0, + "step": 23451 + }, + { + "epoch": 2.5754447616955853, + "grad_norm": 2.123936653137207, + "learning_rate": 5e-06, + "loss": 0.6945, + "mean_token_accuracy": 0.7716102600097656, + "num_tokens": 606955889.0, + "step": 23452 + }, + { + "epoch": 2.575554579398199, + "grad_norm": 2.2888782024383545, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7614905834197998, + "num_tokens": 606975798.0, + "step": 23453 + }, + { + "epoch": 2.5756643971008124, + "grad_norm": 2.2388346195220947, + "learning_rate": 5e-06, + "loss": 0.5895, + "mean_token_accuracy": 0.8020377159118652, + "num_tokens": 606997480.0, + "step": 23454 + }, + { + "epoch": 2.575774214803426, + "grad_norm": 1.9029628038406372, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7556987404823303, + "num_tokens": 607026370.0, + "step": 23455 + }, + { + "epoch": 2.57588403250604, + "grad_norm": 2.329545021057129, + "learning_rate": 5e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7753623127937317, + "num_tokens": 607047129.0, + "step": 23456 + }, + { + "epoch": 2.5759938502086537, + "grad_norm": 2.1710216999053955, + "learning_rate": 5e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.7762598991394043, + "num_tokens": 607070842.0, + "step": 23457 + }, + { + "epoch": 2.5761036679112674, + "grad_norm": 2.0234174728393555, + "learning_rate": 5e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7657724022865295, + "num_tokens": 607097582.0, + "step": 23458 + }, + { + "epoch": 2.5762134856138807, + "grad_norm": 2.3309106826782227, + "learning_rate": 5e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7696484923362732, + "num_tokens": 607119557.0, + "step": 23459 + }, + { + "epoch": 2.5763233033164945, + "grad_norm": 2.4572012424468994, + "learning_rate": 5e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7803311347961426, + "num_tokens": 607138898.0, + "step": 23460 + }, + { + "epoch": 2.5764331210191083, + "grad_norm": 1.9370461702346802, + "learning_rate": 5e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7554208040237427, + "num_tokens": 607168006.0, + "step": 23461 + }, + { + "epoch": 2.576542938721722, + "grad_norm": 2.0423402786254883, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7569944858551025, + "num_tokens": 607194719.0, + "step": 23462 + }, + { + "epoch": 2.5766527564243358, + "grad_norm": 1.8870421648025513, + "learning_rate": 5e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7450985908508301, + "num_tokens": 607228677.0, + "step": 23463 + }, + { + "epoch": 2.576762574126949, + "grad_norm": 1.9617619514465332, + "learning_rate": 5e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7581726312637329, + "num_tokens": 607256232.0, + "step": 23464 + }, + { + "epoch": 2.576872391829563, + "grad_norm": 2.2140543460845947, + "learning_rate": 5e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7590533494949341, + "num_tokens": 607280057.0, + "step": 23465 + }, + { + "epoch": 2.5769822095321766, + "grad_norm": 2.343376398086548, + "learning_rate": 5e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7731830477714539, + "num_tokens": 607300567.0, + "step": 23466 + }, + { + "epoch": 2.5770920272347904, + "grad_norm": 2.1492834091186523, + "learning_rate": 5e-06, + "loss": 0.6607, + "mean_token_accuracy": 0.7798696756362915, + "num_tokens": 607323480.0, + "step": 23467 + }, + { + "epoch": 2.577201844937404, + "grad_norm": 2.113422393798828, + "learning_rate": 5e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7680819630622864, + "num_tokens": 607350294.0, + "step": 23468 + }, + { + "epoch": 2.5773116626400174, + "grad_norm": 2.096618890762329, + "learning_rate": 5e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.7485882043838501, + "num_tokens": 607376399.0, + "step": 23469 + }, + { + "epoch": 2.577421480342631, + "grad_norm": 1.986021637916565, + "learning_rate": 5e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7605831623077393, + "num_tokens": 607403346.0, + "step": 23470 + }, + { + "epoch": 2.577531298045245, + "grad_norm": 2.094419002532959, + "learning_rate": 5e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7739514708518982, + "num_tokens": 607427480.0, + "step": 23471 + }, + { + "epoch": 2.5776411157478587, + "grad_norm": 2.0272154808044434, + "learning_rate": 5e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7470149993896484, + "num_tokens": 607455240.0, + "step": 23472 + }, + { + "epoch": 2.5777509334504725, + "grad_norm": 1.953375220298767, + "learning_rate": 5e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7519817352294922, + "num_tokens": 607486028.0, + "step": 23473 + }, + { + "epoch": 2.5778607511530858, + "grad_norm": 2.1481831073760986, + "learning_rate": 5e-06, + "loss": 0.704, + "mean_token_accuracy": 0.7745193243026733, + "num_tokens": 607510907.0, + "step": 23474 + }, + { + "epoch": 2.5779705688556995, + "grad_norm": 2.0714974403381348, + "learning_rate": 5e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.769589900970459, + "num_tokens": 607536263.0, + "step": 23475 + }, + { + "epoch": 2.5780803865583133, + "grad_norm": 1.9643503427505493, + "learning_rate": 5e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7606720924377441, + "num_tokens": 607564310.0, + "step": 23476 + }, + { + "epoch": 2.5781902042609266, + "grad_norm": 2.0721962451934814, + "learning_rate": 5e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7660924792289734, + "num_tokens": 607588376.0, + "step": 23477 + }, + { + "epoch": 2.578300021963541, + "grad_norm": 1.7691688537597656, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.757003128528595, + "num_tokens": 607620826.0, + "step": 23478 + }, + { + "epoch": 2.578409839666154, + "grad_norm": 1.983154058456421, + "learning_rate": 5e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7353730201721191, + "num_tokens": 607650048.0, + "step": 23479 + }, + { + "epoch": 2.578519657368768, + "grad_norm": 1.9852639436721802, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.745031476020813, + "num_tokens": 607680378.0, + "step": 23480 + }, + { + "epoch": 2.5786294750713816, + "grad_norm": 2.0632688999176025, + "learning_rate": 5e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7705895900726318, + "num_tokens": 607706205.0, + "step": 23481 + }, + { + "epoch": 2.578739292773995, + "grad_norm": 2.1090853214263916, + "learning_rate": 5e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7489628791809082, + "num_tokens": 607733984.0, + "step": 23482 + }, + { + "epoch": 2.5788491104766087, + "grad_norm": 2.406097412109375, + "learning_rate": 5e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7646948099136353, + "num_tokens": 607754776.0, + "step": 23483 + }, + { + "epoch": 2.5789589281792225, + "grad_norm": 2.093987464904785, + "learning_rate": 5e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7526458501815796, + "num_tokens": 607781931.0, + "step": 23484 + }, + { + "epoch": 2.579068745881836, + "grad_norm": 2.354121208190918, + "learning_rate": 5e-06, + "loss": 0.6499, + "mean_token_accuracy": 0.7819365859031677, + "num_tokens": 607804743.0, + "step": 23485 + }, + { + "epoch": 2.57917856358445, + "grad_norm": 1.9440687894821167, + "learning_rate": 5e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7578745484352112, + "num_tokens": 607834842.0, + "step": 23486 + }, + { + "epoch": 2.5792883812870633, + "grad_norm": 2.160914421081543, + "learning_rate": 5e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.763701319694519, + "num_tokens": 607860092.0, + "step": 23487 + }, + { + "epoch": 2.579398198989677, + "grad_norm": 2.3103668689727783, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7764273285865784, + "num_tokens": 607882653.0, + "step": 23488 + }, + { + "epoch": 2.579508016692291, + "grad_norm": 2.017404556274414, + "learning_rate": 5e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.750525712966919, + "num_tokens": 607911253.0, + "step": 23489 + }, + { + "epoch": 2.5796178343949046, + "grad_norm": 1.7707492113113403, + "learning_rate": 5e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7447037696838379, + "num_tokens": 607945921.0, + "step": 23490 + }, + { + "epoch": 2.5797276520975183, + "grad_norm": 2.1524569988250732, + "learning_rate": 5e-06, + "loss": 0.6939, + "mean_token_accuracy": 0.7758229970932007, + "num_tokens": 607969827.0, + "step": 23491 + }, + { + "epoch": 2.5798374698001316, + "grad_norm": 2.077977180480957, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7645272016525269, + "num_tokens": 607995672.0, + "step": 23492 + }, + { + "epoch": 2.5799472875027454, + "grad_norm": 2.2139949798583984, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.757815957069397, + "num_tokens": 608018847.0, + "step": 23493 + }, + { + "epoch": 2.580057105205359, + "grad_norm": 1.9372328519821167, + "learning_rate": 5e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.7635663747787476, + "num_tokens": 608048226.0, + "step": 23494 + }, + { + "epoch": 2.580166922907973, + "grad_norm": 2.0223567485809326, + "learning_rate": 5e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7780035734176636, + "num_tokens": 608074964.0, + "step": 23495 + }, + { + "epoch": 2.5802767406105866, + "grad_norm": 1.9022257328033447, + "learning_rate": 5e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7460300922393799, + "num_tokens": 608108099.0, + "step": 23496 + }, + { + "epoch": 2.5803865583132, + "grad_norm": 2.1895639896392822, + "learning_rate": 5e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7505495548248291, + "num_tokens": 608133698.0, + "step": 23497 + }, + { + "epoch": 2.5804963760158137, + "grad_norm": 2.289654493331909, + "learning_rate": 5e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7789291143417358, + "num_tokens": 608155824.0, + "step": 23498 + }, + { + "epoch": 2.5806061937184275, + "grad_norm": 2.003009796142578, + "learning_rate": 5e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7674710750579834, + "num_tokens": 608183149.0, + "step": 23499 + }, + { + "epoch": 2.5807160114210412, + "grad_norm": 2.7006478309631348, + "learning_rate": 5e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7649791240692139, + "num_tokens": 608201158.0, + "step": 23500 + }, + { + "epoch": 2.580825829123655, + "grad_norm": 2.122488498687744, + "learning_rate": 5e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.7775158882141113, + "num_tokens": 608226049.0, + "step": 23501 + }, + { + "epoch": 2.5809356468262683, + "grad_norm": 2.3647539615631104, + "learning_rate": 5e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7694640159606934, + "num_tokens": 608247442.0, + "step": 23502 + }, + { + "epoch": 2.581045464528882, + "grad_norm": 2.2623836994171143, + "learning_rate": 5e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7604265213012695, + "num_tokens": 608272396.0, + "step": 23503 + }, + { + "epoch": 2.581155282231496, + "grad_norm": 2.1552622318267822, + "learning_rate": 5e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.763875424861908, + "num_tokens": 608297395.0, + "step": 23504 + }, + { + "epoch": 2.581265099934109, + "grad_norm": 2.12117075920105, + "learning_rate": 5e-06, + "loss": 0.7223, + "mean_token_accuracy": 0.7619959115982056, + "num_tokens": 608322553.0, + "step": 23505 + }, + { + "epoch": 2.581374917636723, + "grad_norm": 2.080094814300537, + "learning_rate": 5e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7636327743530273, + "num_tokens": 608346950.0, + "step": 23506 + }, + { + "epoch": 2.5814847353393366, + "grad_norm": 1.9620438814163208, + "learning_rate": 5e-06, + "loss": 0.6759, + "mean_token_accuracy": 0.7753757238388062, + "num_tokens": 608374204.0, + "step": 23507 + }, + { + "epoch": 2.5815945530419504, + "grad_norm": 1.978492259979248, + "learning_rate": 5e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.7648496627807617, + "num_tokens": 608401269.0, + "step": 23508 + }, + { + "epoch": 2.581704370744564, + "grad_norm": 2.1900277137756348, + "learning_rate": 5e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7596402764320374, + "num_tokens": 608425234.0, + "step": 23509 + }, + { + "epoch": 2.5818141884471775, + "grad_norm": 2.194154977798462, + "learning_rate": 5e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7550604939460754, + "num_tokens": 608449530.0, + "step": 23510 + }, + { + "epoch": 2.5819240061497912, + "grad_norm": 1.9730544090270996, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7569851875305176, + "num_tokens": 608479342.0, + "step": 23511 + }, + { + "epoch": 2.582033823852405, + "grad_norm": 1.8814231157302856, + "learning_rate": 5e-06, + "loss": 0.6733, + "mean_token_accuracy": 0.7798672914505005, + "num_tokens": 608509792.0, + "step": 23512 + }, + { + "epoch": 2.5821436415550187, + "grad_norm": 2.2629289627075195, + "learning_rate": 5e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7811493277549744, + "num_tokens": 608533517.0, + "step": 23513 + }, + { + "epoch": 2.5822534592576325, + "grad_norm": 2.0107147693634033, + "learning_rate": 5e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.7730804681777954, + "num_tokens": 608560318.0, + "step": 23514 + }, + { + "epoch": 2.582363276960246, + "grad_norm": 2.2616374492645264, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7664830684661865, + "num_tokens": 608584308.0, + "step": 23515 + }, + { + "epoch": 2.5824730946628596, + "grad_norm": 2.0644891262054443, + "learning_rate": 5e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7602511644363403, + "num_tokens": 608612285.0, + "step": 23516 + }, + { + "epoch": 2.5825829123654733, + "grad_norm": 2.0762388706207275, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.764453113079071, + "num_tokens": 608639339.0, + "step": 23517 + }, + { + "epoch": 2.582692730068087, + "grad_norm": 2.05302095413208, + "learning_rate": 5e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.7653289437294006, + "num_tokens": 608663936.0, + "step": 23518 + }, + { + "epoch": 2.582802547770701, + "grad_norm": 2.12326979637146, + "learning_rate": 5e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7625482082366943, + "num_tokens": 608689650.0, + "step": 23519 + }, + { + "epoch": 2.582912365473314, + "grad_norm": 2.0880703926086426, + "learning_rate": 5e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.770469605922699, + "num_tokens": 608714571.0, + "step": 23520 + }, + { + "epoch": 2.583022183175928, + "grad_norm": 2.2497284412384033, + "learning_rate": 5e-06, + "loss": 0.6155, + "mean_token_accuracy": 0.7933316230773926, + "num_tokens": 608735276.0, + "step": 23521 + }, + { + "epoch": 2.5831320008785417, + "grad_norm": 2.1715097427368164, + "learning_rate": 5e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7360371351242065, + "num_tokens": 608760245.0, + "step": 23522 + }, + { + "epoch": 2.5832418185811554, + "grad_norm": 2.336014747619629, + "learning_rate": 5e-06, + "loss": 0.613, + "mean_token_accuracy": 0.7905840873718262, + "num_tokens": 608778937.0, + "step": 23523 + }, + { + "epoch": 2.583351636283769, + "grad_norm": 2.1619017124176025, + "learning_rate": 5e-06, + "loss": 0.6525, + "mean_token_accuracy": 0.7816348075866699, + "num_tokens": 608802997.0, + "step": 23524 + }, + { + "epoch": 2.5834614539863825, + "grad_norm": 2.051682949066162, + "learning_rate": 5e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7644534111022949, + "num_tokens": 608830321.0, + "step": 23525 + }, + { + "epoch": 2.5835712716889963, + "grad_norm": 1.9191886186599731, + "learning_rate": 5e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7594507932662964, + "num_tokens": 608857138.0, + "step": 23526 + }, + { + "epoch": 2.58368108939161, + "grad_norm": 2.1126112937927246, + "learning_rate": 5e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.782464325428009, + "num_tokens": 608884398.0, + "step": 23527 + }, + { + "epoch": 2.5837909070942233, + "grad_norm": 1.8144265413284302, + "learning_rate": 5e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7419715523719788, + "num_tokens": 608917936.0, + "step": 23528 + }, + { + "epoch": 2.5839007247968375, + "grad_norm": 1.916541337966919, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.772941529750824, + "num_tokens": 608947784.0, + "step": 23529 + }, + { + "epoch": 2.584010542499451, + "grad_norm": 2.3857147693634033, + "learning_rate": 5e-06, + "loss": 0.6641, + "mean_token_accuracy": 0.7777950763702393, + "num_tokens": 608967679.0, + "step": 23530 + }, + { + "epoch": 2.5841203602020646, + "grad_norm": 2.420708179473877, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7665886282920837, + "num_tokens": 608987239.0, + "step": 23531 + }, + { + "epoch": 2.5842301779046783, + "grad_norm": 2.048713445663452, + "learning_rate": 5e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7683397531509399, + "num_tokens": 609012041.0, + "step": 23532 + }, + { + "epoch": 2.5843399956072917, + "grad_norm": 2.2503445148468018, + "learning_rate": 5e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.785417914390564, + "num_tokens": 609032765.0, + "step": 23533 + }, + { + "epoch": 2.5844498133099054, + "grad_norm": 2.1290175914764404, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.778512179851532, + "num_tokens": 609056483.0, + "step": 23534 + }, + { + "epoch": 2.584559631012519, + "grad_norm": 2.0236117839813232, + "learning_rate": 5e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7712280750274658, + "num_tokens": 609084567.0, + "step": 23535 + }, + { + "epoch": 2.584669448715133, + "grad_norm": 2.1133015155792236, + "learning_rate": 5e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7732781171798706, + "num_tokens": 609111003.0, + "step": 23536 + }, + { + "epoch": 2.5847792664177467, + "grad_norm": 2.122344493865967, + "learning_rate": 5e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7657317519187927, + "num_tokens": 609136110.0, + "step": 23537 + }, + { + "epoch": 2.58488908412036, + "grad_norm": 1.9145238399505615, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7475141286849976, + "num_tokens": 609164540.0, + "step": 23538 + }, + { + "epoch": 2.5849989018229738, + "grad_norm": 2.126148223876953, + "learning_rate": 5e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7529460787773132, + "num_tokens": 609190991.0, + "step": 23539 + }, + { + "epoch": 2.5851087195255875, + "grad_norm": 2.0422189235687256, + "learning_rate": 5e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7409806251525879, + "num_tokens": 609219075.0, + "step": 23540 + }, + { + "epoch": 2.5852185372282013, + "grad_norm": 2.07463002204895, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7515260577201843, + "num_tokens": 609247547.0, + "step": 23541 + }, + { + "epoch": 2.585328354930815, + "grad_norm": 2.0238091945648193, + "learning_rate": 5e-06, + "loss": 0.6798, + "mean_token_accuracy": 0.7689582109451294, + "num_tokens": 609274135.0, + "step": 23542 + }, + { + "epoch": 2.5854381726334283, + "grad_norm": 1.8733004331588745, + "learning_rate": 5e-06, + "loss": 0.7041, + "mean_token_accuracy": 0.7710402011871338, + "num_tokens": 609304747.0, + "step": 23543 + }, + { + "epoch": 2.585547990336042, + "grad_norm": 2.4191668033599854, + "learning_rate": 5e-06, + "loss": 0.6208, + "mean_token_accuracy": 0.7925815582275391, + "num_tokens": 609324696.0, + "step": 23544 + }, + { + "epoch": 2.585657808038656, + "grad_norm": 2.0142292976379395, + "learning_rate": 5e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7623674869537354, + "num_tokens": 609353090.0, + "step": 23545 + }, + { + "epoch": 2.5857676257412696, + "grad_norm": 2.1168038845062256, + "learning_rate": 5e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.7704950571060181, + "num_tokens": 609377168.0, + "step": 23546 + }, + { + "epoch": 2.5858774434438834, + "grad_norm": 2.1872384548187256, + "learning_rate": 5e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7651325464248657, + "num_tokens": 609401924.0, + "step": 23547 + }, + { + "epoch": 2.5859872611464967, + "grad_norm": 2.3228538036346436, + "learning_rate": 5e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.772897481918335, + "num_tokens": 609423345.0, + "step": 23548 + }, + { + "epoch": 2.5860970788491104, + "grad_norm": 2.1557934284210205, + "learning_rate": 5e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7728922367095947, + "num_tokens": 609449359.0, + "step": 23549 + }, + { + "epoch": 2.586206896551724, + "grad_norm": 2.2279369831085205, + "learning_rate": 5e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7587395310401917, + "num_tokens": 609473870.0, + "step": 23550 + }, + { + "epoch": 2.586316714254338, + "grad_norm": 1.8915576934814453, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7532231211662292, + "num_tokens": 609502459.0, + "step": 23551 + }, + { + "epoch": 2.5864265319569517, + "grad_norm": 2.0211567878723145, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7487432956695557, + "num_tokens": 609534258.0, + "step": 23552 + }, + { + "epoch": 2.586536349659565, + "grad_norm": 2.253074884414673, + "learning_rate": 5e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7585064172744751, + "num_tokens": 609559534.0, + "step": 23553 + }, + { + "epoch": 2.586646167362179, + "grad_norm": 2.154939889907837, + "learning_rate": 5e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7615641355514526, + "num_tokens": 609585487.0, + "step": 23554 + }, + { + "epoch": 2.5867559850647925, + "grad_norm": 2.0949320793151855, + "learning_rate": 5e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7524157762527466, + "num_tokens": 609609555.0, + "step": 23555 + }, + { + "epoch": 2.586865802767406, + "grad_norm": 2.053560733795166, + "learning_rate": 5e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7605198621749878, + "num_tokens": 609636077.0, + "step": 23556 + }, + { + "epoch": 2.5869756204700196, + "grad_norm": 1.924672245979309, + "learning_rate": 5e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.760487973690033, + "num_tokens": 609663981.0, + "step": 23557 + }, + { + "epoch": 2.5870854381726334, + "grad_norm": 2.0089306831359863, + "learning_rate": 5e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7403488159179688, + "num_tokens": 609691673.0, + "step": 23558 + }, + { + "epoch": 2.587195255875247, + "grad_norm": 2.0388593673706055, + "learning_rate": 5e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7504665851593018, + "num_tokens": 609718679.0, + "step": 23559 + }, + { + "epoch": 2.587305073577861, + "grad_norm": 2.2529518604278564, + "learning_rate": 5e-06, + "loss": 0.6716, + "mean_token_accuracy": 0.7754709720611572, + "num_tokens": 609740384.0, + "step": 23560 + }, + { + "epoch": 2.587414891280474, + "grad_norm": 2.0370562076568604, + "learning_rate": 5e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7474771738052368, + "num_tokens": 609766179.0, + "step": 23561 + }, + { + "epoch": 2.587524708983088, + "grad_norm": 2.0486505031585693, + "learning_rate": 5e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.73569655418396, + "num_tokens": 609792972.0, + "step": 23562 + }, + { + "epoch": 2.5876345266857017, + "grad_norm": 2.3205325603485107, + "learning_rate": 5e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.799747884273529, + "num_tokens": 609812585.0, + "step": 23563 + }, + { + "epoch": 2.5877443443883155, + "grad_norm": 2.006866216659546, + "learning_rate": 5e-06, + "loss": 0.762, + "mean_token_accuracy": 0.754450798034668, + "num_tokens": 609836657.0, + "step": 23564 + }, + { + "epoch": 2.587854162090929, + "grad_norm": 2.002485990524292, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7777098417282104, + "num_tokens": 609863083.0, + "step": 23565 + }, + { + "epoch": 2.5879639797935425, + "grad_norm": 1.9901047945022583, + "learning_rate": 5e-06, + "loss": 0.6417, + "mean_token_accuracy": 0.7902307510375977, + "num_tokens": 609889288.0, + "step": 23566 + }, + { + "epoch": 2.5880737974961563, + "grad_norm": 2.009336471557617, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7682719230651855, + "num_tokens": 609916144.0, + "step": 23567 + }, + { + "epoch": 2.58818361519877, + "grad_norm": 2.047529458999634, + "learning_rate": 5e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7529364228248596, + "num_tokens": 609941729.0, + "step": 23568 + }, + { + "epoch": 2.588293432901384, + "grad_norm": 2.1410861015319824, + "learning_rate": 5e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7708226442337036, + "num_tokens": 609965967.0, + "step": 23569 + }, + { + "epoch": 2.5884032506039976, + "grad_norm": 2.1316535472869873, + "learning_rate": 5e-06, + "loss": 0.6703, + "mean_token_accuracy": 0.7742870450019836, + "num_tokens": 609991029.0, + "step": 23570 + }, + { + "epoch": 2.588513068306611, + "grad_norm": 2.3489181995391846, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7586289644241333, + "num_tokens": 610012759.0, + "step": 23571 + }, + { + "epoch": 2.5886228860092246, + "grad_norm": 1.9452282190322876, + "learning_rate": 5e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.777397871017456, + "num_tokens": 610039674.0, + "step": 23572 + }, + { + "epoch": 2.5887327037118384, + "grad_norm": 2.190518856048584, + "learning_rate": 5e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.780417799949646, + "num_tokens": 610062713.0, + "step": 23573 + }, + { + "epoch": 2.588842521414452, + "grad_norm": 2.087475538253784, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7738378047943115, + "num_tokens": 610089522.0, + "step": 23574 + }, + { + "epoch": 2.588952339117066, + "grad_norm": 2.1169345378875732, + "learning_rate": 5e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7535436153411865, + "num_tokens": 610112580.0, + "step": 23575 + }, + { + "epoch": 2.589062156819679, + "grad_norm": 2.3121254444122314, + "learning_rate": 5e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.7780539393424988, + "num_tokens": 610133099.0, + "step": 23576 + }, + { + "epoch": 2.589171974522293, + "grad_norm": 2.0668509006500244, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7814274430274963, + "num_tokens": 610157288.0, + "step": 23577 + }, + { + "epoch": 2.5892817922249067, + "grad_norm": 1.8826416730880737, + "learning_rate": 5e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7307952046394348, + "num_tokens": 610189281.0, + "step": 23578 + }, + { + "epoch": 2.58939160992752, + "grad_norm": 2.0825355052948, + "learning_rate": 5e-06, + "loss": 0.6736, + "mean_token_accuracy": 0.7896871566772461, + "num_tokens": 610215237.0, + "step": 23579 + }, + { + "epoch": 2.5895014276301342, + "grad_norm": 2.083324432373047, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7535373568534851, + "num_tokens": 610242462.0, + "step": 23580 + }, + { + "epoch": 2.5896112453327476, + "grad_norm": 2.0957727432250977, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7667618989944458, + "num_tokens": 610266254.0, + "step": 23581 + }, + { + "epoch": 2.5897210630353613, + "grad_norm": 2.116607666015625, + "learning_rate": 5e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7684950232505798, + "num_tokens": 610292047.0, + "step": 23582 + }, + { + "epoch": 2.589830880737975, + "grad_norm": 2.1065304279327393, + "learning_rate": 5e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.8039780259132385, + "num_tokens": 610317697.0, + "step": 23583 + }, + { + "epoch": 2.5899406984405884, + "grad_norm": 1.9884127378463745, + "learning_rate": 5e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7588721513748169, + "num_tokens": 610344267.0, + "step": 23584 + }, + { + "epoch": 2.590050516143202, + "grad_norm": 2.2066361904144287, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7670980095863342, + "num_tokens": 610368881.0, + "step": 23585 + }, + { + "epoch": 2.590160333845816, + "grad_norm": 2.219454765319824, + "learning_rate": 5e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7628029584884644, + "num_tokens": 610393870.0, + "step": 23586 + }, + { + "epoch": 2.5902701515484297, + "grad_norm": 2.0191831588745117, + "learning_rate": 5e-06, + "loss": 0.691, + "mean_token_accuracy": 0.7765819430351257, + "num_tokens": 610421042.0, + "step": 23587 + }, + { + "epoch": 2.5903799692510434, + "grad_norm": 2.0483691692352295, + "learning_rate": 5e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.7798430919647217, + "num_tokens": 610448007.0, + "step": 23588 + }, + { + "epoch": 2.5904897869536567, + "grad_norm": 1.751656413078308, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7477945685386658, + "num_tokens": 610482655.0, + "step": 23589 + }, + { + "epoch": 2.5905996046562705, + "grad_norm": 2.039231061935425, + "learning_rate": 5e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.7797830104827881, + "num_tokens": 610507241.0, + "step": 23590 + }, + { + "epoch": 2.5907094223588842, + "grad_norm": 2.491650342941284, + "learning_rate": 5e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.7771806120872498, + "num_tokens": 610528557.0, + "step": 23591 + }, + { + "epoch": 2.590819240061498, + "grad_norm": 2.0592823028564453, + "learning_rate": 5e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7585247755050659, + "num_tokens": 610556931.0, + "step": 23592 + }, + { + "epoch": 2.5909290577641118, + "grad_norm": 2.211190700531006, + "learning_rate": 5e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7847627401351929, + "num_tokens": 610579345.0, + "step": 23593 + }, + { + "epoch": 2.591038875466725, + "grad_norm": 2.211045503616333, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7648112773895264, + "num_tokens": 610604561.0, + "step": 23594 + }, + { + "epoch": 2.591148693169339, + "grad_norm": 2.1918203830718994, + "learning_rate": 5e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.760524570941925, + "num_tokens": 610629121.0, + "step": 23595 + }, + { + "epoch": 2.5912585108719526, + "grad_norm": 1.8691750764846802, + "learning_rate": 5e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7679042816162109, + "num_tokens": 610659390.0, + "step": 23596 + }, + { + "epoch": 2.5913683285745663, + "grad_norm": 1.7983782291412354, + "learning_rate": 5e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7572047710418701, + "num_tokens": 610690577.0, + "step": 23597 + }, + { + "epoch": 2.59147814627718, + "grad_norm": 1.9025776386260986, + "learning_rate": 5e-06, + "loss": 0.6654, + "mean_token_accuracy": 0.7809078097343445, + "num_tokens": 610719820.0, + "step": 23598 + }, + { + "epoch": 2.5915879639797934, + "grad_norm": 2.039700508117676, + "learning_rate": 5e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7550773024559021, + "num_tokens": 610746782.0, + "step": 23599 + }, + { + "epoch": 2.591697781682407, + "grad_norm": 1.9942288398742676, + "learning_rate": 5e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7500793933868408, + "num_tokens": 610776332.0, + "step": 23600 + }, + { + "epoch": 2.591807599385021, + "grad_norm": 2.213616371154785, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7626428604125977, + "num_tokens": 610798629.0, + "step": 23601 + }, + { + "epoch": 2.5919174170876347, + "grad_norm": 1.9410171508789062, + "learning_rate": 5e-06, + "loss": 0.8121, + "mean_token_accuracy": 0.7334756255149841, + "num_tokens": 610829677.0, + "step": 23602 + }, + { + "epoch": 2.5920272347902484, + "grad_norm": 2.3295445442199707, + "learning_rate": 5e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7621336579322815, + "num_tokens": 610852794.0, + "step": 23603 + }, + { + "epoch": 2.5921370524928617, + "grad_norm": 2.572636604309082, + "learning_rate": 5e-06, + "loss": 0.668, + "mean_token_accuracy": 0.7784134149551392, + "num_tokens": 610874114.0, + "step": 23604 + }, + { + "epoch": 2.5922468701954755, + "grad_norm": 2.371143102645874, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7585903406143188, + "num_tokens": 610895291.0, + "step": 23605 + }, + { + "epoch": 2.5923566878980893, + "grad_norm": 1.9955179691314697, + "learning_rate": 5e-06, + "loss": 0.6179, + "mean_token_accuracy": 0.7861685752868652, + "num_tokens": 610919827.0, + "step": 23606 + }, + { + "epoch": 2.5924665056007026, + "grad_norm": 1.9804061651229858, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7619962692260742, + "num_tokens": 610947961.0, + "step": 23607 + }, + { + "epoch": 2.5925763233033163, + "grad_norm": 2.065635919570923, + "learning_rate": 5e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7453323602676392, + "num_tokens": 610974151.0, + "step": 23608 + }, + { + "epoch": 2.59268614100593, + "grad_norm": 2.287659168243408, + "learning_rate": 5e-06, + "loss": 0.656, + "mean_token_accuracy": 0.7810373306274414, + "num_tokens": 610995989.0, + "step": 23609 + }, + { + "epoch": 2.592795958708544, + "grad_norm": 2.3750524520874023, + "learning_rate": 5e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7626028060913086, + "num_tokens": 611018414.0, + "step": 23610 + }, + { + "epoch": 2.5929057764111576, + "grad_norm": 1.8903535604476929, + "learning_rate": 5e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7233551740646362, + "num_tokens": 611051160.0, + "step": 23611 + }, + { + "epoch": 2.593015594113771, + "grad_norm": 2.080972194671631, + "learning_rate": 5e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.774127185344696, + "num_tokens": 611075399.0, + "step": 23612 + }, + { + "epoch": 2.5931254118163847, + "grad_norm": 2.162950277328491, + "learning_rate": 5e-06, + "loss": 0.786, + "mean_token_accuracy": 0.744240403175354, + "num_tokens": 611102398.0, + "step": 23613 + }, + { + "epoch": 2.5932352295189984, + "grad_norm": 2.492917776107788, + "learning_rate": 5e-06, + "loss": 0.6352, + "mean_token_accuracy": 0.7888481616973877, + "num_tokens": 611121140.0, + "step": 23614 + }, + { + "epoch": 2.593345047221612, + "grad_norm": 2.4078152179718018, + "learning_rate": 5e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7550101280212402, + "num_tokens": 611142001.0, + "step": 23615 + }, + { + "epoch": 2.593454864924226, + "grad_norm": 2.1505417823791504, + "learning_rate": 5e-06, + "loss": 0.6257, + "mean_token_accuracy": 0.7937994003295898, + "num_tokens": 611166064.0, + "step": 23616 + }, + { + "epoch": 2.5935646826268393, + "grad_norm": 2.0464582443237305, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7560470104217529, + "num_tokens": 611193259.0, + "step": 23617 + }, + { + "epoch": 2.593674500329453, + "grad_norm": 2.457364559173584, + "learning_rate": 5e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.7792171239852905, + "num_tokens": 611212503.0, + "step": 23618 + }, + { + "epoch": 2.5937843180320668, + "grad_norm": 1.8774607181549072, + "learning_rate": 5e-06, + "loss": 0.6693, + "mean_token_accuracy": 0.7786387205123901, + "num_tokens": 611240919.0, + "step": 23619 + }, + { + "epoch": 2.5938941357346805, + "grad_norm": 2.1840858459472656, + "learning_rate": 5e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7603419423103333, + "num_tokens": 611264787.0, + "step": 23620 + }, + { + "epoch": 2.5940039534372943, + "grad_norm": 2.1161749362945557, + "learning_rate": 5e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7494744062423706, + "num_tokens": 611289017.0, + "step": 23621 + }, + { + "epoch": 2.5941137711399076, + "grad_norm": 2.0581417083740234, + "learning_rate": 5e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7426543831825256, + "num_tokens": 611314774.0, + "step": 23622 + }, + { + "epoch": 2.5942235888425214, + "grad_norm": 1.9683191776275635, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7562757730484009, + "num_tokens": 611342649.0, + "step": 23623 + }, + { + "epoch": 2.594333406545135, + "grad_norm": 2.029796838760376, + "learning_rate": 5e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.773478627204895, + "num_tokens": 611372483.0, + "step": 23624 + }, + { + "epoch": 2.594443224247749, + "grad_norm": 2.0791537761688232, + "learning_rate": 5e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7735663652420044, + "num_tokens": 611398577.0, + "step": 23625 + }, + { + "epoch": 2.5945530419503626, + "grad_norm": 2.206268072128296, + "learning_rate": 5e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7667069435119629, + "num_tokens": 611423961.0, + "step": 23626 + }, + { + "epoch": 2.594662859652976, + "grad_norm": 1.9298442602157593, + "learning_rate": 5e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7676680088043213, + "num_tokens": 611453215.0, + "step": 23627 + }, + { + "epoch": 2.5947726773555897, + "grad_norm": 2.471285343170166, + "learning_rate": 5e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7610094547271729, + "num_tokens": 611473974.0, + "step": 23628 + }, + { + "epoch": 2.5948824950582035, + "grad_norm": 1.756343960762024, + "learning_rate": 5e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7846673727035522, + "num_tokens": 611504591.0, + "step": 23629 + }, + { + "epoch": 2.594992312760817, + "grad_norm": 2.186645746231079, + "learning_rate": 5e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7617166042327881, + "num_tokens": 611529137.0, + "step": 23630 + }, + { + "epoch": 2.595102130463431, + "grad_norm": 2.14141583442688, + "learning_rate": 5e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7468051910400391, + "num_tokens": 611555551.0, + "step": 23631 + }, + { + "epoch": 2.5952119481660443, + "grad_norm": 2.3031022548675537, + "learning_rate": 5e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.7851138114929199, + "num_tokens": 611575210.0, + "step": 23632 + }, + { + "epoch": 2.595321765868658, + "grad_norm": 2.149815320968628, + "learning_rate": 5e-06, + "loss": 0.652, + "mean_token_accuracy": 0.7819128036499023, + "num_tokens": 611598363.0, + "step": 23633 + }, + { + "epoch": 2.595431583571272, + "grad_norm": 2.2050623893737793, + "learning_rate": 5e-06, + "loss": 0.6284, + "mean_token_accuracy": 0.7870632410049438, + "num_tokens": 611619721.0, + "step": 23634 + }, + { + "epoch": 2.595541401273885, + "grad_norm": 2.2549538612365723, + "learning_rate": 5e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7458298206329346, + "num_tokens": 611644639.0, + "step": 23635 + }, + { + "epoch": 2.595651218976499, + "grad_norm": 2.0071165561676025, + "learning_rate": 5e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.7851535081863403, + "num_tokens": 611670779.0, + "step": 23636 + }, + { + "epoch": 2.5957610366791126, + "grad_norm": 1.8863892555236816, + "learning_rate": 5e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7422342896461487, + "num_tokens": 611703014.0, + "step": 23637 + }, + { + "epoch": 2.5958708543817264, + "grad_norm": 2.1540169715881348, + "learning_rate": 5e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7505614757537842, + "num_tokens": 611728178.0, + "step": 23638 + }, + { + "epoch": 2.59598067208434, + "grad_norm": 2.5132644176483154, + "learning_rate": 5e-06, + "loss": 0.6177, + "mean_token_accuracy": 0.7932698726654053, + "num_tokens": 611746433.0, + "step": 23639 + }, + { + "epoch": 2.5960904897869534, + "grad_norm": 1.993995189666748, + "learning_rate": 5e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7507396936416626, + "num_tokens": 611771849.0, + "step": 23640 + }, + { + "epoch": 2.596200307489567, + "grad_norm": 1.9214506149291992, + "learning_rate": 5e-06, + "loss": 0.7285, + "mean_token_accuracy": 0.7672631144523621, + "num_tokens": 611801864.0, + "step": 23641 + }, + { + "epoch": 2.596310125192181, + "grad_norm": 1.854821801185608, + "learning_rate": 5e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7557187080383301, + "num_tokens": 611832866.0, + "step": 23642 + }, + { + "epoch": 2.5964199428947947, + "grad_norm": 2.1099064350128174, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.758891761302948, + "num_tokens": 611857567.0, + "step": 23643 + }, + { + "epoch": 2.5965297605974085, + "grad_norm": 1.8332782983779907, + "learning_rate": 5e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7773045897483826, + "num_tokens": 611890958.0, + "step": 23644 + }, + { + "epoch": 2.596639578300022, + "grad_norm": 2.3008224964141846, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7721326351165771, + "num_tokens": 611913443.0, + "step": 23645 + }, + { + "epoch": 2.5967493960026355, + "grad_norm": 2.0447263717651367, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7741115689277649, + "num_tokens": 611939299.0, + "step": 23646 + }, + { + "epoch": 2.5968592137052493, + "grad_norm": 2.0126216411590576, + "learning_rate": 5e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7470303177833557, + "num_tokens": 611968698.0, + "step": 23647 + }, + { + "epoch": 2.596969031407863, + "grad_norm": 2.079019546508789, + "learning_rate": 5e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7693827152252197, + "num_tokens": 611995429.0, + "step": 23648 + }, + { + "epoch": 2.597078849110477, + "grad_norm": 1.947677731513977, + "learning_rate": 5e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.7824205160140991, + "num_tokens": 612024933.0, + "step": 23649 + }, + { + "epoch": 2.59718866681309, + "grad_norm": 1.7928770780563354, + "learning_rate": 5e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.772370457649231, + "num_tokens": 612057449.0, + "step": 23650 + }, + { + "epoch": 2.597298484515704, + "grad_norm": 2.1590447425842285, + "learning_rate": 5e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.7700594067573547, + "num_tokens": 612082729.0, + "step": 23651 + }, + { + "epoch": 2.5974083022183176, + "grad_norm": 2.0867631435394287, + "learning_rate": 5e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.764531135559082, + "num_tokens": 612111166.0, + "step": 23652 + }, + { + "epoch": 2.5975181199209314, + "grad_norm": 1.8051387071609497, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7672466039657593, + "num_tokens": 612142728.0, + "step": 23653 + }, + { + "epoch": 2.597627937623545, + "grad_norm": 2.178861618041992, + "learning_rate": 5e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.7756918668746948, + "num_tokens": 612165135.0, + "step": 23654 + }, + { + "epoch": 2.5977377553261585, + "grad_norm": 2.0339620113372803, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7565917372703552, + "num_tokens": 612192797.0, + "step": 23655 + }, + { + "epoch": 2.5978475730287722, + "grad_norm": 1.7941583395004272, + "learning_rate": 5e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7199729681015015, + "num_tokens": 612226345.0, + "step": 23656 + }, + { + "epoch": 2.597957390731386, + "grad_norm": 1.9948996305465698, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7765675783157349, + "num_tokens": 612252340.0, + "step": 23657 + }, + { + "epoch": 2.5980672084339993, + "grad_norm": 1.9777063131332397, + "learning_rate": 5e-06, + "loss": 0.6959, + "mean_token_accuracy": 0.769422173500061, + "num_tokens": 612279015.0, + "step": 23658 + }, + { + "epoch": 2.5981770261366135, + "grad_norm": 2.398613929748535, + "learning_rate": 5e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7904787659645081, + "num_tokens": 612298640.0, + "step": 23659 + }, + { + "epoch": 2.598286843839227, + "grad_norm": 1.7983977794647217, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7722136378288269, + "num_tokens": 612334294.0, + "step": 23660 + }, + { + "epoch": 2.5983966615418406, + "grad_norm": 1.9291895627975464, + "learning_rate": 5e-06, + "loss": 0.8139, + "mean_token_accuracy": 0.7363617420196533, + "num_tokens": 612364929.0, + "step": 23661 + }, + { + "epoch": 2.5985064792444543, + "grad_norm": 1.9136029481887817, + "learning_rate": 5e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7628558874130249, + "num_tokens": 612396053.0, + "step": 23662 + }, + { + "epoch": 2.5986162969470676, + "grad_norm": 2.1170458793640137, + "learning_rate": 5e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7515761256217957, + "num_tokens": 612425821.0, + "step": 23663 + }, + { + "epoch": 2.5987261146496814, + "grad_norm": 1.8987431526184082, + "learning_rate": 5e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7491852045059204, + "num_tokens": 612458429.0, + "step": 23664 + }, + { + "epoch": 2.598835932352295, + "grad_norm": 2.2026777267456055, + "learning_rate": 5e-06, + "loss": 0.6703, + "mean_token_accuracy": 0.779451310634613, + "num_tokens": 612481152.0, + "step": 23665 + }, + { + "epoch": 2.598945750054909, + "grad_norm": 2.251634120941162, + "learning_rate": 5e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7645782232284546, + "num_tokens": 612504417.0, + "step": 23666 + }, + { + "epoch": 2.5990555677575227, + "grad_norm": 1.8460811376571655, + "learning_rate": 5e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7479284405708313, + "num_tokens": 612533498.0, + "step": 23667 + }, + { + "epoch": 2.599165385460136, + "grad_norm": 1.9304008483886719, + "learning_rate": 5e-06, + "loss": 0.6832, + "mean_token_accuracy": 0.7775205373764038, + "num_tokens": 612559367.0, + "step": 23668 + }, + { + "epoch": 2.5992752031627497, + "grad_norm": 1.9962830543518066, + "learning_rate": 5e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7597073912620544, + "num_tokens": 612584579.0, + "step": 23669 + }, + { + "epoch": 2.5993850208653635, + "grad_norm": 2.0017406940460205, + "learning_rate": 5e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7565119862556458, + "num_tokens": 612612808.0, + "step": 23670 + }, + { + "epoch": 2.5994948385679773, + "grad_norm": 2.1224117279052734, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7529826164245605, + "num_tokens": 612639270.0, + "step": 23671 + }, + { + "epoch": 2.599604656270591, + "grad_norm": 1.996068000793457, + "learning_rate": 5e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7656213641166687, + "num_tokens": 612666971.0, + "step": 23672 + }, + { + "epoch": 2.5997144739732043, + "grad_norm": 2.0427706241607666, + "learning_rate": 5e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7434475421905518, + "num_tokens": 612694538.0, + "step": 23673 + }, + { + "epoch": 2.599824291675818, + "grad_norm": 1.953642725944519, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7759090662002563, + "num_tokens": 612719498.0, + "step": 23674 + }, + { + "epoch": 2.599934109378432, + "grad_norm": 1.8229974508285522, + "learning_rate": 5e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7533102035522461, + "num_tokens": 612749130.0, + "step": 23675 + }, + { + "epoch": 2.6000439270810456, + "grad_norm": 1.8893147706985474, + "learning_rate": 5e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7374306321144104, + "num_tokens": 612780322.0, + "step": 23676 + }, + { + "epoch": 2.6001537447836593, + "grad_norm": 1.7052656412124634, + "learning_rate": 5e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7664406895637512, + "num_tokens": 612817195.0, + "step": 23677 + }, + { + "epoch": 2.6002635624862727, + "grad_norm": 2.065187931060791, + "learning_rate": 5e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7742788791656494, + "num_tokens": 612842221.0, + "step": 23678 + }, + { + "epoch": 2.6003733801888864, + "grad_norm": 2.3696701526641846, + "learning_rate": 5e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.7743883728981018, + "num_tokens": 612863252.0, + "step": 23679 + }, + { + "epoch": 2.6004831978915, + "grad_norm": 2.1317577362060547, + "learning_rate": 5e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7486191987991333, + "num_tokens": 612888858.0, + "step": 23680 + }, + { + "epoch": 2.600593015594114, + "grad_norm": 1.9539272785186768, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7695518136024475, + "num_tokens": 612917476.0, + "step": 23681 + }, + { + "epoch": 2.6007028332967277, + "grad_norm": 2.0998153686523438, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7794904112815857, + "num_tokens": 612940961.0, + "step": 23682 + }, + { + "epoch": 2.600812650999341, + "grad_norm": 1.8884706497192383, + "learning_rate": 5e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7407833337783813, + "num_tokens": 612973846.0, + "step": 23683 + }, + { + "epoch": 2.6009224687019548, + "grad_norm": 2.314706325531006, + "learning_rate": 5e-06, + "loss": 0.68, + "mean_token_accuracy": 0.7777396440505981, + "num_tokens": 612996991.0, + "step": 23684 + }, + { + "epoch": 2.6010322864045685, + "grad_norm": 2.0621113777160645, + "learning_rate": 5e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.74742192029953, + "num_tokens": 613023799.0, + "step": 23685 + }, + { + "epoch": 2.601142104107182, + "grad_norm": 2.0190205574035645, + "learning_rate": 5e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7727150917053223, + "num_tokens": 613052135.0, + "step": 23686 + }, + { + "epoch": 2.6012519218097956, + "grad_norm": 2.2048141956329346, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7662659883499146, + "num_tokens": 613076350.0, + "step": 23687 + }, + { + "epoch": 2.6013617395124093, + "grad_norm": 2.1461024284362793, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.7744473218917847, + "num_tokens": 613100833.0, + "step": 23688 + }, + { + "epoch": 2.601471557215023, + "grad_norm": 2.315234661102295, + "learning_rate": 5e-06, + "loss": 0.6667, + "mean_token_accuracy": 0.776687502861023, + "num_tokens": 613122116.0, + "step": 23689 + }, + { + "epoch": 2.601581374917637, + "grad_norm": 2.0873639583587646, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7397366166114807, + "num_tokens": 613148798.0, + "step": 23690 + }, + { + "epoch": 2.60169119262025, + "grad_norm": 1.9337685108184814, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7629333734512329, + "num_tokens": 613176001.0, + "step": 23691 + }, + { + "epoch": 2.601801010322864, + "grad_norm": 2.0850017070770264, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7540957927703857, + "num_tokens": 613201357.0, + "step": 23692 + }, + { + "epoch": 2.6019108280254777, + "grad_norm": 2.0713555812835693, + "learning_rate": 5e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7497965097427368, + "num_tokens": 613229965.0, + "step": 23693 + }, + { + "epoch": 2.6020206457280914, + "grad_norm": 1.9032121896743774, + "learning_rate": 5e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.7624242305755615, + "num_tokens": 613256601.0, + "step": 23694 + }, + { + "epoch": 2.602130463430705, + "grad_norm": 2.378730297088623, + "learning_rate": 5e-06, + "loss": 0.6615, + "mean_token_accuracy": 0.7782394886016846, + "num_tokens": 613276508.0, + "step": 23695 + }, + { + "epoch": 2.6022402811333185, + "grad_norm": 2.3291525840759277, + "learning_rate": 5e-06, + "loss": 0.6124, + "mean_token_accuracy": 0.7931768894195557, + "num_tokens": 613298115.0, + "step": 23696 + }, + { + "epoch": 2.6023500988359323, + "grad_norm": 2.250683069229126, + "learning_rate": 5e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7596095204353333, + "num_tokens": 613321706.0, + "step": 23697 + }, + { + "epoch": 2.602459916538546, + "grad_norm": 2.085904121398926, + "learning_rate": 5e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7617449164390564, + "num_tokens": 613349724.0, + "step": 23698 + }, + { + "epoch": 2.60256973424116, + "grad_norm": 2.0732359886169434, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.774156391620636, + "num_tokens": 613375371.0, + "step": 23699 + }, + { + "epoch": 2.6026795519437735, + "grad_norm": 1.8409425020217896, + "learning_rate": 5e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7500468492507935, + "num_tokens": 613410831.0, + "step": 23700 + }, + { + "epoch": 2.602789369646387, + "grad_norm": 2.125136613845825, + "learning_rate": 5e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7497735023498535, + "num_tokens": 613436398.0, + "step": 23701 + }, + { + "epoch": 2.6028991873490006, + "grad_norm": 2.534898519515991, + "learning_rate": 5e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7746238112449646, + "num_tokens": 613454963.0, + "step": 23702 + }, + { + "epoch": 2.6030090050516144, + "grad_norm": 2.0359890460968018, + "learning_rate": 5e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7593917846679688, + "num_tokens": 613480539.0, + "step": 23703 + }, + { + "epoch": 2.603118822754228, + "grad_norm": 2.2342569828033447, + "learning_rate": 5e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7589443922042847, + "num_tokens": 613503967.0, + "step": 23704 + }, + { + "epoch": 2.603228640456842, + "grad_norm": 2.0933640003204346, + "learning_rate": 5e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7612706422805786, + "num_tokens": 613530724.0, + "step": 23705 + }, + { + "epoch": 2.603338458159455, + "grad_norm": 2.1610801219940186, + "learning_rate": 5e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7586192488670349, + "num_tokens": 613556334.0, + "step": 23706 + }, + { + "epoch": 2.603448275862069, + "grad_norm": 1.9936513900756836, + "learning_rate": 5e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.7603910565376282, + "num_tokens": 613583775.0, + "step": 23707 + }, + { + "epoch": 2.6035580935646827, + "grad_norm": 1.9299675226211548, + "learning_rate": 5e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7724405527114868, + "num_tokens": 613610802.0, + "step": 23708 + }, + { + "epoch": 2.603667911267296, + "grad_norm": 1.9689357280731201, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7496506571769714, + "num_tokens": 613638391.0, + "step": 23709 + }, + { + "epoch": 2.60377772896991, + "grad_norm": 1.677592158317566, + "learning_rate": 5e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7591055035591125, + "num_tokens": 613671595.0, + "step": 23710 + }, + { + "epoch": 2.6038875466725235, + "grad_norm": 2.2641727924346924, + "learning_rate": 5e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.7714349627494812, + "num_tokens": 613694492.0, + "step": 23711 + }, + { + "epoch": 2.6039973643751373, + "grad_norm": 1.9636650085449219, + "learning_rate": 5e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7814587354660034, + "num_tokens": 613723132.0, + "step": 23712 + }, + { + "epoch": 2.604107182077751, + "grad_norm": 1.8914003372192383, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7578171491622925, + "num_tokens": 613752105.0, + "step": 23713 + }, + { + "epoch": 2.6042169997803644, + "grad_norm": 2.138472557067871, + "learning_rate": 5e-06, + "loss": 0.732, + "mean_token_accuracy": 0.7666918635368347, + "num_tokens": 613775423.0, + "step": 23714 + }, + { + "epoch": 2.604326817482978, + "grad_norm": 2.053724527359009, + "learning_rate": 5e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7632781267166138, + "num_tokens": 613800371.0, + "step": 23715 + }, + { + "epoch": 2.604436635185592, + "grad_norm": 1.9157657623291016, + "learning_rate": 5e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.7796104550361633, + "num_tokens": 613828567.0, + "step": 23716 + }, + { + "epoch": 2.6045464528882056, + "grad_norm": 1.9006435871124268, + "learning_rate": 5e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7466450929641724, + "num_tokens": 613861007.0, + "step": 23717 + }, + { + "epoch": 2.6046562705908194, + "grad_norm": 2.146841287612915, + "learning_rate": 5e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7657209038734436, + "num_tokens": 613885577.0, + "step": 23718 + }, + { + "epoch": 2.6047660882934327, + "grad_norm": 2.0246455669403076, + "learning_rate": 5e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7418227195739746, + "num_tokens": 613913937.0, + "step": 23719 + }, + { + "epoch": 2.6048759059960465, + "grad_norm": 2.061488628387451, + "learning_rate": 5e-06, + "loss": 0.632, + "mean_token_accuracy": 0.7926595211029053, + "num_tokens": 613938374.0, + "step": 23720 + }, + { + "epoch": 2.60498572369866, + "grad_norm": 1.9276893138885498, + "learning_rate": 5e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7708076238632202, + "num_tokens": 613970616.0, + "step": 23721 + }, + { + "epoch": 2.605095541401274, + "grad_norm": 1.9778603315353394, + "learning_rate": 5e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7408198714256287, + "num_tokens": 614000678.0, + "step": 23722 + }, + { + "epoch": 2.6052053591038877, + "grad_norm": 2.055534601211548, + "learning_rate": 5e-06, + "loss": 0.6739, + "mean_token_accuracy": 0.7769100666046143, + "num_tokens": 614025385.0, + "step": 23723 + }, + { + "epoch": 2.605315176806501, + "grad_norm": 1.9691559076309204, + "learning_rate": 5e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7791353464126587, + "num_tokens": 614056169.0, + "step": 23724 + }, + { + "epoch": 2.605424994509115, + "grad_norm": 2.0654256343841553, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7679570913314819, + "num_tokens": 614081616.0, + "step": 23725 + }, + { + "epoch": 2.6055348122117286, + "grad_norm": 2.0217318534851074, + "learning_rate": 5e-06, + "loss": 0.6868, + "mean_token_accuracy": 0.7714394330978394, + "num_tokens": 614109009.0, + "step": 23726 + }, + { + "epoch": 2.6056446299143423, + "grad_norm": 1.9576078653335571, + "learning_rate": 5e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7642722129821777, + "num_tokens": 614136389.0, + "step": 23727 + }, + { + "epoch": 2.605754447616956, + "grad_norm": 2.0710270404815674, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.770211398601532, + "num_tokens": 614162480.0, + "step": 23728 + }, + { + "epoch": 2.6058642653195694, + "grad_norm": 1.7977591753005981, + "learning_rate": 5e-06, + "loss": 0.6459, + "mean_token_accuracy": 0.7866466045379639, + "num_tokens": 614193235.0, + "step": 23729 + }, + { + "epoch": 2.605974083022183, + "grad_norm": 2.07314395904541, + "learning_rate": 5e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7740561366081238, + "num_tokens": 614218707.0, + "step": 23730 + }, + { + "epoch": 2.606083900724797, + "grad_norm": 2.0505969524383545, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7570075392723083, + "num_tokens": 614244793.0, + "step": 23731 + }, + { + "epoch": 2.6061937184274107, + "grad_norm": 2.0378847122192383, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7776740789413452, + "num_tokens": 614271113.0, + "step": 23732 + }, + { + "epoch": 2.6063035361300244, + "grad_norm": 2.2776689529418945, + "learning_rate": 5e-06, + "loss": 0.6665, + "mean_token_accuracy": 0.7768357992172241, + "num_tokens": 614295381.0, + "step": 23733 + }, + { + "epoch": 2.6064133538326377, + "grad_norm": 2.167073965072632, + "learning_rate": 5e-06, + "loss": 0.7026, + "mean_token_accuracy": 0.7722312211990356, + "num_tokens": 614319279.0, + "step": 23734 + }, + { + "epoch": 2.6065231715352515, + "grad_norm": 2.1257290840148926, + "learning_rate": 5e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7552281618118286, + "num_tokens": 614343101.0, + "step": 23735 + }, + { + "epoch": 2.6066329892378652, + "grad_norm": 2.2890379428863525, + "learning_rate": 5e-06, + "loss": 0.656, + "mean_token_accuracy": 0.7918328642845154, + "num_tokens": 614363968.0, + "step": 23736 + }, + { + "epoch": 2.6067428069404786, + "grad_norm": 2.226788282394409, + "learning_rate": 5e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7722238302230835, + "num_tokens": 614387655.0, + "step": 23737 + }, + { + "epoch": 2.6068526246430923, + "grad_norm": 2.067664861679077, + "learning_rate": 5e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7648640871047974, + "num_tokens": 614413336.0, + "step": 23738 + }, + { + "epoch": 2.606962442345706, + "grad_norm": 2.0262997150421143, + "learning_rate": 5e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7561944127082825, + "num_tokens": 614441345.0, + "step": 23739 + }, + { + "epoch": 2.60707226004832, + "grad_norm": 2.4255590438842773, + "learning_rate": 5e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7605037689208984, + "num_tokens": 614462690.0, + "step": 23740 + }, + { + "epoch": 2.6071820777509336, + "grad_norm": 2.3425071239471436, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7771036624908447, + "num_tokens": 614483590.0, + "step": 23741 + }, + { + "epoch": 2.607291895453547, + "grad_norm": 2.1809823513031006, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7701619863510132, + "num_tokens": 614506171.0, + "step": 23742 + }, + { + "epoch": 2.6074017131561606, + "grad_norm": 2.148608922958374, + "learning_rate": 5e-06, + "loss": 0.6757, + "mean_token_accuracy": 0.7719151973724365, + "num_tokens": 614530050.0, + "step": 23743 + }, + { + "epoch": 2.6075115308587744, + "grad_norm": 2.1628990173339844, + "learning_rate": 5e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7536733746528625, + "num_tokens": 614553371.0, + "step": 23744 + }, + { + "epoch": 2.607621348561388, + "grad_norm": 2.042006015777588, + "learning_rate": 5e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.7674164772033691, + "num_tokens": 614579760.0, + "step": 23745 + }, + { + "epoch": 2.607731166264002, + "grad_norm": 1.971251368522644, + "learning_rate": 5e-06, + "loss": 0.6742, + "mean_token_accuracy": 0.7681124806404114, + "num_tokens": 614609348.0, + "step": 23746 + }, + { + "epoch": 2.6078409839666152, + "grad_norm": 2.235673666000366, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7698414921760559, + "num_tokens": 614631969.0, + "step": 23747 + }, + { + "epoch": 2.607950801669229, + "grad_norm": 2.088411331176758, + "learning_rate": 5e-06, + "loss": 0.7104, + "mean_token_accuracy": 0.7662209272384644, + "num_tokens": 614657451.0, + "step": 23748 + }, + { + "epoch": 2.6080606193718427, + "grad_norm": 1.97048020362854, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7597240209579468, + "num_tokens": 614686733.0, + "step": 23749 + }, + { + "epoch": 2.6081704370744565, + "grad_norm": 1.9886292219161987, + "learning_rate": 5e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7617757320404053, + "num_tokens": 614712873.0, + "step": 23750 + }, + { + "epoch": 2.6082802547770703, + "grad_norm": 2.0552313327789307, + "learning_rate": 5e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.7703192830085754, + "num_tokens": 614736947.0, + "step": 23751 + }, + { + "epoch": 2.6083900724796836, + "grad_norm": 2.062420129776001, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7619483470916748, + "num_tokens": 614760918.0, + "step": 23752 + }, + { + "epoch": 2.6084998901822973, + "grad_norm": 1.9378271102905273, + "learning_rate": 5e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7369095087051392, + "num_tokens": 614789290.0, + "step": 23753 + }, + { + "epoch": 2.608609707884911, + "grad_norm": 1.8916350603103638, + "learning_rate": 5e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7655479907989502, + "num_tokens": 614821375.0, + "step": 23754 + }, + { + "epoch": 2.608719525587525, + "grad_norm": 1.8837851285934448, + "learning_rate": 5e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7284860610961914, + "num_tokens": 614854213.0, + "step": 23755 + }, + { + "epoch": 2.6088293432901386, + "grad_norm": 2.1808643341064453, + "learning_rate": 5e-06, + "loss": 0.673, + "mean_token_accuracy": 0.7789404392242432, + "num_tokens": 614878566.0, + "step": 23756 + }, + { + "epoch": 2.608939160992752, + "grad_norm": 2.22373104095459, + "learning_rate": 5e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7619564533233643, + "num_tokens": 614901100.0, + "step": 23757 + }, + { + "epoch": 2.6090489786953657, + "grad_norm": 2.415449857711792, + "learning_rate": 5e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8071600198745728, + "num_tokens": 614919184.0, + "step": 23758 + }, + { + "epoch": 2.6091587963979794, + "grad_norm": 2.349031925201416, + "learning_rate": 5e-06, + "loss": 0.639, + "mean_token_accuracy": 0.7849310636520386, + "num_tokens": 614939089.0, + "step": 23759 + }, + { + "epoch": 2.6092686141005927, + "grad_norm": 1.9328347444534302, + "learning_rate": 5e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.7628183364868164, + "num_tokens": 614968120.0, + "step": 23760 + }, + { + "epoch": 2.609378431803207, + "grad_norm": 1.946734070777893, + "learning_rate": 5e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7584917545318604, + "num_tokens": 614995932.0, + "step": 23761 + }, + { + "epoch": 2.6094882495058203, + "grad_norm": 1.9177436828613281, + "learning_rate": 5e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7667627334594727, + "num_tokens": 615023668.0, + "step": 23762 + }, + { + "epoch": 2.609598067208434, + "grad_norm": 2.108675241470337, + "learning_rate": 5e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.7810811400413513, + "num_tokens": 615050007.0, + "step": 23763 + }, + { + "epoch": 2.6097078849110478, + "grad_norm": 1.92845618724823, + "learning_rate": 5e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7604097127914429, + "num_tokens": 615078182.0, + "step": 23764 + }, + { + "epoch": 2.609817702613661, + "grad_norm": 2.191161870956421, + "learning_rate": 5e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.7778780460357666, + "num_tokens": 615101757.0, + "step": 23765 + }, + { + "epoch": 2.609927520316275, + "grad_norm": 2.3469693660736084, + "learning_rate": 5e-06, + "loss": 0.6357, + "mean_token_accuracy": 0.7846125364303589, + "num_tokens": 615122485.0, + "step": 23766 + }, + { + "epoch": 2.6100373380188886, + "grad_norm": 1.9839563369750977, + "learning_rate": 5e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.7715833187103271, + "num_tokens": 615148970.0, + "step": 23767 + }, + { + "epoch": 2.6101471557215024, + "grad_norm": 2.2051620483398438, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7662532925605774, + "num_tokens": 615174371.0, + "step": 23768 + }, + { + "epoch": 2.610256973424116, + "grad_norm": 1.9882839918136597, + "learning_rate": 5e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7602350115776062, + "num_tokens": 615200170.0, + "step": 23769 + }, + { + "epoch": 2.6103667911267294, + "grad_norm": 2.2179958820343018, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7644238471984863, + "num_tokens": 615222741.0, + "step": 23770 + }, + { + "epoch": 2.610476608829343, + "grad_norm": 2.3076729774475098, + "learning_rate": 5e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7515391111373901, + "num_tokens": 615247258.0, + "step": 23771 + }, + { + "epoch": 2.610586426531957, + "grad_norm": 1.8254011869430542, + "learning_rate": 5e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7462577223777771, + "num_tokens": 615280599.0, + "step": 23772 + }, + { + "epoch": 2.6106962442345707, + "grad_norm": 1.9125922918319702, + "learning_rate": 5e-06, + "loss": 0.6834, + "mean_token_accuracy": 0.7795835733413696, + "num_tokens": 615308268.0, + "step": 23773 + }, + { + "epoch": 2.6108060619371845, + "grad_norm": 2.0844409465789795, + "learning_rate": 5e-06, + "loss": 0.6946, + "mean_token_accuracy": 0.7721704244613647, + "num_tokens": 615332307.0, + "step": 23774 + }, + { + "epoch": 2.6109158796397978, + "grad_norm": 2.1019668579101562, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.778650164604187, + "num_tokens": 615354663.0, + "step": 23775 + }, + { + "epoch": 2.6110256973424115, + "grad_norm": 1.8580641746520996, + "learning_rate": 5e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7535936832427979, + "num_tokens": 615387479.0, + "step": 23776 + }, + { + "epoch": 2.6111355150450253, + "grad_norm": 2.0674753189086914, + "learning_rate": 5e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7609225511550903, + "num_tokens": 615414898.0, + "step": 23777 + }, + { + "epoch": 2.611245332747639, + "grad_norm": 2.1010499000549316, + "learning_rate": 5e-06, + "loss": 0.6881, + "mean_token_accuracy": 0.7753056287765503, + "num_tokens": 615437858.0, + "step": 23778 + }, + { + "epoch": 2.611355150450253, + "grad_norm": 2.4604074954986572, + "learning_rate": 5e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7767247557640076, + "num_tokens": 615458918.0, + "step": 23779 + }, + { + "epoch": 2.611464968152866, + "grad_norm": 2.0347847938537598, + "learning_rate": 5e-06, + "loss": 0.781, + "mean_token_accuracy": 0.7546654939651489, + "num_tokens": 615485899.0, + "step": 23780 + }, + { + "epoch": 2.61157478585548, + "grad_norm": 2.0704944133758545, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7685203552246094, + "num_tokens": 615510499.0, + "step": 23781 + }, + { + "epoch": 2.6116846035580936, + "grad_norm": 2.2683095932006836, + "learning_rate": 5e-06, + "loss": 0.6687, + "mean_token_accuracy": 0.7829464673995972, + "num_tokens": 615531416.0, + "step": 23782 + }, + { + "epoch": 2.6117944212607074, + "grad_norm": 2.0940921306610107, + "learning_rate": 5e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7686812877655029, + "num_tokens": 615554910.0, + "step": 23783 + }, + { + "epoch": 2.611904238963321, + "grad_norm": 2.13303279876709, + "learning_rate": 5e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.760306715965271, + "num_tokens": 615581133.0, + "step": 23784 + }, + { + "epoch": 2.6120140566659344, + "grad_norm": 2.0034799575805664, + "learning_rate": 5e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7566134929656982, + "num_tokens": 615612533.0, + "step": 23785 + }, + { + "epoch": 2.612123874368548, + "grad_norm": 2.306663751602173, + "learning_rate": 5e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7518548965454102, + "num_tokens": 615637003.0, + "step": 23786 + }, + { + "epoch": 2.612233692071162, + "grad_norm": 2.1506946086883545, + "learning_rate": 5e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7585278749465942, + "num_tokens": 615661758.0, + "step": 23787 + }, + { + "epoch": 2.6123435097737753, + "grad_norm": 2.1609838008880615, + "learning_rate": 5e-06, + "loss": 0.5864, + "mean_token_accuracy": 0.7956008911132812, + "num_tokens": 615684902.0, + "step": 23788 + }, + { + "epoch": 2.612453327476389, + "grad_norm": 2.1053683757781982, + "learning_rate": 5e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.7904375195503235, + "num_tokens": 615706465.0, + "step": 23789 + }, + { + "epoch": 2.612563145179003, + "grad_norm": 1.8929269313812256, + "learning_rate": 5e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7534561157226562, + "num_tokens": 615736781.0, + "step": 23790 + }, + { + "epoch": 2.6126729628816165, + "grad_norm": 2.013216018676758, + "learning_rate": 5e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.7792650461196899, + "num_tokens": 615763835.0, + "step": 23791 + }, + { + "epoch": 2.6127827805842303, + "grad_norm": 2.2873315811157227, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7799862027168274, + "num_tokens": 615785380.0, + "step": 23792 + }, + { + "epoch": 2.6128925982868436, + "grad_norm": 2.3219029903411865, + "learning_rate": 5e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7462308406829834, + "num_tokens": 615808220.0, + "step": 23793 + }, + { + "epoch": 2.6130024159894574, + "grad_norm": 2.1374526023864746, + "learning_rate": 5e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7672691345214844, + "num_tokens": 615831030.0, + "step": 23794 + }, + { + "epoch": 2.613112233692071, + "grad_norm": 1.8075249195098877, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7709393501281738, + "num_tokens": 615861767.0, + "step": 23795 + }, + { + "epoch": 2.613222051394685, + "grad_norm": 2.014437675476074, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7649608850479126, + "num_tokens": 615889110.0, + "step": 23796 + }, + { + "epoch": 2.6133318690972986, + "grad_norm": 1.995917558670044, + "learning_rate": 5e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.757135272026062, + "num_tokens": 615917836.0, + "step": 23797 + }, + { + "epoch": 2.613441686799912, + "grad_norm": 2.316122055053711, + "learning_rate": 5e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.7797203063964844, + "num_tokens": 615938149.0, + "step": 23798 + }, + { + "epoch": 2.6135515045025257, + "grad_norm": 2.2757651805877686, + "learning_rate": 5e-06, + "loss": 0.7068, + "mean_token_accuracy": 0.7756986618041992, + "num_tokens": 615959677.0, + "step": 23799 + }, + { + "epoch": 2.6136613222051395, + "grad_norm": 2.095942497253418, + "learning_rate": 5e-06, + "loss": 0.76, + "mean_token_accuracy": 0.7479572296142578, + "num_tokens": 615984530.0, + "step": 23800 + }, + { + "epoch": 2.6137711399077532, + "grad_norm": 2.1455016136169434, + "learning_rate": 5e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7633671164512634, + "num_tokens": 616010312.0, + "step": 23801 + }, + { + "epoch": 2.613880957610367, + "grad_norm": 2.1439313888549805, + "learning_rate": 5e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.752699613571167, + "num_tokens": 616034685.0, + "step": 23802 + }, + { + "epoch": 2.6139907753129803, + "grad_norm": 2.1568806171417236, + "learning_rate": 5e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7534227967262268, + "num_tokens": 616062596.0, + "step": 23803 + }, + { + "epoch": 2.614100593015594, + "grad_norm": 2.1989543437957764, + "learning_rate": 5e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.768695592880249, + "num_tokens": 616089503.0, + "step": 23804 + }, + { + "epoch": 2.614210410718208, + "grad_norm": 2.097820520401001, + "learning_rate": 5e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.7796669006347656, + "num_tokens": 616114106.0, + "step": 23805 + }, + { + "epoch": 2.6143202284208216, + "grad_norm": 2.0077033042907715, + "learning_rate": 5e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7596056461334229, + "num_tokens": 616142219.0, + "step": 23806 + }, + { + "epoch": 2.6144300461234353, + "grad_norm": 2.057137966156006, + "learning_rate": 5e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.7779719233512878, + "num_tokens": 616166770.0, + "step": 23807 + }, + { + "epoch": 2.6145398638260486, + "grad_norm": 2.285999059677124, + "learning_rate": 5e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.7712517976760864, + "num_tokens": 616190403.0, + "step": 23808 + }, + { + "epoch": 2.6146496815286624, + "grad_norm": 1.987663984298706, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7638761401176453, + "num_tokens": 616220475.0, + "step": 23809 + }, + { + "epoch": 2.614759499231276, + "grad_norm": 1.9249014854431152, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7781715393066406, + "num_tokens": 616251967.0, + "step": 23810 + }, + { + "epoch": 2.61486931693389, + "grad_norm": 2.056748390197754, + "learning_rate": 5e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7800989151000977, + "num_tokens": 616276901.0, + "step": 23811 + }, + { + "epoch": 2.6149791346365037, + "grad_norm": 1.931023120880127, + "learning_rate": 5e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7396292686462402, + "num_tokens": 616306375.0, + "step": 23812 + }, + { + "epoch": 2.615088952339117, + "grad_norm": 2.1983883380889893, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7713704109191895, + "num_tokens": 616330376.0, + "step": 23813 + }, + { + "epoch": 2.6151987700417307, + "grad_norm": 2.035029649734497, + "learning_rate": 5e-06, + "loss": 0.677, + "mean_token_accuracy": 0.7743290662765503, + "num_tokens": 616355267.0, + "step": 23814 + }, + { + "epoch": 2.6153085877443445, + "grad_norm": 1.9850542545318604, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7673808336257935, + "num_tokens": 616383232.0, + "step": 23815 + }, + { + "epoch": 2.615418405446958, + "grad_norm": 2.0515310764312744, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7492731809616089, + "num_tokens": 616414017.0, + "step": 23816 + }, + { + "epoch": 2.6155282231495716, + "grad_norm": 2.15085768699646, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7706122994422913, + "num_tokens": 616439502.0, + "step": 23817 + }, + { + "epoch": 2.6156380408521853, + "grad_norm": 2.0637099742889404, + "learning_rate": 5e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7489886283874512, + "num_tokens": 616466636.0, + "step": 23818 + }, + { + "epoch": 2.615747858554799, + "grad_norm": 2.247157096862793, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7808043360710144, + "num_tokens": 616489292.0, + "step": 23819 + }, + { + "epoch": 2.615857676257413, + "grad_norm": 2.175553560256958, + "learning_rate": 5e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.7841427326202393, + "num_tokens": 616513011.0, + "step": 23820 + }, + { + "epoch": 2.615967493960026, + "grad_norm": 2.3661670684814453, + "learning_rate": 5e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7612749934196472, + "num_tokens": 616536247.0, + "step": 23821 + }, + { + "epoch": 2.61607731166264, + "grad_norm": 2.102281332015991, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7551817893981934, + "num_tokens": 616561586.0, + "step": 23822 + }, + { + "epoch": 2.6161871293652537, + "grad_norm": 2.0943901538848877, + "learning_rate": 5e-06, + "loss": 0.6621, + "mean_token_accuracy": 0.7825039625167847, + "num_tokens": 616587098.0, + "step": 23823 + }, + { + "epoch": 2.6162969470678674, + "grad_norm": 2.128629446029663, + "learning_rate": 5e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7750030755996704, + "num_tokens": 616612378.0, + "step": 23824 + }, + { + "epoch": 2.616406764770481, + "grad_norm": 2.1074442863464355, + "learning_rate": 5e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.758545994758606, + "num_tokens": 616640918.0, + "step": 23825 + }, + { + "epoch": 2.6165165824730945, + "grad_norm": 2.0696122646331787, + "learning_rate": 5e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7621798515319824, + "num_tokens": 616668776.0, + "step": 23826 + }, + { + "epoch": 2.6166264001757082, + "grad_norm": 2.0150582790374756, + "learning_rate": 5e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7476933002471924, + "num_tokens": 616697758.0, + "step": 23827 + }, + { + "epoch": 2.616736217878322, + "grad_norm": 1.9901885986328125, + "learning_rate": 5e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7471329569816589, + "num_tokens": 616727684.0, + "step": 23828 + }, + { + "epoch": 2.6168460355809358, + "grad_norm": 1.973052740097046, + "learning_rate": 5e-06, + "loss": 0.756, + "mean_token_accuracy": 0.755171000957489, + "num_tokens": 616755333.0, + "step": 23829 + }, + { + "epoch": 2.6169558532835495, + "grad_norm": 1.9503393173217773, + "learning_rate": 5e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.7707943320274353, + "num_tokens": 616785458.0, + "step": 23830 + }, + { + "epoch": 2.617065670986163, + "grad_norm": 2.3098723888397217, + "learning_rate": 5e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7647030353546143, + "num_tokens": 616809252.0, + "step": 23831 + }, + { + "epoch": 2.6171754886887766, + "grad_norm": 2.0087742805480957, + "learning_rate": 5e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.7918627262115479, + "num_tokens": 616833676.0, + "step": 23832 + }, + { + "epoch": 2.6172853063913903, + "grad_norm": 2.0934512615203857, + "learning_rate": 5e-06, + "loss": 0.6506, + "mean_token_accuracy": 0.7808637619018555, + "num_tokens": 616860312.0, + "step": 23833 + }, + { + "epoch": 2.617395124094004, + "grad_norm": 2.2132163047790527, + "learning_rate": 5e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7612358331680298, + "num_tokens": 616882967.0, + "step": 23834 + }, + { + "epoch": 2.617504941796618, + "grad_norm": 2.433210611343384, + "learning_rate": 5e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.7763814926147461, + "num_tokens": 616904751.0, + "step": 23835 + }, + { + "epoch": 2.617614759499231, + "grad_norm": 2.1413655281066895, + "learning_rate": 5e-06, + "loss": 0.6479, + "mean_token_accuracy": 0.7835797071456909, + "num_tokens": 616930089.0, + "step": 23836 + }, + { + "epoch": 2.617724577201845, + "grad_norm": 2.0523202419281006, + "learning_rate": 5e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7558478116989136, + "num_tokens": 616958855.0, + "step": 23837 + }, + { + "epoch": 2.6178343949044587, + "grad_norm": 2.0355136394500732, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7672454714775085, + "num_tokens": 616986440.0, + "step": 23838 + }, + { + "epoch": 2.617944212607072, + "grad_norm": 2.158416986465454, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7820795774459839, + "num_tokens": 617009962.0, + "step": 23839 + }, + { + "epoch": 2.618054030309686, + "grad_norm": 1.9562731981277466, + "learning_rate": 5e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7639639377593994, + "num_tokens": 617039590.0, + "step": 23840 + }, + { + "epoch": 2.6181638480122995, + "grad_norm": 2.3792660236358643, + "learning_rate": 5e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.768742024898529, + "num_tokens": 617062057.0, + "step": 23841 + }, + { + "epoch": 2.6182736657149133, + "grad_norm": 2.216299533843994, + "learning_rate": 5e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.7875780463218689, + "num_tokens": 617082428.0, + "step": 23842 + }, + { + "epoch": 2.618383483417527, + "grad_norm": 2.051314115524292, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7693305611610413, + "num_tokens": 617108799.0, + "step": 23843 + }, + { + "epoch": 2.6184933011201403, + "grad_norm": 2.099544048309326, + "learning_rate": 5e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7608015537261963, + "num_tokens": 617135835.0, + "step": 23844 + }, + { + "epoch": 2.618603118822754, + "grad_norm": 2.38149094581604, + "learning_rate": 5e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.760309100151062, + "num_tokens": 617156910.0, + "step": 23845 + }, + { + "epoch": 2.618712936525368, + "grad_norm": 2.361870050430298, + "learning_rate": 5e-06, + "loss": 0.6599, + "mean_token_accuracy": 0.7838104367256165, + "num_tokens": 617177111.0, + "step": 23846 + }, + { + "epoch": 2.6188227542279816, + "grad_norm": 2.1495301723480225, + "learning_rate": 5e-06, + "loss": 0.5727, + "mean_token_accuracy": 0.8056472539901733, + "num_tokens": 617200733.0, + "step": 23847 + }, + { + "epoch": 2.6189325719305954, + "grad_norm": 2.444645404815674, + "learning_rate": 5e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7685182094573975, + "num_tokens": 617221428.0, + "step": 23848 + }, + { + "epoch": 2.6190423896332087, + "grad_norm": 2.385662078857422, + "learning_rate": 5e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.774648904800415, + "num_tokens": 617242774.0, + "step": 23849 + }, + { + "epoch": 2.6191522073358224, + "grad_norm": 2.6780316829681396, + "learning_rate": 5e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7685335874557495, + "num_tokens": 617264931.0, + "step": 23850 + }, + { + "epoch": 2.619262025038436, + "grad_norm": 2.0175371170043945, + "learning_rate": 5e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7405232787132263, + "num_tokens": 617295671.0, + "step": 23851 + }, + { + "epoch": 2.61937184274105, + "grad_norm": 1.8301533460617065, + "learning_rate": 5e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7372019290924072, + "num_tokens": 617327067.0, + "step": 23852 + }, + { + "epoch": 2.6194816604436637, + "grad_norm": 2.363541603088379, + "learning_rate": 5e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7618257999420166, + "num_tokens": 617348065.0, + "step": 23853 + }, + { + "epoch": 2.619591478146277, + "grad_norm": 2.0043482780456543, + "learning_rate": 5e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.7763904333114624, + "num_tokens": 617372838.0, + "step": 23854 + }, + { + "epoch": 2.6197012958488908, + "grad_norm": 1.9436861276626587, + "learning_rate": 5e-06, + "loss": 0.6564, + "mean_token_accuracy": 0.786215603351593, + "num_tokens": 617401151.0, + "step": 23855 + }, + { + "epoch": 2.6198111135515045, + "grad_norm": 2.0967252254486084, + "learning_rate": 5e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7658650279045105, + "num_tokens": 617427019.0, + "step": 23856 + }, + { + "epoch": 2.6199209312541183, + "grad_norm": 2.1839144229888916, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7822444438934326, + "num_tokens": 617450712.0, + "step": 23857 + }, + { + "epoch": 2.620030748956732, + "grad_norm": 2.054211378097534, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.767947793006897, + "num_tokens": 617475776.0, + "step": 23858 + }, + { + "epoch": 2.6201405666593454, + "grad_norm": 1.83321213722229, + "learning_rate": 5e-06, + "loss": 0.7883, + "mean_token_accuracy": 0.7466299533843994, + "num_tokens": 617508131.0, + "step": 23859 + }, + { + "epoch": 2.620250384361959, + "grad_norm": 2.083735227584839, + "learning_rate": 5e-06, + "loss": 0.6796, + "mean_token_accuracy": 0.7737616896629333, + "num_tokens": 617532041.0, + "step": 23860 + }, + { + "epoch": 2.620360202064573, + "grad_norm": 2.2253222465515137, + "learning_rate": 5e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7649845480918884, + "num_tokens": 617554529.0, + "step": 23861 + }, + { + "epoch": 2.6204700197671866, + "grad_norm": 2.0304553508758545, + "learning_rate": 5e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.7547626495361328, + "num_tokens": 617580108.0, + "step": 23862 + }, + { + "epoch": 2.6205798374698004, + "grad_norm": 2.058429479598999, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7626982927322388, + "num_tokens": 617608611.0, + "step": 23863 + }, + { + "epoch": 2.6206896551724137, + "grad_norm": 2.3014447689056396, + "learning_rate": 5e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7596250772476196, + "num_tokens": 617633183.0, + "step": 23864 + }, + { + "epoch": 2.6207994728750275, + "grad_norm": 2.1154868602752686, + "learning_rate": 5e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7545715570449829, + "num_tokens": 617661691.0, + "step": 23865 + }, + { + "epoch": 2.620909290577641, + "grad_norm": 1.8336797952651978, + "learning_rate": 5e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7388876676559448, + "num_tokens": 617697521.0, + "step": 23866 + }, + { + "epoch": 2.6210191082802545, + "grad_norm": 1.844441294670105, + "learning_rate": 5e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7680986523628235, + "num_tokens": 617730801.0, + "step": 23867 + }, + { + "epoch": 2.6211289259828683, + "grad_norm": 2.204941749572754, + "learning_rate": 5e-06, + "loss": 0.6358, + "mean_token_accuracy": 0.7932519316673279, + "num_tokens": 617753041.0, + "step": 23868 + }, + { + "epoch": 2.621238743685482, + "grad_norm": 2.1712396144866943, + "learning_rate": 5e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7474027276039124, + "num_tokens": 617780602.0, + "step": 23869 + }, + { + "epoch": 2.621348561388096, + "grad_norm": 1.9246361255645752, + "learning_rate": 5e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.751408040523529, + "num_tokens": 617812406.0, + "step": 23870 + }, + { + "epoch": 2.6214583790907096, + "grad_norm": 2.005413293838501, + "learning_rate": 5e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7449973821640015, + "num_tokens": 617841899.0, + "step": 23871 + }, + { + "epoch": 2.621568196793323, + "grad_norm": 2.4594404697418213, + "learning_rate": 5e-06, + "loss": 0.6597, + "mean_token_accuracy": 0.7822412252426147, + "num_tokens": 617862217.0, + "step": 23872 + }, + { + "epoch": 2.6216780144959366, + "grad_norm": 2.261643648147583, + "learning_rate": 5e-06, + "loss": 0.6423, + "mean_token_accuracy": 0.7819807529449463, + "num_tokens": 617883883.0, + "step": 23873 + }, + { + "epoch": 2.6217878321985504, + "grad_norm": 1.8028157949447632, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7693195343017578, + "num_tokens": 617915698.0, + "step": 23874 + }, + { + "epoch": 2.621897649901164, + "grad_norm": 2.0389673709869385, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7596787214279175, + "num_tokens": 617943415.0, + "step": 23875 + }, + { + "epoch": 2.622007467603778, + "grad_norm": 1.918731451034546, + "learning_rate": 5e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.7665034532546997, + "num_tokens": 617972515.0, + "step": 23876 + }, + { + "epoch": 2.622117285306391, + "grad_norm": 2.0807242393493652, + "learning_rate": 5e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7614580392837524, + "num_tokens": 617998745.0, + "step": 23877 + }, + { + "epoch": 2.622227103009005, + "grad_norm": 2.02773380279541, + "learning_rate": 5e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7642384767532349, + "num_tokens": 618023542.0, + "step": 23878 + }, + { + "epoch": 2.6223369207116187, + "grad_norm": 2.1763484477996826, + "learning_rate": 5e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7577572464942932, + "num_tokens": 618045402.0, + "step": 23879 + }, + { + "epoch": 2.6224467384142325, + "grad_norm": 2.4448366165161133, + "learning_rate": 5e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7601833939552307, + "num_tokens": 618068302.0, + "step": 23880 + }, + { + "epoch": 2.6225565561168462, + "grad_norm": 1.7849392890930176, + "learning_rate": 5e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.753668487071991, + "num_tokens": 618102437.0, + "step": 23881 + }, + { + "epoch": 2.6226663738194596, + "grad_norm": 2.061875581741333, + "learning_rate": 5e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7656840085983276, + "num_tokens": 618132542.0, + "step": 23882 + }, + { + "epoch": 2.6227761915220733, + "grad_norm": 2.114837169647217, + "learning_rate": 5e-06, + "loss": 0.6722, + "mean_token_accuracy": 0.7709497213363647, + "num_tokens": 618155193.0, + "step": 23883 + }, + { + "epoch": 2.622886009224687, + "grad_norm": 2.2017605304718018, + "learning_rate": 5e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7498754858970642, + "num_tokens": 618179730.0, + "step": 23884 + }, + { + "epoch": 2.622995826927301, + "grad_norm": 2.0368432998657227, + "learning_rate": 5e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.762171745300293, + "num_tokens": 618207537.0, + "step": 23885 + }, + { + "epoch": 2.6231056446299146, + "grad_norm": 2.1903936862945557, + "learning_rate": 5e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7647513747215271, + "num_tokens": 618231438.0, + "step": 23886 + }, + { + "epoch": 2.623215462332528, + "grad_norm": 2.0128731727600098, + "learning_rate": 5e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.7754249572753906, + "num_tokens": 618257318.0, + "step": 23887 + }, + { + "epoch": 2.6233252800351416, + "grad_norm": 1.9844378232955933, + "learning_rate": 5e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7594881653785706, + "num_tokens": 618285488.0, + "step": 23888 + }, + { + "epoch": 2.6234350977377554, + "grad_norm": 2.283017158508301, + "learning_rate": 5e-06, + "loss": 0.6134, + "mean_token_accuracy": 0.7915060520172119, + "num_tokens": 618306058.0, + "step": 23889 + }, + { + "epoch": 2.6235449154403687, + "grad_norm": 1.977339506149292, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.754299521446228, + "num_tokens": 618335406.0, + "step": 23890 + }, + { + "epoch": 2.623654733142983, + "grad_norm": 2.010493040084839, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7854017019271851, + "num_tokens": 618360677.0, + "step": 23891 + }, + { + "epoch": 2.6237645508455962, + "grad_norm": 1.9355337619781494, + "learning_rate": 5e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7667850255966187, + "num_tokens": 618390181.0, + "step": 23892 + }, + { + "epoch": 2.62387436854821, + "grad_norm": 1.9875849485397339, + "learning_rate": 5e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7469284534454346, + "num_tokens": 618419718.0, + "step": 23893 + }, + { + "epoch": 2.6239841862508237, + "grad_norm": 1.9894648790359497, + "learning_rate": 5e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7552988529205322, + "num_tokens": 618448191.0, + "step": 23894 + }, + { + "epoch": 2.624094003953437, + "grad_norm": 1.882364273071289, + "learning_rate": 5e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7631216049194336, + "num_tokens": 618477828.0, + "step": 23895 + }, + { + "epoch": 2.624203821656051, + "grad_norm": 1.9568498134613037, + "learning_rate": 5e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.7636129856109619, + "num_tokens": 618507144.0, + "step": 23896 + }, + { + "epoch": 2.6243136393586646, + "grad_norm": 1.9483952522277832, + "learning_rate": 5e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7611266374588013, + "num_tokens": 618537585.0, + "step": 23897 + }, + { + "epoch": 2.6244234570612783, + "grad_norm": 1.9217298030853271, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.767074704170227, + "num_tokens": 618565056.0, + "step": 23898 + }, + { + "epoch": 2.624533274763892, + "grad_norm": 2.1470584869384766, + "learning_rate": 5e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7671476602554321, + "num_tokens": 618588756.0, + "step": 23899 + }, + { + "epoch": 2.6246430924665054, + "grad_norm": 2.1069040298461914, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7425737977027893, + "num_tokens": 618614951.0, + "step": 23900 + }, + { + "epoch": 2.624752910169119, + "grad_norm": 1.9759198427200317, + "learning_rate": 5e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.7557767629623413, + "num_tokens": 618645265.0, + "step": 23901 + }, + { + "epoch": 2.624862727871733, + "grad_norm": 2.076594352722168, + "learning_rate": 5e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7484235763549805, + "num_tokens": 618673209.0, + "step": 23902 + }, + { + "epoch": 2.6249725455743467, + "grad_norm": 2.0050454139709473, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7705698013305664, + "num_tokens": 618701154.0, + "step": 23903 + }, + { + "epoch": 2.6250823632769604, + "grad_norm": 2.1041059494018555, + "learning_rate": 5e-06, + "loss": 0.6726, + "mean_token_accuracy": 0.7791626453399658, + "num_tokens": 618725124.0, + "step": 23904 + }, + { + "epoch": 2.6251921809795737, + "grad_norm": 1.912124752998352, + "learning_rate": 5e-06, + "loss": 0.6516, + "mean_token_accuracy": 0.7787408828735352, + "num_tokens": 618750931.0, + "step": 23905 + }, + { + "epoch": 2.6253019986821875, + "grad_norm": 2.027599573135376, + "learning_rate": 5e-06, + "loss": 0.7484, + "mean_token_accuracy": 0.7519725561141968, + "num_tokens": 618777508.0, + "step": 23906 + }, + { + "epoch": 2.6254118163848013, + "grad_norm": 2.0212957859039307, + "learning_rate": 5e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.770318329334259, + "num_tokens": 618803762.0, + "step": 23907 + }, + { + "epoch": 2.625521634087415, + "grad_norm": 2.068859577178955, + "learning_rate": 5e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7798466682434082, + "num_tokens": 618828460.0, + "step": 23908 + }, + { + "epoch": 2.6256314517900288, + "grad_norm": 1.8948063850402832, + "learning_rate": 5e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7595950365066528, + "num_tokens": 618858328.0, + "step": 23909 + }, + { + "epoch": 2.625741269492642, + "grad_norm": 2.326624631881714, + "learning_rate": 5e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7613238096237183, + "num_tokens": 618880828.0, + "step": 23910 + }, + { + "epoch": 2.625851087195256, + "grad_norm": 2.4136135578155518, + "learning_rate": 5e-06, + "loss": 0.6245, + "mean_token_accuracy": 0.7895821332931519, + "num_tokens": 618899141.0, + "step": 23911 + }, + { + "epoch": 2.6259609048978696, + "grad_norm": 2.24617338180542, + "learning_rate": 5e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7746458649635315, + "num_tokens": 618921685.0, + "step": 23912 + }, + { + "epoch": 2.6260707226004834, + "grad_norm": 2.2617263793945312, + "learning_rate": 5e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7591395378112793, + "num_tokens": 618947723.0, + "step": 23913 + }, + { + "epoch": 2.626180540303097, + "grad_norm": 2.111340284347534, + "learning_rate": 5e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7706277966499329, + "num_tokens": 618972243.0, + "step": 23914 + }, + { + "epoch": 2.6262903580057104, + "grad_norm": 1.965589165687561, + "learning_rate": 5e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7411682605743408, + "num_tokens": 619002924.0, + "step": 23915 + }, + { + "epoch": 2.626400175708324, + "grad_norm": 1.8752222061157227, + "learning_rate": 5e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.7617026567459106, + "num_tokens": 619031431.0, + "step": 23916 + }, + { + "epoch": 2.626509993410938, + "grad_norm": 1.864603877067566, + "learning_rate": 5e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7731391787528992, + "num_tokens": 619062077.0, + "step": 23917 + }, + { + "epoch": 2.6266198111135513, + "grad_norm": 2.355215072631836, + "learning_rate": 5e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7758587002754211, + "num_tokens": 619082957.0, + "step": 23918 + }, + { + "epoch": 2.626729628816165, + "grad_norm": 2.1840620040893555, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7625009417533875, + "num_tokens": 619107059.0, + "step": 23919 + }, + { + "epoch": 2.6268394465187788, + "grad_norm": 2.0948636531829834, + "learning_rate": 5e-06, + "loss": 0.6955, + "mean_token_accuracy": 0.7782108187675476, + "num_tokens": 619130797.0, + "step": 23920 + }, + { + "epoch": 2.6269492642213925, + "grad_norm": 2.214806079864502, + "learning_rate": 5e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7786535024642944, + "num_tokens": 619152356.0, + "step": 23921 + }, + { + "epoch": 2.6270590819240063, + "grad_norm": 2.0540027618408203, + "learning_rate": 5e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7593637704849243, + "num_tokens": 619179051.0, + "step": 23922 + }, + { + "epoch": 2.6271688996266196, + "grad_norm": 2.185138702392578, + "learning_rate": 5e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7681665420532227, + "num_tokens": 619202367.0, + "step": 23923 + }, + { + "epoch": 2.6272787173292333, + "grad_norm": 2.382277488708496, + "learning_rate": 5e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7611310482025146, + "num_tokens": 619224151.0, + "step": 23924 + }, + { + "epoch": 2.627388535031847, + "grad_norm": 2.008870840072632, + "learning_rate": 5e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.756434977054596, + "num_tokens": 619252299.0, + "step": 23925 + }, + { + "epoch": 2.627498352734461, + "grad_norm": 2.025994062423706, + "learning_rate": 5e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7554314136505127, + "num_tokens": 619278534.0, + "step": 23926 + }, + { + "epoch": 2.6276081704370746, + "grad_norm": 2.355975866317749, + "learning_rate": 5e-06, + "loss": 0.6725, + "mean_token_accuracy": 0.7751476764678955, + "num_tokens": 619299264.0, + "step": 23927 + }, + { + "epoch": 2.627717988139688, + "grad_norm": 2.216815948486328, + "learning_rate": 5e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7746058702468872, + "num_tokens": 619322930.0, + "step": 23928 + }, + { + "epoch": 2.6278278058423017, + "grad_norm": 2.1343040466308594, + "learning_rate": 5e-06, + "loss": 0.6655, + "mean_token_accuracy": 0.7832313179969788, + "num_tokens": 619347632.0, + "step": 23929 + }, + { + "epoch": 2.6279376235449154, + "grad_norm": 1.9339345693588257, + "learning_rate": 5e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7672099471092224, + "num_tokens": 619380395.0, + "step": 23930 + }, + { + "epoch": 2.628047441247529, + "grad_norm": 2.1160075664520264, + "learning_rate": 5e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7533515095710754, + "num_tokens": 619408911.0, + "step": 23931 + }, + { + "epoch": 2.628157258950143, + "grad_norm": 2.196730136871338, + "learning_rate": 5e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7660670280456543, + "num_tokens": 619430561.0, + "step": 23932 + }, + { + "epoch": 2.6282670766527563, + "grad_norm": 2.2101473808288574, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7708227038383484, + "num_tokens": 619453380.0, + "step": 23933 + }, + { + "epoch": 2.62837689435537, + "grad_norm": 2.2237436771392822, + "learning_rate": 5e-06, + "loss": 0.6627, + "mean_token_accuracy": 0.7855101823806763, + "num_tokens": 619475412.0, + "step": 23934 + }, + { + "epoch": 2.628486712057984, + "grad_norm": 1.9830386638641357, + "learning_rate": 5e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7423056364059448, + "num_tokens": 619504624.0, + "step": 23935 + }, + { + "epoch": 2.6285965297605975, + "grad_norm": 2.129249095916748, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.7733372449874878, + "num_tokens": 619529393.0, + "step": 23936 + }, + { + "epoch": 2.6287063474632113, + "grad_norm": 2.222499370574951, + "learning_rate": 5e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7535344958305359, + "num_tokens": 619555672.0, + "step": 23937 + }, + { + "epoch": 2.6288161651658246, + "grad_norm": 2.131563186645508, + "learning_rate": 5e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7463036775588989, + "num_tokens": 619582522.0, + "step": 23938 + }, + { + "epoch": 2.6289259828684384, + "grad_norm": 2.406278371810913, + "learning_rate": 5e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.778349757194519, + "num_tokens": 619603229.0, + "step": 23939 + }, + { + "epoch": 2.629035800571052, + "grad_norm": 2.2276804447174072, + "learning_rate": 5e-06, + "loss": 0.6953, + "mean_token_accuracy": 0.7817508578300476, + "num_tokens": 619628224.0, + "step": 23940 + }, + { + "epoch": 2.6291456182736654, + "grad_norm": 2.0444133281707764, + "learning_rate": 5e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7638901472091675, + "num_tokens": 619655461.0, + "step": 23941 + }, + { + "epoch": 2.6292554359762796, + "grad_norm": 2.235553026199341, + "learning_rate": 5e-06, + "loss": 0.6798, + "mean_token_accuracy": 0.7738442420959473, + "num_tokens": 619677905.0, + "step": 23942 + }, + { + "epoch": 2.629365253678893, + "grad_norm": 2.276435375213623, + "learning_rate": 5e-06, + "loss": 0.7076, + "mean_token_accuracy": 0.7685983180999756, + "num_tokens": 619701969.0, + "step": 23943 + }, + { + "epoch": 2.6294750713815067, + "grad_norm": 2.1233859062194824, + "learning_rate": 5e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7529904842376709, + "num_tokens": 619726202.0, + "step": 23944 + }, + { + "epoch": 2.6295848890841205, + "grad_norm": 1.992087483406067, + "learning_rate": 5e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7709203958511353, + "num_tokens": 619751887.0, + "step": 23945 + }, + { + "epoch": 2.629694706786734, + "grad_norm": 2.1707639694213867, + "learning_rate": 5e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7656733989715576, + "num_tokens": 619778065.0, + "step": 23946 + }, + { + "epoch": 2.6298045244893475, + "grad_norm": 2.0485329627990723, + "learning_rate": 5e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7527966499328613, + "num_tokens": 619805499.0, + "step": 23947 + }, + { + "epoch": 2.6299143421919613, + "grad_norm": 2.1632771492004395, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7711117267608643, + "num_tokens": 619829888.0, + "step": 23948 + }, + { + "epoch": 2.630024159894575, + "grad_norm": 2.1569292545318604, + "learning_rate": 5e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.770108699798584, + "num_tokens": 619855700.0, + "step": 23949 + }, + { + "epoch": 2.630133977597189, + "grad_norm": 2.0964787006378174, + "learning_rate": 5e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7618929147720337, + "num_tokens": 619883009.0, + "step": 23950 + }, + { + "epoch": 2.630243795299802, + "grad_norm": 1.9588686227798462, + "learning_rate": 5e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7585338354110718, + "num_tokens": 619908171.0, + "step": 23951 + }, + { + "epoch": 2.630353613002416, + "grad_norm": 1.8301395177841187, + "learning_rate": 5e-06, + "loss": 0.6129, + "mean_token_accuracy": 0.7937201857566833, + "num_tokens": 619937859.0, + "step": 23952 + }, + { + "epoch": 2.6304634307050296, + "grad_norm": 2.0838472843170166, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7616478204727173, + "num_tokens": 619962306.0, + "step": 23953 + }, + { + "epoch": 2.6305732484076434, + "grad_norm": 1.9932042360305786, + "learning_rate": 5e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7665750980377197, + "num_tokens": 619989684.0, + "step": 23954 + }, + { + "epoch": 2.630683066110257, + "grad_norm": 2.281318187713623, + "learning_rate": 5e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7412112951278687, + "num_tokens": 620013845.0, + "step": 23955 + }, + { + "epoch": 2.6307928838128705, + "grad_norm": 2.1041104793548584, + "learning_rate": 5e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7753798365592957, + "num_tokens": 620036840.0, + "step": 23956 + }, + { + "epoch": 2.630902701515484, + "grad_norm": 2.1776700019836426, + "learning_rate": 5e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7672855854034424, + "num_tokens": 620061969.0, + "step": 23957 + }, + { + "epoch": 2.631012519218098, + "grad_norm": 1.978281855583191, + "learning_rate": 5e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7499369978904724, + "num_tokens": 620091686.0, + "step": 23958 + }, + { + "epoch": 2.6311223369207117, + "grad_norm": 2.1737537384033203, + "learning_rate": 5e-06, + "loss": 0.6843, + "mean_token_accuracy": 0.7740784883499146, + "num_tokens": 620113098.0, + "step": 23959 + }, + { + "epoch": 2.6312321546233255, + "grad_norm": 2.2994253635406494, + "learning_rate": 5e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7790579795837402, + "num_tokens": 620135106.0, + "step": 23960 + }, + { + "epoch": 2.631341972325939, + "grad_norm": 1.9044709205627441, + "learning_rate": 5e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7560734152793884, + "num_tokens": 620165920.0, + "step": 23961 + }, + { + "epoch": 2.6314517900285526, + "grad_norm": 2.2131969928741455, + "learning_rate": 5e-06, + "loss": 0.671, + "mean_token_accuracy": 0.7873432040214539, + "num_tokens": 620188135.0, + "step": 23962 + }, + { + "epoch": 2.6315616077311663, + "grad_norm": 2.303443193435669, + "learning_rate": 5e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7703351378440857, + "num_tokens": 620211612.0, + "step": 23963 + }, + { + "epoch": 2.63167142543378, + "grad_norm": 2.1346981525421143, + "learning_rate": 5e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7491763234138489, + "num_tokens": 620236646.0, + "step": 23964 + }, + { + "epoch": 2.631781243136394, + "grad_norm": 2.4105677604675293, + "learning_rate": 5e-06, + "loss": 0.6507, + "mean_token_accuracy": 0.7865665555000305, + "num_tokens": 620256736.0, + "step": 23965 + }, + { + "epoch": 2.631891060839007, + "grad_norm": 2.338467597961426, + "learning_rate": 5e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7649349570274353, + "num_tokens": 620277861.0, + "step": 23966 + }, + { + "epoch": 2.632000878541621, + "grad_norm": 2.065706729888916, + "learning_rate": 5e-06, + "loss": 0.6176, + "mean_token_accuracy": 0.8012344241142273, + "num_tokens": 620302006.0, + "step": 23967 + }, + { + "epoch": 2.6321106962442347, + "grad_norm": 2.0441761016845703, + "learning_rate": 5e-06, + "loss": 0.676, + "mean_token_accuracy": 0.7758823037147522, + "num_tokens": 620327341.0, + "step": 23968 + }, + { + "epoch": 2.632220513946848, + "grad_norm": 1.954780101776123, + "learning_rate": 5e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7556185722351074, + "num_tokens": 620358304.0, + "step": 23969 + }, + { + "epoch": 2.6323303316494617, + "grad_norm": 2.0541930198669434, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7613548040390015, + "num_tokens": 620386273.0, + "step": 23970 + }, + { + "epoch": 2.6324401493520755, + "grad_norm": 2.416267156600952, + "learning_rate": 5e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7765337228775024, + "num_tokens": 620408112.0, + "step": 23971 + }, + { + "epoch": 2.6325499670546892, + "grad_norm": 2.1283600330352783, + "learning_rate": 5e-06, + "loss": 0.5876, + "mean_token_accuracy": 0.8016409873962402, + "num_tokens": 620428932.0, + "step": 23972 + }, + { + "epoch": 2.632659784757303, + "grad_norm": 2.651400327682495, + "learning_rate": 5e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7757084369659424, + "num_tokens": 620448741.0, + "step": 23973 + }, + { + "epoch": 2.6327696024599163, + "grad_norm": 2.1882050037384033, + "learning_rate": 5e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.765492856502533, + "num_tokens": 620470579.0, + "step": 23974 + }, + { + "epoch": 2.63287942016253, + "grad_norm": 2.0240583419799805, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7426375150680542, + "num_tokens": 620503549.0, + "step": 23975 + }, + { + "epoch": 2.632989237865144, + "grad_norm": 1.8971738815307617, + "learning_rate": 5e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7725715637207031, + "num_tokens": 620532711.0, + "step": 23976 + }, + { + "epoch": 2.6330990555677576, + "grad_norm": 2.0752267837524414, + "learning_rate": 5e-06, + "loss": 0.621, + "mean_token_accuracy": 0.7859801054000854, + "num_tokens": 620556273.0, + "step": 23977 + }, + { + "epoch": 2.6332088732703713, + "grad_norm": 2.1724393367767334, + "learning_rate": 5e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.7814738154411316, + "num_tokens": 620578174.0, + "step": 23978 + }, + { + "epoch": 2.6333186909729847, + "grad_norm": 2.04189133644104, + "learning_rate": 5e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7608632445335388, + "num_tokens": 620606334.0, + "step": 23979 + }, + { + "epoch": 2.6334285086755984, + "grad_norm": 2.266005754470825, + "learning_rate": 5e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7715015411376953, + "num_tokens": 620628488.0, + "step": 23980 + }, + { + "epoch": 2.633538326378212, + "grad_norm": 2.129549980163574, + "learning_rate": 5e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.8004199266433716, + "num_tokens": 620652028.0, + "step": 23981 + }, + { + "epoch": 2.633648144080826, + "grad_norm": 2.0158650875091553, + "learning_rate": 5e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.7647415995597839, + "num_tokens": 620679271.0, + "step": 23982 + }, + { + "epoch": 2.6337579617834397, + "grad_norm": 2.2219398021698, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7690513134002686, + "num_tokens": 620702023.0, + "step": 23983 + }, + { + "epoch": 2.633867779486053, + "grad_norm": 1.9405128955841064, + "learning_rate": 5e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7526715993881226, + "num_tokens": 620730817.0, + "step": 23984 + }, + { + "epoch": 2.6339775971886668, + "grad_norm": 1.9817010164260864, + "learning_rate": 5e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7493425011634827, + "num_tokens": 620758495.0, + "step": 23985 + }, + { + "epoch": 2.6340874148912805, + "grad_norm": 2.037928819656372, + "learning_rate": 5e-06, + "loss": 0.6425, + "mean_token_accuracy": 0.7811068296432495, + "num_tokens": 620782121.0, + "step": 23986 + }, + { + "epoch": 2.6341972325938943, + "grad_norm": 2.1543080806732178, + "learning_rate": 5e-06, + "loss": 0.6843, + "mean_token_accuracy": 0.7700151205062866, + "num_tokens": 620804708.0, + "step": 23987 + }, + { + "epoch": 2.634307050296508, + "grad_norm": 2.0273053646087646, + "learning_rate": 5e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7542012929916382, + "num_tokens": 620832245.0, + "step": 23988 + }, + { + "epoch": 2.6344168679991213, + "grad_norm": 1.8624310493469238, + "learning_rate": 5e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.777696967124939, + "num_tokens": 620861639.0, + "step": 23989 + }, + { + "epoch": 2.634526685701735, + "grad_norm": 2.2286033630371094, + "learning_rate": 5e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7627255916595459, + "num_tokens": 620884520.0, + "step": 23990 + }, + { + "epoch": 2.634636503404349, + "grad_norm": 2.2503745555877686, + "learning_rate": 5e-06, + "loss": 0.6576, + "mean_token_accuracy": 0.7814586162567139, + "num_tokens": 620906732.0, + "step": 23991 + }, + { + "epoch": 2.6347463211069626, + "grad_norm": 1.925234079360962, + "learning_rate": 5e-06, + "loss": 0.7429, + "mean_token_accuracy": 0.7546672224998474, + "num_tokens": 620936707.0, + "step": 23992 + }, + { + "epoch": 2.6348561388095764, + "grad_norm": 2.1290276050567627, + "learning_rate": 5e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7616592049598694, + "num_tokens": 620963910.0, + "step": 23993 + }, + { + "epoch": 2.6349659565121897, + "grad_norm": 2.1684250831604004, + "learning_rate": 5e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7213841676712036, + "num_tokens": 620991864.0, + "step": 23994 + }, + { + "epoch": 2.6350757742148034, + "grad_norm": 1.997627854347229, + "learning_rate": 5e-06, + "loss": 0.677, + "mean_token_accuracy": 0.7788498401641846, + "num_tokens": 621017673.0, + "step": 23995 + }, + { + "epoch": 2.635185591917417, + "grad_norm": 2.1397736072540283, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7626875638961792, + "num_tokens": 621041729.0, + "step": 23996 + }, + { + "epoch": 2.6352954096200305, + "grad_norm": 2.1751458644866943, + "learning_rate": 5e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7655194997787476, + "num_tokens": 621066682.0, + "step": 23997 + }, + { + "epoch": 2.6354052273226443, + "grad_norm": 2.1272695064544678, + "learning_rate": 5e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.780250072479248, + "num_tokens": 621091532.0, + "step": 23998 + }, + { + "epoch": 2.635515045025258, + "grad_norm": 2.1730244159698486, + "learning_rate": 5e-06, + "loss": 0.6765, + "mean_token_accuracy": 0.779931902885437, + "num_tokens": 621114323.0, + "step": 23999 + }, + { + "epoch": 2.6356248627278718, + "grad_norm": 2.163209915161133, + "learning_rate": 5e-06, + "loss": 0.6641, + "mean_token_accuracy": 0.7742995023727417, + "num_tokens": 621138603.0, + "step": 24000 + }, + { + "epoch": 2.6357346804304855, + "grad_norm": 2.304556369781494, + "learning_rate": 5e-06, + "loss": 0.711, + "mean_token_accuracy": 0.7681779861450195, + "num_tokens": 621160145.0, + "step": 24001 + }, + { + "epoch": 2.635844498133099, + "grad_norm": 2.1380326747894287, + "learning_rate": 5e-06, + "loss": 0.6707, + "mean_token_accuracy": 0.7816618084907532, + "num_tokens": 621182902.0, + "step": 24002 + }, + { + "epoch": 2.6359543158357126, + "grad_norm": 2.082411050796509, + "learning_rate": 5e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7540372610092163, + "num_tokens": 621209234.0, + "step": 24003 + }, + { + "epoch": 2.6360641335383264, + "grad_norm": 2.0208020210266113, + "learning_rate": 5e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7632129192352295, + "num_tokens": 621235519.0, + "step": 24004 + }, + { + "epoch": 2.63617395124094, + "grad_norm": 2.0975687503814697, + "learning_rate": 5e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7749310731887817, + "num_tokens": 621258891.0, + "step": 24005 + }, + { + "epoch": 2.636283768943554, + "grad_norm": 1.975152611732483, + "learning_rate": 5e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7532353401184082, + "num_tokens": 621291358.0, + "step": 24006 + }, + { + "epoch": 2.636393586646167, + "grad_norm": 2.0169448852539062, + "learning_rate": 5e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7608926296234131, + "num_tokens": 621320189.0, + "step": 24007 + }, + { + "epoch": 2.636503404348781, + "grad_norm": 2.091573476791382, + "learning_rate": 5e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7469111680984497, + "num_tokens": 621347024.0, + "step": 24008 + }, + { + "epoch": 2.6366132220513947, + "grad_norm": 2.039767026901245, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.759117603302002, + "num_tokens": 621375116.0, + "step": 24009 + }, + { + "epoch": 2.6367230397540085, + "grad_norm": 2.4467551708221436, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7707387208938599, + "num_tokens": 621395537.0, + "step": 24010 + }, + { + "epoch": 2.636832857456622, + "grad_norm": 2.5460355281829834, + "learning_rate": 5e-06, + "loss": 0.6573, + "mean_token_accuracy": 0.7830206155776978, + "num_tokens": 621414675.0, + "step": 24011 + }, + { + "epoch": 2.6369426751592355, + "grad_norm": 2.5348010063171387, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7753854393959045, + "num_tokens": 621433476.0, + "step": 24012 + }, + { + "epoch": 2.6370524928618493, + "grad_norm": 2.0669140815734863, + "learning_rate": 5e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7707800269126892, + "num_tokens": 621458400.0, + "step": 24013 + }, + { + "epoch": 2.637162310564463, + "grad_norm": 2.064739465713501, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.755162239074707, + "num_tokens": 621485806.0, + "step": 24014 + }, + { + "epoch": 2.637272128267077, + "grad_norm": 2.26509165763855, + "learning_rate": 5e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.769751787185669, + "num_tokens": 621508273.0, + "step": 24015 + }, + { + "epoch": 2.6373819459696906, + "grad_norm": 2.721468925476074, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7608257532119751, + "num_tokens": 621526305.0, + "step": 24016 + }, + { + "epoch": 2.637491763672304, + "grad_norm": 1.8199912309646606, + "learning_rate": 5e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7721627950668335, + "num_tokens": 621560195.0, + "step": 24017 + }, + { + "epoch": 2.6376015813749176, + "grad_norm": 2.153672456741333, + "learning_rate": 5e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7614635825157166, + "num_tokens": 621585040.0, + "step": 24018 + }, + { + "epoch": 2.6377113990775314, + "grad_norm": 1.9070568084716797, + "learning_rate": 5e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7705564498901367, + "num_tokens": 621612512.0, + "step": 24019 + }, + { + "epoch": 2.6378212167801447, + "grad_norm": 2.2603085041046143, + "learning_rate": 5e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7504773139953613, + "num_tokens": 621635212.0, + "step": 24020 + }, + { + "epoch": 2.637931034482759, + "grad_norm": 2.2748782634735107, + "learning_rate": 5e-06, + "loss": 0.704, + "mean_token_accuracy": 0.767951250076294, + "num_tokens": 621657184.0, + "step": 24021 + }, + { + "epoch": 2.638040852185372, + "grad_norm": 2.1175694465637207, + "learning_rate": 5e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7474896907806396, + "num_tokens": 621684203.0, + "step": 24022 + }, + { + "epoch": 2.638150669887986, + "grad_norm": 1.9406914710998535, + "learning_rate": 5e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7349140048027039, + "num_tokens": 621714367.0, + "step": 24023 + }, + { + "epoch": 2.6382604875905997, + "grad_norm": 1.8903162479400635, + "learning_rate": 5e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7617243528366089, + "num_tokens": 621742656.0, + "step": 24024 + }, + { + "epoch": 2.638370305293213, + "grad_norm": 1.907459020614624, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7516456246376038, + "num_tokens": 621771565.0, + "step": 24025 + }, + { + "epoch": 2.638480122995827, + "grad_norm": 2.0198707580566406, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7573277950286865, + "num_tokens": 621798868.0, + "step": 24026 + }, + { + "epoch": 2.6385899406984406, + "grad_norm": 1.9935684204101562, + "learning_rate": 5e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7650262713432312, + "num_tokens": 621827008.0, + "step": 24027 + }, + { + "epoch": 2.6386997584010543, + "grad_norm": 1.8287503719329834, + "learning_rate": 5e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7698267698287964, + "num_tokens": 621859300.0, + "step": 24028 + }, + { + "epoch": 2.638809576103668, + "grad_norm": 2.1969757080078125, + "learning_rate": 5e-06, + "loss": 0.668, + "mean_token_accuracy": 0.7801251411437988, + "num_tokens": 621882339.0, + "step": 24029 + }, + { + "epoch": 2.6389193938062814, + "grad_norm": 2.4519195556640625, + "learning_rate": 5e-06, + "loss": 0.6721, + "mean_token_accuracy": 0.7777026891708374, + "num_tokens": 621902764.0, + "step": 24030 + }, + { + "epoch": 2.639029211508895, + "grad_norm": 2.1980884075164795, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7614443898200989, + "num_tokens": 621925662.0, + "step": 24031 + }, + { + "epoch": 2.639139029211509, + "grad_norm": 1.9861491918563843, + "learning_rate": 5e-06, + "loss": 0.6459, + "mean_token_accuracy": 0.7841486930847168, + "num_tokens": 621948384.0, + "step": 24032 + }, + { + "epoch": 2.6392488469141226, + "grad_norm": 2.062239170074463, + "learning_rate": 5e-06, + "loss": 0.6662, + "mean_token_accuracy": 0.7836888432502747, + "num_tokens": 621975408.0, + "step": 24033 + }, + { + "epoch": 2.6393586646167364, + "grad_norm": 2.459359645843506, + "learning_rate": 5e-06, + "loss": 0.6306, + "mean_token_accuracy": 0.79165118932724, + "num_tokens": 621994862.0, + "step": 24034 + }, + { + "epoch": 2.6394684823193497, + "grad_norm": 2.3133418560028076, + "learning_rate": 5e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7567785978317261, + "num_tokens": 622017478.0, + "step": 24035 + }, + { + "epoch": 2.6395783000219635, + "grad_norm": 2.0882315635681152, + "learning_rate": 5e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.7727993726730347, + "num_tokens": 622040947.0, + "step": 24036 + }, + { + "epoch": 2.6396881177245772, + "grad_norm": 2.207076072692871, + "learning_rate": 5e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.769752562046051, + "num_tokens": 622064764.0, + "step": 24037 + }, + { + "epoch": 2.639797935427191, + "grad_norm": 2.14373517036438, + "learning_rate": 5e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.764518678188324, + "num_tokens": 622090816.0, + "step": 24038 + }, + { + "epoch": 2.6399077531298047, + "grad_norm": 2.021350622177124, + "learning_rate": 5e-06, + "loss": 0.7875, + "mean_token_accuracy": 0.7423433065414429, + "num_tokens": 622120861.0, + "step": 24039 + }, + { + "epoch": 2.640017570832418, + "grad_norm": 2.365583896636963, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7581369876861572, + "num_tokens": 622143093.0, + "step": 24040 + }, + { + "epoch": 2.640127388535032, + "grad_norm": 2.215294599533081, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7629068493843079, + "num_tokens": 622166872.0, + "step": 24041 + }, + { + "epoch": 2.6402372062376456, + "grad_norm": 2.045787811279297, + "learning_rate": 5e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7395017147064209, + "num_tokens": 622195108.0, + "step": 24042 + }, + { + "epoch": 2.6403470239402593, + "grad_norm": 2.3084301948547363, + "learning_rate": 5e-06, + "loss": 0.6598, + "mean_token_accuracy": 0.781546413898468, + "num_tokens": 622217260.0, + "step": 24043 + }, + { + "epoch": 2.640456841642873, + "grad_norm": 1.9844753742218018, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.769179105758667, + "num_tokens": 622244473.0, + "step": 24044 + }, + { + "epoch": 2.6405666593454864, + "grad_norm": 2.1326799392700195, + "learning_rate": 5e-06, + "loss": 0.6433, + "mean_token_accuracy": 0.782251238822937, + "num_tokens": 622265791.0, + "step": 24045 + }, + { + "epoch": 2.6406764770481, + "grad_norm": 2.079335927963257, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7591732740402222, + "num_tokens": 622292061.0, + "step": 24046 + }, + { + "epoch": 2.640786294750714, + "grad_norm": 2.0523464679718018, + "learning_rate": 5e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.7804033756256104, + "num_tokens": 622318428.0, + "step": 24047 + }, + { + "epoch": 2.6408961124533272, + "grad_norm": 2.209263324737549, + "learning_rate": 5e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7642858028411865, + "num_tokens": 622343919.0, + "step": 24048 + }, + { + "epoch": 2.641005930155941, + "grad_norm": 2.13809871673584, + "learning_rate": 5e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7445486783981323, + "num_tokens": 622371380.0, + "step": 24049 + }, + { + "epoch": 2.6411157478585547, + "grad_norm": 1.7651818990707397, + "learning_rate": 5e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7600347399711609, + "num_tokens": 622403880.0, + "step": 24050 + }, + { + "epoch": 2.6412255655611685, + "grad_norm": 1.962174654006958, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.759550154209137, + "num_tokens": 622429721.0, + "step": 24051 + }, + { + "epoch": 2.6413353832637823, + "grad_norm": 2.179469108581543, + "learning_rate": 5e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.786454975605011, + "num_tokens": 622451126.0, + "step": 24052 + }, + { + "epoch": 2.6414452009663956, + "grad_norm": 2.0405290126800537, + "learning_rate": 5e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7522530555725098, + "num_tokens": 622481885.0, + "step": 24053 + }, + { + "epoch": 2.6415550186690093, + "grad_norm": 2.7057204246520996, + "learning_rate": 5e-06, + "loss": 0.5889, + "mean_token_accuracy": 0.7999260425567627, + "num_tokens": 622498337.0, + "step": 24054 + }, + { + "epoch": 2.641664836371623, + "grad_norm": 2.115812301635742, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.755297839641571, + "num_tokens": 622522747.0, + "step": 24055 + }, + { + "epoch": 2.641774654074237, + "grad_norm": 2.2611215114593506, + "learning_rate": 5e-06, + "loss": 0.7015, + "mean_token_accuracy": 0.7677834033966064, + "num_tokens": 622544376.0, + "step": 24056 + }, + { + "epoch": 2.6418844717768506, + "grad_norm": 1.8169816732406616, + "learning_rate": 5e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7377114295959473, + "num_tokens": 622581757.0, + "step": 24057 + }, + { + "epoch": 2.641994289479464, + "grad_norm": 2.476395845413208, + "learning_rate": 5e-06, + "loss": 0.6491, + "mean_token_accuracy": 0.7854903936386108, + "num_tokens": 622601323.0, + "step": 24058 + }, + { + "epoch": 2.6421041071820777, + "grad_norm": 1.823501706123352, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7577272057533264, + "num_tokens": 622637465.0, + "step": 24059 + }, + { + "epoch": 2.6422139248846914, + "grad_norm": 2.448087215423584, + "learning_rate": 5e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7533143162727356, + "num_tokens": 622658082.0, + "step": 24060 + }, + { + "epoch": 2.642323742587305, + "grad_norm": 2.3122031688690186, + "learning_rate": 5e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.768738865852356, + "num_tokens": 622679915.0, + "step": 24061 + }, + { + "epoch": 2.642433560289919, + "grad_norm": 2.222209930419922, + "learning_rate": 5e-06, + "loss": 0.6156, + "mean_token_accuracy": 0.7892467379570007, + "num_tokens": 622701348.0, + "step": 24062 + }, + { + "epoch": 2.6425433779925322, + "grad_norm": 2.1182515621185303, + "learning_rate": 5e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7778513431549072, + "num_tokens": 622728325.0, + "step": 24063 + }, + { + "epoch": 2.642653195695146, + "grad_norm": 2.201280117034912, + "learning_rate": 5e-06, + "loss": 0.7065, + "mean_token_accuracy": 0.7683301568031311, + "num_tokens": 622751555.0, + "step": 24064 + }, + { + "epoch": 2.6427630133977598, + "grad_norm": 2.1078543663024902, + "learning_rate": 5e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7416539192199707, + "num_tokens": 622777147.0, + "step": 24065 + }, + { + "epoch": 2.6428728311003735, + "grad_norm": 1.9135338068008423, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.754158616065979, + "num_tokens": 622805820.0, + "step": 24066 + }, + { + "epoch": 2.6429826488029873, + "grad_norm": 2.0235249996185303, + "learning_rate": 5e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.7477757930755615, + "num_tokens": 622833748.0, + "step": 24067 + }, + { + "epoch": 2.6430924665056006, + "grad_norm": 1.7399652004241943, + "learning_rate": 5e-06, + "loss": 0.692, + "mean_token_accuracy": 0.7708269953727722, + "num_tokens": 622867507.0, + "step": 24068 + }, + { + "epoch": 2.6432022842082143, + "grad_norm": 2.068974733352661, + "learning_rate": 5e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7600111961364746, + "num_tokens": 622894586.0, + "step": 24069 + }, + { + "epoch": 2.643312101910828, + "grad_norm": 2.0299439430236816, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7631869316101074, + "num_tokens": 622921736.0, + "step": 24070 + }, + { + "epoch": 2.6434219196134414, + "grad_norm": 1.8884351253509521, + "learning_rate": 5e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7527148723602295, + "num_tokens": 622954693.0, + "step": 24071 + }, + { + "epoch": 2.6435317373160556, + "grad_norm": 1.9325987100601196, + "learning_rate": 5e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.748829185962677, + "num_tokens": 622983479.0, + "step": 24072 + }, + { + "epoch": 2.643641555018669, + "grad_norm": 2.187814474105835, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7700595855712891, + "num_tokens": 623008455.0, + "step": 24073 + }, + { + "epoch": 2.6437513727212827, + "grad_norm": 2.4707531929016113, + "learning_rate": 5e-06, + "loss": 0.6038, + "mean_token_accuracy": 0.7918835878372192, + "num_tokens": 623029374.0, + "step": 24074 + }, + { + "epoch": 2.6438611904238964, + "grad_norm": 2.1694862842559814, + "learning_rate": 5e-06, + "loss": 0.7414, + "mean_token_accuracy": 0.7581891417503357, + "num_tokens": 623054927.0, + "step": 24075 + }, + { + "epoch": 2.6439710081265098, + "grad_norm": 1.999220609664917, + "learning_rate": 5e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.754624605178833, + "num_tokens": 623085398.0, + "step": 24076 + }, + { + "epoch": 2.6440808258291235, + "grad_norm": 1.8375288248062134, + "learning_rate": 5e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7293010354042053, + "num_tokens": 623117907.0, + "step": 24077 + }, + { + "epoch": 2.6441906435317373, + "grad_norm": 2.1786510944366455, + "learning_rate": 5e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.771863579750061, + "num_tokens": 623139896.0, + "step": 24078 + }, + { + "epoch": 2.644300461234351, + "grad_norm": 2.0472424030303955, + "learning_rate": 5e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7592525482177734, + "num_tokens": 623167755.0, + "step": 24079 + }, + { + "epoch": 2.644410278936965, + "grad_norm": 2.05576229095459, + "learning_rate": 5e-06, + "loss": 0.7, + "mean_token_accuracy": 0.7643382549285889, + "num_tokens": 623192676.0, + "step": 24080 + }, + { + "epoch": 2.644520096639578, + "grad_norm": 2.0905561447143555, + "learning_rate": 5e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7509394288063049, + "num_tokens": 623221082.0, + "step": 24081 + }, + { + "epoch": 2.644629914342192, + "grad_norm": 1.9936988353729248, + "learning_rate": 5e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7333162426948547, + "num_tokens": 623249752.0, + "step": 24082 + }, + { + "epoch": 2.6447397320448056, + "grad_norm": 1.9124391078948975, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7448612451553345, + "num_tokens": 623280427.0, + "step": 24083 + }, + { + "epoch": 2.6448495497474194, + "grad_norm": 2.2336533069610596, + "learning_rate": 5e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.76316899061203, + "num_tokens": 623303088.0, + "step": 24084 + }, + { + "epoch": 2.644959367450033, + "grad_norm": 2.0545356273651123, + "learning_rate": 5e-06, + "loss": 0.663, + "mean_token_accuracy": 0.7806039452552795, + "num_tokens": 623326471.0, + "step": 24085 + }, + { + "epoch": 2.6450691851526464, + "grad_norm": 2.008690595626831, + "learning_rate": 5e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7574121952056885, + "num_tokens": 623350296.0, + "step": 24086 + }, + { + "epoch": 2.64517900285526, + "grad_norm": 2.213418483734131, + "learning_rate": 5e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7636416554450989, + "num_tokens": 623374528.0, + "step": 24087 + }, + { + "epoch": 2.645288820557874, + "grad_norm": 2.0484018325805664, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7689396142959595, + "num_tokens": 623400134.0, + "step": 24088 + }, + { + "epoch": 2.6453986382604877, + "grad_norm": 2.2788243293762207, + "learning_rate": 5e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.7841789722442627, + "num_tokens": 623421225.0, + "step": 24089 + }, + { + "epoch": 2.6455084559631015, + "grad_norm": 1.9571845531463623, + "learning_rate": 5e-06, + "loss": 0.6984, + "mean_token_accuracy": 0.7678529024124146, + "num_tokens": 623449971.0, + "step": 24090 + }, + { + "epoch": 2.645618273665715, + "grad_norm": 2.0327816009521484, + "learning_rate": 5e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7601149678230286, + "num_tokens": 623477497.0, + "step": 24091 + }, + { + "epoch": 2.6457280913683285, + "grad_norm": 2.0708627700805664, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7621814012527466, + "num_tokens": 623502466.0, + "step": 24092 + }, + { + "epoch": 2.6458379090709423, + "grad_norm": 1.7383966445922852, + "learning_rate": 5e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.7787707448005676, + "num_tokens": 623533707.0, + "step": 24093 + }, + { + "epoch": 2.645947726773556, + "grad_norm": 2.301072359085083, + "learning_rate": 5e-06, + "loss": 0.6603, + "mean_token_accuracy": 0.7793748378753662, + "num_tokens": 623554853.0, + "step": 24094 + }, + { + "epoch": 2.64605754447617, + "grad_norm": 2.1135475635528564, + "learning_rate": 5e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7676730751991272, + "num_tokens": 623577816.0, + "step": 24095 + }, + { + "epoch": 2.646167362178783, + "grad_norm": 2.1583855152130127, + "learning_rate": 5e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7711019515991211, + "num_tokens": 623604129.0, + "step": 24096 + }, + { + "epoch": 2.646277179881397, + "grad_norm": 2.4856114387512207, + "learning_rate": 5e-06, + "loss": 0.6269, + "mean_token_accuracy": 0.7949498891830444, + "num_tokens": 623623354.0, + "step": 24097 + }, + { + "epoch": 2.6463869975840106, + "grad_norm": 2.3247640132904053, + "learning_rate": 5e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7692238092422485, + "num_tokens": 623646740.0, + "step": 24098 + }, + { + "epoch": 2.646496815286624, + "grad_norm": 2.0413336753845215, + "learning_rate": 5e-06, + "loss": 0.708, + "mean_token_accuracy": 0.7611098289489746, + "num_tokens": 623676655.0, + "step": 24099 + }, + { + "epoch": 2.6466066329892377, + "grad_norm": 2.188767910003662, + "learning_rate": 5e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.760811984539032, + "num_tokens": 623697933.0, + "step": 24100 + }, + { + "epoch": 2.6467164506918515, + "grad_norm": 2.2947564125061035, + "learning_rate": 5e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7609270811080933, + "num_tokens": 623720381.0, + "step": 24101 + }, + { + "epoch": 2.646826268394465, + "grad_norm": 2.0975265502929688, + "learning_rate": 5e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7567716836929321, + "num_tokens": 623745442.0, + "step": 24102 + }, + { + "epoch": 2.646936086097079, + "grad_norm": 2.3491687774658203, + "learning_rate": 5e-06, + "loss": 0.6509, + "mean_token_accuracy": 0.7818164825439453, + "num_tokens": 623767757.0, + "step": 24103 + }, + { + "epoch": 2.6470459037996923, + "grad_norm": 2.1210014820098877, + "learning_rate": 5e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7599263787269592, + "num_tokens": 623794304.0, + "step": 24104 + }, + { + "epoch": 2.647155721502306, + "grad_norm": 1.945926547050476, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7481826543807983, + "num_tokens": 623824358.0, + "step": 24105 + }, + { + "epoch": 2.64726553920492, + "grad_norm": 2.0805065631866455, + "learning_rate": 5e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.7788863182067871, + "num_tokens": 623848484.0, + "step": 24106 + }, + { + "epoch": 2.6473753569075336, + "grad_norm": 2.214463233947754, + "learning_rate": 5e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.7738766670227051, + "num_tokens": 623871274.0, + "step": 24107 + }, + { + "epoch": 2.6474851746101473, + "grad_norm": 2.1932950019836426, + "learning_rate": 5e-06, + "loss": 0.6391, + "mean_token_accuracy": 0.7831755876541138, + "num_tokens": 623894653.0, + "step": 24108 + }, + { + "epoch": 2.6475949923127606, + "grad_norm": 2.146756410598755, + "learning_rate": 5e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.7581932544708252, + "num_tokens": 623918633.0, + "step": 24109 + }, + { + "epoch": 2.6477048100153744, + "grad_norm": 2.076883554458618, + "learning_rate": 5e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7612071633338928, + "num_tokens": 623944797.0, + "step": 24110 + }, + { + "epoch": 2.647814627717988, + "grad_norm": 2.265925168991089, + "learning_rate": 5e-06, + "loss": 0.6669, + "mean_token_accuracy": 0.7843029499053955, + "num_tokens": 623965594.0, + "step": 24111 + }, + { + "epoch": 2.647924445420602, + "grad_norm": 1.8566350936889648, + "learning_rate": 5e-06, + "loss": 0.6552, + "mean_token_accuracy": 0.7879213690757751, + "num_tokens": 623995056.0, + "step": 24112 + }, + { + "epoch": 2.6480342631232157, + "grad_norm": 1.8876805305480957, + "learning_rate": 5e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7565834522247314, + "num_tokens": 624024709.0, + "step": 24113 + }, + { + "epoch": 2.648144080825829, + "grad_norm": 1.8912166357040405, + "learning_rate": 5e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7740329504013062, + "num_tokens": 624055465.0, + "step": 24114 + }, + { + "epoch": 2.6482538985284427, + "grad_norm": 1.9280943870544434, + "learning_rate": 5e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.7971079349517822, + "num_tokens": 624081518.0, + "step": 24115 + }, + { + "epoch": 2.6483637162310565, + "grad_norm": 1.970916509628296, + "learning_rate": 5e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.746568500995636, + "num_tokens": 624109716.0, + "step": 24116 + }, + { + "epoch": 2.6484735339336702, + "grad_norm": 2.1785783767700195, + "learning_rate": 5e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7384829521179199, + "num_tokens": 624137834.0, + "step": 24117 + }, + { + "epoch": 2.648583351636284, + "grad_norm": 2.1960787773132324, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7605630159378052, + "num_tokens": 624161784.0, + "step": 24118 + }, + { + "epoch": 2.6486931693388973, + "grad_norm": 2.3731181621551514, + "learning_rate": 5e-06, + "loss": 0.7007, + "mean_token_accuracy": 0.7721397876739502, + "num_tokens": 624181444.0, + "step": 24119 + }, + { + "epoch": 2.648802987041511, + "grad_norm": 2.109066963195801, + "learning_rate": 5e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7483670711517334, + "num_tokens": 624208077.0, + "step": 24120 + }, + { + "epoch": 2.648912804744125, + "grad_norm": 2.1892521381378174, + "learning_rate": 5e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7695612907409668, + "num_tokens": 624236698.0, + "step": 24121 + }, + { + "epoch": 2.649022622446738, + "grad_norm": 2.285601854324341, + "learning_rate": 5e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7616285085678101, + "num_tokens": 624260600.0, + "step": 24122 + }, + { + "epoch": 2.6491324401493523, + "grad_norm": 2.1051201820373535, + "learning_rate": 5e-06, + "loss": 0.68, + "mean_token_accuracy": 0.7726491689682007, + "num_tokens": 624283662.0, + "step": 24123 + }, + { + "epoch": 2.6492422578519657, + "grad_norm": 2.018367290496826, + "learning_rate": 5e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7538185119628906, + "num_tokens": 624310668.0, + "step": 24124 + }, + { + "epoch": 2.6493520755545794, + "grad_norm": 2.0764780044555664, + "learning_rate": 5e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7658132314682007, + "num_tokens": 624336102.0, + "step": 24125 + }, + { + "epoch": 2.649461893257193, + "grad_norm": 1.997487187385559, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7844704389572144, + "num_tokens": 624363786.0, + "step": 24126 + }, + { + "epoch": 2.6495717109598065, + "grad_norm": 2.276386260986328, + "learning_rate": 5e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.7836477756500244, + "num_tokens": 624384697.0, + "step": 24127 + }, + { + "epoch": 2.6496815286624202, + "grad_norm": 2.160900115966797, + "learning_rate": 5e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7742782235145569, + "num_tokens": 624407692.0, + "step": 24128 + }, + { + "epoch": 2.649791346365034, + "grad_norm": 2.1143763065338135, + "learning_rate": 5e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7554070949554443, + "num_tokens": 624432712.0, + "step": 24129 + }, + { + "epoch": 2.6499011640676478, + "grad_norm": 1.9723726511001587, + "learning_rate": 5e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.760091245174408, + "num_tokens": 624460193.0, + "step": 24130 + }, + { + "epoch": 2.6500109817702615, + "grad_norm": 1.98957359790802, + "learning_rate": 5e-06, + "loss": 0.7243, + "mean_token_accuracy": 0.7651762962341309, + "num_tokens": 624487274.0, + "step": 24131 + }, + { + "epoch": 2.650120799472875, + "grad_norm": 2.1469807624816895, + "learning_rate": 5e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7630212903022766, + "num_tokens": 624512758.0, + "step": 24132 + }, + { + "epoch": 2.6502306171754886, + "grad_norm": 2.0120654106140137, + "learning_rate": 5e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.7414187788963318, + "num_tokens": 624541750.0, + "step": 24133 + }, + { + "epoch": 2.6503404348781023, + "grad_norm": 2.028020143508911, + "learning_rate": 5e-06, + "loss": 0.6758, + "mean_token_accuracy": 0.7762700319290161, + "num_tokens": 624568092.0, + "step": 24134 + }, + { + "epoch": 2.650450252580716, + "grad_norm": 2.128282070159912, + "learning_rate": 5e-06, + "loss": 0.7008, + "mean_token_accuracy": 0.7709048390388489, + "num_tokens": 624592783.0, + "step": 24135 + }, + { + "epoch": 2.65056007028333, + "grad_norm": 1.972231388092041, + "learning_rate": 5e-06, + "loss": 0.7346, + "mean_token_accuracy": 0.7633563876152039, + "num_tokens": 624622053.0, + "step": 24136 + }, + { + "epoch": 2.650669887985943, + "grad_norm": 2.3142902851104736, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7535320520401001, + "num_tokens": 624645389.0, + "step": 24137 + }, + { + "epoch": 2.650779705688557, + "grad_norm": 2.0278232097625732, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7342589497566223, + "num_tokens": 624678018.0, + "step": 24138 + }, + { + "epoch": 2.6508895233911707, + "grad_norm": 2.130662202835083, + "learning_rate": 5e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7647032737731934, + "num_tokens": 624704365.0, + "step": 24139 + }, + { + "epoch": 2.6509993410937844, + "grad_norm": 1.9652900695800781, + "learning_rate": 5e-06, + "loss": 0.7666, + "mean_token_accuracy": 0.7468054294586182, + "num_tokens": 624733821.0, + "step": 24140 + }, + { + "epoch": 2.651109158796398, + "grad_norm": 1.7951024770736694, + "learning_rate": 5e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7584021091461182, + "num_tokens": 624768422.0, + "step": 24141 + }, + { + "epoch": 2.6512189764990115, + "grad_norm": 2.426506996154785, + "learning_rate": 5e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7425498962402344, + "num_tokens": 624793290.0, + "step": 24142 + }, + { + "epoch": 2.6513287942016253, + "grad_norm": 2.3606786727905273, + "learning_rate": 5e-06, + "loss": 0.6729, + "mean_token_accuracy": 0.7716213464736938, + "num_tokens": 624814692.0, + "step": 24143 + }, + { + "epoch": 2.651438611904239, + "grad_norm": 1.811922550201416, + "learning_rate": 5e-06, + "loss": 0.7498, + "mean_token_accuracy": 0.7507364153862, + "num_tokens": 624847141.0, + "step": 24144 + }, + { + "epoch": 2.6515484296068528, + "grad_norm": 1.8535640239715576, + "learning_rate": 5e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7577289342880249, + "num_tokens": 624877002.0, + "step": 24145 + }, + { + "epoch": 2.6516582473094665, + "grad_norm": 1.925076961517334, + "learning_rate": 5e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.7559307813644409, + "num_tokens": 624905830.0, + "step": 24146 + }, + { + "epoch": 2.65176806501208, + "grad_norm": 2.0774621963500977, + "learning_rate": 5e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7577264308929443, + "num_tokens": 624933586.0, + "step": 24147 + }, + { + "epoch": 2.6518778827146936, + "grad_norm": 2.0809459686279297, + "learning_rate": 5e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.7778064012527466, + "num_tokens": 624957541.0, + "step": 24148 + }, + { + "epoch": 2.6519877004173074, + "grad_norm": 1.9589303731918335, + "learning_rate": 5e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7550863027572632, + "num_tokens": 624985574.0, + "step": 24149 + }, + { + "epoch": 2.6520975181199207, + "grad_norm": 1.8183225393295288, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7712821960449219, + "num_tokens": 625014859.0, + "step": 24150 + }, + { + "epoch": 2.6522073358225344, + "grad_norm": 2.096773624420166, + "learning_rate": 5e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.7506202459335327, + "num_tokens": 625040411.0, + "step": 24151 + }, + { + "epoch": 2.652317153525148, + "grad_norm": 1.8364366292953491, + "learning_rate": 5e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.7780226469039917, + "num_tokens": 625070465.0, + "step": 24152 + }, + { + "epoch": 2.652426971227762, + "grad_norm": 2.0846054553985596, + "learning_rate": 5e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.7825191020965576, + "num_tokens": 625094168.0, + "step": 24153 + }, + { + "epoch": 2.6525367889303757, + "grad_norm": 2.0020391941070557, + "learning_rate": 5e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7510200142860413, + "num_tokens": 625120716.0, + "step": 24154 + }, + { + "epoch": 2.652646606632989, + "grad_norm": 2.0357956886291504, + "learning_rate": 5e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7517442107200623, + "num_tokens": 625147121.0, + "step": 24155 + }, + { + "epoch": 2.6527564243356028, + "grad_norm": 1.9763338565826416, + "learning_rate": 5e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7767072916030884, + "num_tokens": 625173537.0, + "step": 24156 + }, + { + "epoch": 2.6528662420382165, + "grad_norm": 2.1397712230682373, + "learning_rate": 5e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7711443305015564, + "num_tokens": 625200785.0, + "step": 24157 + }, + { + "epoch": 2.6529760597408303, + "grad_norm": 2.106602668762207, + "learning_rate": 5e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.7857690453529358, + "num_tokens": 625225195.0, + "step": 24158 + }, + { + "epoch": 2.653085877443444, + "grad_norm": 2.1854889392852783, + "learning_rate": 5e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7550687789916992, + "num_tokens": 625248632.0, + "step": 24159 + }, + { + "epoch": 2.6531956951460574, + "grad_norm": 2.1761906147003174, + "learning_rate": 5e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7493932247161865, + "num_tokens": 625274565.0, + "step": 24160 + }, + { + "epoch": 2.653305512848671, + "grad_norm": 1.8686645030975342, + "learning_rate": 5e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7652637958526611, + "num_tokens": 625307512.0, + "step": 24161 + }, + { + "epoch": 2.653415330551285, + "grad_norm": 2.0232796669006348, + "learning_rate": 5e-06, + "loss": 0.6412, + "mean_token_accuracy": 0.7819920778274536, + "num_tokens": 625334084.0, + "step": 24162 + }, + { + "epoch": 2.6535251482538986, + "grad_norm": 2.3682479858398438, + "learning_rate": 5e-06, + "loss": 0.621, + "mean_token_accuracy": 0.7932131886482239, + "num_tokens": 625354789.0, + "step": 24163 + }, + { + "epoch": 2.6536349659565124, + "grad_norm": 1.8489521741867065, + "learning_rate": 5e-06, + "loss": 0.684, + "mean_token_accuracy": 0.7812604904174805, + "num_tokens": 625385752.0, + "step": 24164 + }, + { + "epoch": 2.6537447836591257, + "grad_norm": 2.1208078861236572, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7516739368438721, + "num_tokens": 625410812.0, + "step": 24165 + }, + { + "epoch": 2.6538546013617395, + "grad_norm": 2.167860984802246, + "learning_rate": 5e-06, + "loss": 0.7622, + "mean_token_accuracy": 0.7648525238037109, + "num_tokens": 625435668.0, + "step": 24166 + }, + { + "epoch": 2.653964419064353, + "grad_norm": 2.0572800636291504, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7453596591949463, + "num_tokens": 625465680.0, + "step": 24167 + }, + { + "epoch": 2.654074236766967, + "grad_norm": 2.4298322200775146, + "learning_rate": 5e-06, + "loss": 0.6616, + "mean_token_accuracy": 0.7838073968887329, + "num_tokens": 625484050.0, + "step": 24168 + }, + { + "epoch": 2.6541840544695807, + "grad_norm": 2.1545560359954834, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.778779149055481, + "num_tokens": 625508046.0, + "step": 24169 + }, + { + "epoch": 2.654293872172194, + "grad_norm": 2.2909600734710693, + "learning_rate": 5e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7668418884277344, + "num_tokens": 625530243.0, + "step": 24170 + }, + { + "epoch": 2.654403689874808, + "grad_norm": 2.169149160385132, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7640602588653564, + "num_tokens": 625552349.0, + "step": 24171 + }, + { + "epoch": 2.6545135075774215, + "grad_norm": 2.0770370960235596, + "learning_rate": 5e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7650203704833984, + "num_tokens": 625575200.0, + "step": 24172 + }, + { + "epoch": 2.6546233252800353, + "grad_norm": 2.0417656898498535, + "learning_rate": 5e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.752548336982727, + "num_tokens": 625602094.0, + "step": 24173 + }, + { + "epoch": 2.654733142982649, + "grad_norm": 2.0198216438293457, + "learning_rate": 5e-06, + "loss": 0.6838, + "mean_token_accuracy": 0.7751393914222717, + "num_tokens": 625629563.0, + "step": 24174 + }, + { + "epoch": 2.6548429606852624, + "grad_norm": 2.3118691444396973, + "learning_rate": 5e-06, + "loss": 0.6654, + "mean_token_accuracy": 0.7833693027496338, + "num_tokens": 625650047.0, + "step": 24175 + }, + { + "epoch": 2.654952778387876, + "grad_norm": 2.4394288063049316, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.773688018321991, + "num_tokens": 625669137.0, + "step": 24176 + }, + { + "epoch": 2.65506259609049, + "grad_norm": 2.270461320877075, + "learning_rate": 5e-06, + "loss": 0.6501, + "mean_token_accuracy": 0.7831732034683228, + "num_tokens": 625689887.0, + "step": 24177 + }, + { + "epoch": 2.655172413793103, + "grad_norm": 2.185112476348877, + "learning_rate": 5e-06, + "loss": 0.6415, + "mean_token_accuracy": 0.7901273965835571, + "num_tokens": 625712459.0, + "step": 24178 + }, + { + "epoch": 2.655282231495717, + "grad_norm": 1.9950405359268188, + "learning_rate": 5e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.7483874559402466, + "num_tokens": 625741357.0, + "step": 24179 + }, + { + "epoch": 2.6553920491983307, + "grad_norm": 1.9983131885528564, + "learning_rate": 5e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7574623823165894, + "num_tokens": 625769722.0, + "step": 24180 + }, + { + "epoch": 2.6555018669009445, + "grad_norm": 2.3203179836273193, + "learning_rate": 5e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7706575393676758, + "num_tokens": 625792196.0, + "step": 24181 + }, + { + "epoch": 2.6556116846035582, + "grad_norm": 1.9033167362213135, + "learning_rate": 5e-06, + "loss": 0.6858, + "mean_token_accuracy": 0.7800113558769226, + "num_tokens": 625821252.0, + "step": 24182 + }, + { + "epoch": 2.6557215023061715, + "grad_norm": 2.26375150680542, + "learning_rate": 5e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.789315938949585, + "num_tokens": 625843616.0, + "step": 24183 + }, + { + "epoch": 2.6558313200087853, + "grad_norm": 1.9902831315994263, + "learning_rate": 5e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7328382730484009, + "num_tokens": 625873700.0, + "step": 24184 + }, + { + "epoch": 2.655941137711399, + "grad_norm": 2.2605419158935547, + "learning_rate": 5e-06, + "loss": 0.6494, + "mean_token_accuracy": 0.786270797252655, + "num_tokens": 625893126.0, + "step": 24185 + }, + { + "epoch": 2.656050955414013, + "grad_norm": 2.486494302749634, + "learning_rate": 5e-06, + "loss": 0.6678, + "mean_token_accuracy": 0.7803133726119995, + "num_tokens": 625911341.0, + "step": 24186 + }, + { + "epoch": 2.6561607731166266, + "grad_norm": 1.8523393869400024, + "learning_rate": 5e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.76129549741745, + "num_tokens": 625939935.0, + "step": 24187 + }, + { + "epoch": 2.65627059081924, + "grad_norm": 2.222141742706299, + "learning_rate": 5e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7671557664871216, + "num_tokens": 625963142.0, + "step": 24188 + }, + { + "epoch": 2.6563804085218536, + "grad_norm": 2.2430083751678467, + "learning_rate": 5e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7569758892059326, + "num_tokens": 625987517.0, + "step": 24189 + }, + { + "epoch": 2.6564902262244674, + "grad_norm": 2.0106124877929688, + "learning_rate": 5e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.7741016745567322, + "num_tokens": 626015444.0, + "step": 24190 + }, + { + "epoch": 2.656600043927081, + "grad_norm": 2.255939483642578, + "learning_rate": 5e-06, + "loss": 0.716, + "mean_token_accuracy": 0.770674467086792, + "num_tokens": 626039807.0, + "step": 24191 + }, + { + "epoch": 2.656709861629695, + "grad_norm": 1.9375882148742676, + "learning_rate": 5e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.774401843547821, + "num_tokens": 626066619.0, + "step": 24192 + }, + { + "epoch": 2.6568196793323082, + "grad_norm": 1.9108623266220093, + "learning_rate": 5e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.766714870929718, + "num_tokens": 626093325.0, + "step": 24193 + }, + { + "epoch": 2.656929497034922, + "grad_norm": 2.1743335723876953, + "learning_rate": 5e-06, + "loss": 0.6204, + "mean_token_accuracy": 0.7926276922225952, + "num_tokens": 626117115.0, + "step": 24194 + }, + { + "epoch": 2.6570393147375357, + "grad_norm": 2.052982807159424, + "learning_rate": 5e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7485947012901306, + "num_tokens": 626143065.0, + "step": 24195 + }, + { + "epoch": 2.6571491324401495, + "grad_norm": 1.9285715818405151, + "learning_rate": 5e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7720292210578918, + "num_tokens": 626172325.0, + "step": 24196 + }, + { + "epoch": 2.6572589501427633, + "grad_norm": 2.2148208618164062, + "learning_rate": 5e-06, + "loss": 0.627, + "mean_token_accuracy": 0.7862201929092407, + "num_tokens": 626194105.0, + "step": 24197 + }, + { + "epoch": 2.6573687678453766, + "grad_norm": 1.9840131998062134, + "learning_rate": 5e-06, + "loss": 0.6647, + "mean_token_accuracy": 0.7797276973724365, + "num_tokens": 626223506.0, + "step": 24198 + }, + { + "epoch": 2.6574785855479903, + "grad_norm": 2.0598175525665283, + "learning_rate": 5e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7593567371368408, + "num_tokens": 626250654.0, + "step": 24199 + }, + { + "epoch": 2.657588403250604, + "grad_norm": 2.0130057334899902, + "learning_rate": 5e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.7761064171791077, + "num_tokens": 626279214.0, + "step": 24200 + }, + { + "epoch": 2.6576982209532174, + "grad_norm": 1.9322121143341064, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.754357635974884, + "num_tokens": 626306470.0, + "step": 24201 + }, + { + "epoch": 2.6578080386558316, + "grad_norm": 2.1133639812469482, + "learning_rate": 5e-06, + "loss": 0.6638, + "mean_token_accuracy": 0.7799917459487915, + "num_tokens": 626328837.0, + "step": 24202 + }, + { + "epoch": 2.657917856358445, + "grad_norm": 2.1779539585113525, + "learning_rate": 5e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7541459798812866, + "num_tokens": 626356521.0, + "step": 24203 + }, + { + "epoch": 2.6580276740610587, + "grad_norm": 2.1328439712524414, + "learning_rate": 5e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7513600587844849, + "num_tokens": 626382759.0, + "step": 24204 + }, + { + "epoch": 2.6581374917636724, + "grad_norm": 1.963977336883545, + "learning_rate": 5e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.76856929063797, + "num_tokens": 626411009.0, + "step": 24205 + }, + { + "epoch": 2.6582473094662857, + "grad_norm": 2.0456063747406006, + "learning_rate": 5e-06, + "loss": 0.807, + "mean_token_accuracy": 0.74261873960495, + "num_tokens": 626440425.0, + "step": 24206 + }, + { + "epoch": 2.6583571271688995, + "grad_norm": 2.0039172172546387, + "learning_rate": 5e-06, + "loss": 0.6548, + "mean_token_accuracy": 0.78293776512146, + "num_tokens": 626464997.0, + "step": 24207 + }, + { + "epoch": 2.6584669448715132, + "grad_norm": 2.481722116470337, + "learning_rate": 5e-06, + "loss": 0.6225, + "mean_token_accuracy": 0.7928043603897095, + "num_tokens": 626485856.0, + "step": 24208 + }, + { + "epoch": 2.658576762574127, + "grad_norm": 2.088346004486084, + "learning_rate": 5e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7456519603729248, + "num_tokens": 626511571.0, + "step": 24209 + }, + { + "epoch": 2.6586865802767408, + "grad_norm": 2.3721442222595215, + "learning_rate": 5e-06, + "loss": 0.6454, + "mean_token_accuracy": 0.781584620475769, + "num_tokens": 626532358.0, + "step": 24210 + }, + { + "epoch": 2.658796397979354, + "grad_norm": 2.4802322387695312, + "learning_rate": 5e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7681739330291748, + "num_tokens": 626551896.0, + "step": 24211 + }, + { + "epoch": 2.658906215681968, + "grad_norm": 1.9844599962234497, + "learning_rate": 5e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7609832286834717, + "num_tokens": 626581263.0, + "step": 24212 + }, + { + "epoch": 2.6590160333845816, + "grad_norm": 2.0397019386291504, + "learning_rate": 5e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7467169761657715, + "num_tokens": 626609893.0, + "step": 24213 + }, + { + "epoch": 2.6591258510871953, + "grad_norm": 2.4043326377868652, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7754219770431519, + "num_tokens": 626632343.0, + "step": 24214 + }, + { + "epoch": 2.659235668789809, + "grad_norm": 2.104005813598633, + "learning_rate": 5e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7630409598350525, + "num_tokens": 626658893.0, + "step": 24215 + }, + { + "epoch": 2.6593454864924224, + "grad_norm": 2.054391384124756, + "learning_rate": 5e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7466316819190979, + "num_tokens": 626686247.0, + "step": 24216 + }, + { + "epoch": 2.659455304195036, + "grad_norm": 1.949316143989563, + "learning_rate": 5e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7689681053161621, + "num_tokens": 626713282.0, + "step": 24217 + }, + { + "epoch": 2.65956512189765, + "grad_norm": 2.1901984214782715, + "learning_rate": 5e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7565022110939026, + "num_tokens": 626741697.0, + "step": 24218 + }, + { + "epoch": 2.6596749396002637, + "grad_norm": 2.0537025928497314, + "learning_rate": 5e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7688091993331909, + "num_tokens": 626768640.0, + "step": 24219 + }, + { + "epoch": 2.6597847573028774, + "grad_norm": 2.157557725906372, + "learning_rate": 5e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7714579105377197, + "num_tokens": 626792246.0, + "step": 24220 + }, + { + "epoch": 2.6598945750054908, + "grad_norm": 1.9318513870239258, + "learning_rate": 5e-06, + "loss": 0.7226, + "mean_token_accuracy": 0.7662422060966492, + "num_tokens": 626819432.0, + "step": 24221 + }, + { + "epoch": 2.6600043927081045, + "grad_norm": 1.8733834028244019, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7630347013473511, + "num_tokens": 626847410.0, + "step": 24222 + }, + { + "epoch": 2.6601142104107183, + "grad_norm": 2.1089513301849365, + "learning_rate": 5e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7671425938606262, + "num_tokens": 626873060.0, + "step": 24223 + }, + { + "epoch": 2.660224028113332, + "grad_norm": 1.8681100606918335, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7393605709075928, + "num_tokens": 626903445.0, + "step": 24224 + }, + { + "epoch": 2.660333845815946, + "grad_norm": 2.3044381141662598, + "learning_rate": 5e-06, + "loss": 0.6504, + "mean_token_accuracy": 0.7862595915794373, + "num_tokens": 626925581.0, + "step": 24225 + }, + { + "epoch": 2.660443663518559, + "grad_norm": 2.0386507511138916, + "learning_rate": 5e-06, + "loss": 0.6724, + "mean_token_accuracy": 0.777454674243927, + "num_tokens": 626950278.0, + "step": 24226 + }, + { + "epoch": 2.660553481221173, + "grad_norm": 2.077910900115967, + "learning_rate": 5e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7758976221084595, + "num_tokens": 626974377.0, + "step": 24227 + }, + { + "epoch": 2.6606632989237866, + "grad_norm": 2.061666965484619, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7685625553131104, + "num_tokens": 626999716.0, + "step": 24228 + }, + { + "epoch": 2.6607731166264, + "grad_norm": 1.7755825519561768, + "learning_rate": 5e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7516108751296997, + "num_tokens": 627033333.0, + "step": 24229 + }, + { + "epoch": 2.6608829343290137, + "grad_norm": 1.9821217060089111, + "learning_rate": 5e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7723312377929688, + "num_tokens": 627059811.0, + "step": 24230 + }, + { + "epoch": 2.6609927520316274, + "grad_norm": 2.1593384742736816, + "learning_rate": 5e-06, + "loss": 0.7397, + "mean_token_accuracy": 0.7560389041900635, + "num_tokens": 627084295.0, + "step": 24231 + }, + { + "epoch": 2.661102569734241, + "grad_norm": 2.0338363647460938, + "learning_rate": 5e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7421712875366211, + "num_tokens": 627112143.0, + "step": 24232 + }, + { + "epoch": 2.661212387436855, + "grad_norm": 2.2975895404815674, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7780139446258545, + "num_tokens": 627133812.0, + "step": 24233 + }, + { + "epoch": 2.6613222051394683, + "grad_norm": 2.738440752029419, + "learning_rate": 5e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7749029397964478, + "num_tokens": 627156566.0, + "step": 24234 + }, + { + "epoch": 2.661432022842082, + "grad_norm": 1.7865369319915771, + "learning_rate": 5e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7496604323387146, + "num_tokens": 627189340.0, + "step": 24235 + }, + { + "epoch": 2.661541840544696, + "grad_norm": 1.8638560771942139, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7645735740661621, + "num_tokens": 627219629.0, + "step": 24236 + }, + { + "epoch": 2.6616516582473095, + "grad_norm": 2.0517385005950928, + "learning_rate": 5e-06, + "loss": 0.741, + "mean_token_accuracy": 0.7571930885314941, + "num_tokens": 627249178.0, + "step": 24237 + }, + { + "epoch": 2.6617614759499233, + "grad_norm": 2.1695785522460938, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.756854772567749, + "num_tokens": 627274370.0, + "step": 24238 + }, + { + "epoch": 2.6618712936525366, + "grad_norm": 2.5794427394866943, + "learning_rate": 5e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.7771581411361694, + "num_tokens": 627294125.0, + "step": 24239 + }, + { + "epoch": 2.6619811113551504, + "grad_norm": 2.246469497680664, + "learning_rate": 5e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7649964690208435, + "num_tokens": 627317282.0, + "step": 24240 + }, + { + "epoch": 2.662090929057764, + "grad_norm": 2.0516273975372314, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7472214698791504, + "num_tokens": 627344388.0, + "step": 24241 + }, + { + "epoch": 2.662200746760378, + "grad_norm": 2.23862624168396, + "learning_rate": 5e-06, + "loss": 0.6611, + "mean_token_accuracy": 0.783406674861908, + "num_tokens": 627366851.0, + "step": 24242 + }, + { + "epoch": 2.6623105644629916, + "grad_norm": 1.8622541427612305, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7500426769256592, + "num_tokens": 627396602.0, + "step": 24243 + }, + { + "epoch": 2.662420382165605, + "grad_norm": 1.9476672410964966, + "learning_rate": 5e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7384792566299438, + "num_tokens": 627430839.0, + "step": 24244 + }, + { + "epoch": 2.6625301998682187, + "grad_norm": 1.980078935623169, + "learning_rate": 5e-06, + "loss": 0.664, + "mean_token_accuracy": 0.7828558683395386, + "num_tokens": 627456297.0, + "step": 24245 + }, + { + "epoch": 2.6626400175708325, + "grad_norm": 2.226172685623169, + "learning_rate": 5e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7443100214004517, + "num_tokens": 627481298.0, + "step": 24246 + }, + { + "epoch": 2.662749835273446, + "grad_norm": 2.227811574935913, + "learning_rate": 5e-06, + "loss": 0.614, + "mean_token_accuracy": 0.7967000603675842, + "num_tokens": 627502809.0, + "step": 24247 + }, + { + "epoch": 2.66285965297606, + "grad_norm": 1.9533897638320923, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7359106540679932, + "num_tokens": 627534946.0, + "step": 24248 + }, + { + "epoch": 2.6629694706786733, + "grad_norm": 2.174476146697998, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7621299028396606, + "num_tokens": 627557581.0, + "step": 24249 + }, + { + "epoch": 2.663079288381287, + "grad_norm": 1.9879616498947144, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7547239661216736, + "num_tokens": 627587148.0, + "step": 24250 + }, + { + "epoch": 2.663189106083901, + "grad_norm": 1.8621797561645508, + "learning_rate": 5e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7760999202728271, + "num_tokens": 627614939.0, + "step": 24251 + }, + { + "epoch": 2.663298923786514, + "grad_norm": 1.9662362337112427, + "learning_rate": 5e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7525745630264282, + "num_tokens": 627643096.0, + "step": 24252 + }, + { + "epoch": 2.6634087414891283, + "grad_norm": 2.239182710647583, + "learning_rate": 5e-06, + "loss": 0.7419, + "mean_token_accuracy": 0.7578403949737549, + "num_tokens": 627667643.0, + "step": 24253 + }, + { + "epoch": 2.6635185591917416, + "grad_norm": 2.036912441253662, + "learning_rate": 5e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.762844443321228, + "num_tokens": 627693914.0, + "step": 24254 + }, + { + "epoch": 2.6636283768943554, + "grad_norm": 2.298241138458252, + "learning_rate": 5e-06, + "loss": 0.6501, + "mean_token_accuracy": 0.7846962213516235, + "num_tokens": 627715351.0, + "step": 24255 + }, + { + "epoch": 2.663738194596969, + "grad_norm": 1.9866299629211426, + "learning_rate": 5e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.7586438655853271, + "num_tokens": 627742666.0, + "step": 24256 + }, + { + "epoch": 2.6638480122995825, + "grad_norm": 2.419691801071167, + "learning_rate": 5e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7717310190200806, + "num_tokens": 627762063.0, + "step": 24257 + }, + { + "epoch": 2.663957830002196, + "grad_norm": 2.0777509212493896, + "learning_rate": 5e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.7653661966323853, + "num_tokens": 627786071.0, + "step": 24258 + }, + { + "epoch": 2.66406764770481, + "grad_norm": 2.400118589401245, + "learning_rate": 5e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.7688870429992676, + "num_tokens": 627807094.0, + "step": 24259 + }, + { + "epoch": 2.6641774654074237, + "grad_norm": 2.2440450191497803, + "learning_rate": 5e-06, + "loss": 0.641, + "mean_token_accuracy": 0.7888638377189636, + "num_tokens": 627829526.0, + "step": 24260 + }, + { + "epoch": 2.6642872831100375, + "grad_norm": 1.8813527822494507, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7701879143714905, + "num_tokens": 627859317.0, + "step": 24261 + }, + { + "epoch": 2.664397100812651, + "grad_norm": 2.067516326904297, + "learning_rate": 5e-06, + "loss": 0.7631, + "mean_token_accuracy": 0.7564103007316589, + "num_tokens": 627888522.0, + "step": 24262 + }, + { + "epoch": 2.6645069185152646, + "grad_norm": 1.9215489625930786, + "learning_rate": 5e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7632060647010803, + "num_tokens": 627917138.0, + "step": 24263 + }, + { + "epoch": 2.6646167362178783, + "grad_norm": 2.2135326862335205, + "learning_rate": 5e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7597014307975769, + "num_tokens": 627939897.0, + "step": 24264 + }, + { + "epoch": 2.664726553920492, + "grad_norm": 2.0287704467773438, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7527970671653748, + "num_tokens": 627967001.0, + "step": 24265 + }, + { + "epoch": 2.664836371623106, + "grad_norm": 2.2335336208343506, + "learning_rate": 5e-06, + "loss": 0.689, + "mean_token_accuracy": 0.7716283202171326, + "num_tokens": 627989372.0, + "step": 24266 + }, + { + "epoch": 2.664946189325719, + "grad_norm": 2.1229326725006104, + "learning_rate": 5e-06, + "loss": 0.6727, + "mean_token_accuracy": 0.7749737501144409, + "num_tokens": 628014839.0, + "step": 24267 + }, + { + "epoch": 2.665056007028333, + "grad_norm": 1.9724717140197754, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.752164363861084, + "num_tokens": 628042262.0, + "step": 24268 + }, + { + "epoch": 2.6651658247309467, + "grad_norm": 2.1919233798980713, + "learning_rate": 5e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7525889873504639, + "num_tokens": 628066857.0, + "step": 24269 + }, + { + "epoch": 2.6652756424335604, + "grad_norm": 2.0951530933380127, + "learning_rate": 5e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7521215677261353, + "num_tokens": 628092072.0, + "step": 24270 + }, + { + "epoch": 2.665385460136174, + "grad_norm": 2.089306354522705, + "learning_rate": 5e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7667275667190552, + "num_tokens": 628117840.0, + "step": 24271 + }, + { + "epoch": 2.6654952778387875, + "grad_norm": 2.4672422409057617, + "learning_rate": 5e-06, + "loss": 0.5705, + "mean_token_accuracy": 0.8052711486816406, + "num_tokens": 628137575.0, + "step": 24272 + }, + { + "epoch": 2.6656050955414012, + "grad_norm": 2.469407796859741, + "learning_rate": 5e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7578522562980652, + "num_tokens": 628160377.0, + "step": 24273 + }, + { + "epoch": 2.665714913244015, + "grad_norm": 1.9503427743911743, + "learning_rate": 5e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7411329746246338, + "num_tokens": 628191553.0, + "step": 24274 + }, + { + "epoch": 2.6658247309466288, + "grad_norm": 2.141047954559326, + "learning_rate": 5e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7618453502655029, + "num_tokens": 628217684.0, + "step": 24275 + }, + { + "epoch": 2.6659345486492425, + "grad_norm": 2.0266165733337402, + "learning_rate": 5e-06, + "loss": 0.705, + "mean_token_accuracy": 0.7674871683120728, + "num_tokens": 628246549.0, + "step": 24276 + }, + { + "epoch": 2.666044366351856, + "grad_norm": 2.359941005706787, + "learning_rate": 5e-06, + "loss": 0.6353, + "mean_token_accuracy": 0.7850757837295532, + "num_tokens": 628266260.0, + "step": 24277 + }, + { + "epoch": 2.6661541840544696, + "grad_norm": 2.1549153327941895, + "learning_rate": 5e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7515425682067871, + "num_tokens": 628290386.0, + "step": 24278 + }, + { + "epoch": 2.6662640017570833, + "grad_norm": 2.1335248947143555, + "learning_rate": 5e-06, + "loss": 0.611, + "mean_token_accuracy": 0.795081615447998, + "num_tokens": 628312412.0, + "step": 24279 + }, + { + "epoch": 2.6663738194596966, + "grad_norm": 1.967088222503662, + "learning_rate": 5e-06, + "loss": 0.7211, + "mean_token_accuracy": 0.7608343362808228, + "num_tokens": 628338783.0, + "step": 24280 + }, + { + "epoch": 2.6664836371623104, + "grad_norm": 1.8506096601486206, + "learning_rate": 5e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.745857834815979, + "num_tokens": 628372202.0, + "step": 24281 + }, + { + "epoch": 2.666593454864924, + "grad_norm": 1.8608821630477905, + "learning_rate": 5e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.7747499942779541, + "num_tokens": 628402175.0, + "step": 24282 + }, + { + "epoch": 2.666703272567538, + "grad_norm": 2.1317226886749268, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7667717933654785, + "num_tokens": 628426263.0, + "step": 24283 + }, + { + "epoch": 2.6668130902701517, + "grad_norm": 2.152509927749634, + "learning_rate": 5e-06, + "loss": 0.6383, + "mean_token_accuracy": 0.7809340357780457, + "num_tokens": 628449184.0, + "step": 24284 + }, + { + "epoch": 2.666922907972765, + "grad_norm": 1.8682786226272583, + "learning_rate": 5e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7694945335388184, + "num_tokens": 628478140.0, + "step": 24285 + }, + { + "epoch": 2.6670327256753787, + "grad_norm": 2.0594210624694824, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7908926010131836, + "num_tokens": 628503516.0, + "step": 24286 + }, + { + "epoch": 2.6671425433779925, + "grad_norm": 2.291686534881592, + "learning_rate": 5e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.7622407674789429, + "num_tokens": 628526907.0, + "step": 24287 + }, + { + "epoch": 2.6672523610806063, + "grad_norm": 1.9168988466262817, + "learning_rate": 5e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7401033043861389, + "num_tokens": 628557252.0, + "step": 24288 + }, + { + "epoch": 2.66736217878322, + "grad_norm": 2.103339433670044, + "learning_rate": 5e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7734202146530151, + "num_tokens": 628584292.0, + "step": 24289 + }, + { + "epoch": 2.6674719964858333, + "grad_norm": 1.8946369886398315, + "learning_rate": 5e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7627875208854675, + "num_tokens": 628612958.0, + "step": 24290 + }, + { + "epoch": 2.667581814188447, + "grad_norm": 2.105828046798706, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7582552433013916, + "num_tokens": 628638133.0, + "step": 24291 + }, + { + "epoch": 2.667691631891061, + "grad_norm": 1.936843991279602, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7678348422050476, + "num_tokens": 628666041.0, + "step": 24292 + }, + { + "epoch": 2.6678014495936746, + "grad_norm": 2.019690990447998, + "learning_rate": 5e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7712823152542114, + "num_tokens": 628692229.0, + "step": 24293 + }, + { + "epoch": 2.6679112672962884, + "grad_norm": 2.164344310760498, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7644190788269043, + "num_tokens": 628717550.0, + "step": 24294 + }, + { + "epoch": 2.6680210849989017, + "grad_norm": 1.877694845199585, + "learning_rate": 5e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.7579257488250732, + "num_tokens": 628747398.0, + "step": 24295 + }, + { + "epoch": 2.6681309027015154, + "grad_norm": 1.9155380725860596, + "learning_rate": 5e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7331051826477051, + "num_tokens": 628779858.0, + "step": 24296 + }, + { + "epoch": 2.668240720404129, + "grad_norm": 2.1390693187713623, + "learning_rate": 5e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.7786965370178223, + "num_tokens": 628805094.0, + "step": 24297 + }, + { + "epoch": 2.668350538106743, + "grad_norm": 2.1476035118103027, + "learning_rate": 5e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7402491569519043, + "num_tokens": 628831521.0, + "step": 24298 + }, + { + "epoch": 2.6684603558093567, + "grad_norm": 1.979063868522644, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7552656531333923, + "num_tokens": 628861606.0, + "step": 24299 + }, + { + "epoch": 2.66857017351197, + "grad_norm": 1.9183558225631714, + "learning_rate": 5e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7720857858657837, + "num_tokens": 628890274.0, + "step": 24300 + }, + { + "epoch": 2.6686799912145838, + "grad_norm": 1.9123809337615967, + "learning_rate": 5e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7657264471054077, + "num_tokens": 628921384.0, + "step": 24301 + }, + { + "epoch": 2.6687898089171975, + "grad_norm": 2.2079782485961914, + "learning_rate": 5e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7664279937744141, + "num_tokens": 628943927.0, + "step": 24302 + }, + { + "epoch": 2.668899626619811, + "grad_norm": 2.373398780822754, + "learning_rate": 5e-06, + "loss": 0.6228, + "mean_token_accuracy": 0.7901787757873535, + "num_tokens": 628962216.0, + "step": 24303 + }, + { + "epoch": 2.669009444322425, + "grad_norm": 1.9973164796829224, + "learning_rate": 5e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.764492928981781, + "num_tokens": 628990388.0, + "step": 24304 + }, + { + "epoch": 2.6691192620250384, + "grad_norm": 1.940307855606079, + "learning_rate": 5e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7546206712722778, + "num_tokens": 629021000.0, + "step": 24305 + }, + { + "epoch": 2.669229079727652, + "grad_norm": 1.9506391286849976, + "learning_rate": 5e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7570042610168457, + "num_tokens": 629049549.0, + "step": 24306 + }, + { + "epoch": 2.669338897430266, + "grad_norm": 2.0273311138153076, + "learning_rate": 5e-06, + "loss": 0.6908, + "mean_token_accuracy": 0.7746652364730835, + "num_tokens": 629074961.0, + "step": 24307 + }, + { + "epoch": 2.669448715132879, + "grad_norm": 2.301015853881836, + "learning_rate": 5e-06, + "loss": 0.6698, + "mean_token_accuracy": 0.7828612327575684, + "num_tokens": 629095976.0, + "step": 24308 + }, + { + "epoch": 2.669558532835493, + "grad_norm": 2.3207895755767822, + "learning_rate": 5e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7781379222869873, + "num_tokens": 629116917.0, + "step": 24309 + }, + { + "epoch": 2.6696683505381067, + "grad_norm": 2.0322511196136475, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.781511664390564, + "num_tokens": 629144379.0, + "step": 24310 + }, + { + "epoch": 2.6697781682407205, + "grad_norm": 1.988054633140564, + "learning_rate": 5e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7516219019889832, + "num_tokens": 629170865.0, + "step": 24311 + }, + { + "epoch": 2.669887985943334, + "grad_norm": 2.2262284755706787, + "learning_rate": 5e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7528254985809326, + "num_tokens": 629193304.0, + "step": 24312 + }, + { + "epoch": 2.6699978036459475, + "grad_norm": 2.0128586292266846, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7602716684341431, + "num_tokens": 629218840.0, + "step": 24313 + }, + { + "epoch": 2.6701076213485613, + "grad_norm": 2.0719027519226074, + "learning_rate": 5e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7619009613990784, + "num_tokens": 629244708.0, + "step": 24314 + }, + { + "epoch": 2.670217439051175, + "grad_norm": 2.083911418914795, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7670485377311707, + "num_tokens": 629268574.0, + "step": 24315 + }, + { + "epoch": 2.670327256753789, + "grad_norm": 1.7977650165557861, + "learning_rate": 5e-06, + "loss": 0.6057, + "mean_token_accuracy": 0.7936958074569702, + "num_tokens": 629296556.0, + "step": 24316 + }, + { + "epoch": 2.6704370744564025, + "grad_norm": 2.0374293327331543, + "learning_rate": 5e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7758336067199707, + "num_tokens": 629322739.0, + "step": 24317 + }, + { + "epoch": 2.670546892159016, + "grad_norm": 2.1159846782684326, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7602495551109314, + "num_tokens": 629348846.0, + "step": 24318 + }, + { + "epoch": 2.6706567098616296, + "grad_norm": 1.9720561504364014, + "learning_rate": 5e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7365519404411316, + "num_tokens": 629377826.0, + "step": 24319 + }, + { + "epoch": 2.6707665275642434, + "grad_norm": 2.2045669555664062, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7670063972473145, + "num_tokens": 629402250.0, + "step": 24320 + }, + { + "epoch": 2.670876345266857, + "grad_norm": 2.1061325073242188, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7651687264442444, + "num_tokens": 629427510.0, + "step": 24321 + }, + { + "epoch": 2.670986162969471, + "grad_norm": 2.035878896713257, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7733812928199768, + "num_tokens": 629453081.0, + "step": 24322 + }, + { + "epoch": 2.671095980672084, + "grad_norm": 2.0799992084503174, + "learning_rate": 5e-06, + "loss": 0.6644, + "mean_token_accuracy": 0.780224084854126, + "num_tokens": 629476230.0, + "step": 24323 + }, + { + "epoch": 2.671205798374698, + "grad_norm": 1.9768213033676147, + "learning_rate": 5e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7244576811790466, + "num_tokens": 629506330.0, + "step": 24324 + }, + { + "epoch": 2.6713156160773117, + "grad_norm": 2.103732109069824, + "learning_rate": 5e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7658411860466003, + "num_tokens": 629531697.0, + "step": 24325 + }, + { + "epoch": 2.6714254337799255, + "grad_norm": 1.9673110246658325, + "learning_rate": 5e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.765349268913269, + "num_tokens": 629558635.0, + "step": 24326 + }, + { + "epoch": 2.6715352514825392, + "grad_norm": 2.1144070625305176, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7459089756011963, + "num_tokens": 629584489.0, + "step": 24327 + }, + { + "epoch": 2.6716450691851525, + "grad_norm": 2.0689361095428467, + "learning_rate": 5e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7734696865081787, + "num_tokens": 629607234.0, + "step": 24328 + }, + { + "epoch": 2.6717548868877663, + "grad_norm": 2.1090493202209473, + "learning_rate": 5e-06, + "loss": 0.6381, + "mean_token_accuracy": 0.7816164493560791, + "num_tokens": 629630185.0, + "step": 24329 + }, + { + "epoch": 2.67186470459038, + "grad_norm": 2.2445828914642334, + "learning_rate": 5e-06, + "loss": 0.6107, + "mean_token_accuracy": 0.7937317490577698, + "num_tokens": 629650887.0, + "step": 24330 + }, + { + "epoch": 2.6719745222929934, + "grad_norm": 1.9478733539581299, + "learning_rate": 5e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7683278322219849, + "num_tokens": 629677411.0, + "step": 24331 + }, + { + "epoch": 2.672084339995607, + "grad_norm": 1.9450026750564575, + "learning_rate": 5e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7686785459518433, + "num_tokens": 629707311.0, + "step": 24332 + }, + { + "epoch": 2.672194157698221, + "grad_norm": 2.1243019104003906, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.78626549243927, + "num_tokens": 629732124.0, + "step": 24333 + }, + { + "epoch": 2.6723039754008346, + "grad_norm": 2.5297372341156006, + "learning_rate": 5e-06, + "loss": 0.6039, + "mean_token_accuracy": 0.7956674098968506, + "num_tokens": 629750849.0, + "step": 24334 + }, + { + "epoch": 2.6724137931034484, + "grad_norm": 1.9253325462341309, + "learning_rate": 5e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7635257244110107, + "num_tokens": 629780421.0, + "step": 24335 + }, + { + "epoch": 2.6725236108060617, + "grad_norm": 1.9995167255401611, + "learning_rate": 5e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7682655453681946, + "num_tokens": 629810870.0, + "step": 24336 + }, + { + "epoch": 2.6726334285086755, + "grad_norm": 2.1376419067382812, + "learning_rate": 5e-06, + "loss": 0.6772, + "mean_token_accuracy": 0.7803711891174316, + "num_tokens": 629832750.0, + "step": 24337 + }, + { + "epoch": 2.6727432462112892, + "grad_norm": 1.8902236223220825, + "learning_rate": 5e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.7742749452590942, + "num_tokens": 629860853.0, + "step": 24338 + }, + { + "epoch": 2.672853063913903, + "grad_norm": 2.364495277404785, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.7741550207138062, + "num_tokens": 629881042.0, + "step": 24339 + }, + { + "epoch": 2.6729628816165167, + "grad_norm": 2.4191648960113525, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7684869766235352, + "num_tokens": 629903670.0, + "step": 24340 + }, + { + "epoch": 2.67307269931913, + "grad_norm": 2.2292895317077637, + "learning_rate": 5e-06, + "loss": 0.6662, + "mean_token_accuracy": 0.7744717597961426, + "num_tokens": 629927757.0, + "step": 24341 + }, + { + "epoch": 2.673182517021744, + "grad_norm": 2.1786298751831055, + "learning_rate": 5e-06, + "loss": 0.6118, + "mean_token_accuracy": 0.790256142616272, + "num_tokens": 629949469.0, + "step": 24342 + }, + { + "epoch": 2.6732923347243576, + "grad_norm": 2.195875644683838, + "learning_rate": 5e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7606962323188782, + "num_tokens": 629973562.0, + "step": 24343 + }, + { + "epoch": 2.6734021524269713, + "grad_norm": 1.99027419090271, + "learning_rate": 5e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.7481979727745056, + "num_tokens": 630002468.0, + "step": 24344 + }, + { + "epoch": 2.673511970129585, + "grad_norm": 2.137392044067383, + "learning_rate": 5e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.7781373262405396, + "num_tokens": 630027925.0, + "step": 24345 + }, + { + "epoch": 2.6736217878321984, + "grad_norm": 2.2652647495269775, + "learning_rate": 5e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7721080780029297, + "num_tokens": 630050507.0, + "step": 24346 + }, + { + "epoch": 2.673731605534812, + "grad_norm": 1.8378007411956787, + "learning_rate": 5e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7687637805938721, + "num_tokens": 630082094.0, + "step": 24347 + }, + { + "epoch": 2.673841423237426, + "grad_norm": 2.1294968128204346, + "learning_rate": 5e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.7790144681930542, + "num_tokens": 630106425.0, + "step": 24348 + }, + { + "epoch": 2.6739512409400397, + "grad_norm": 2.141406297683716, + "learning_rate": 5e-06, + "loss": 0.6559, + "mean_token_accuracy": 0.7810162305831909, + "num_tokens": 630131774.0, + "step": 24349 + }, + { + "epoch": 2.6740610586426534, + "grad_norm": 2.2338485717773438, + "learning_rate": 5e-06, + "loss": 0.6959, + "mean_token_accuracy": 0.7725136280059814, + "num_tokens": 630156584.0, + "step": 24350 + }, + { + "epoch": 2.6741708763452667, + "grad_norm": 1.9407646656036377, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7588047385215759, + "num_tokens": 630183831.0, + "step": 24351 + }, + { + "epoch": 2.6742806940478805, + "grad_norm": 2.048133373260498, + "learning_rate": 5e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7661044597625732, + "num_tokens": 630208260.0, + "step": 24352 + }, + { + "epoch": 2.6743905117504942, + "grad_norm": 2.2346694469451904, + "learning_rate": 5e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7465255856513977, + "num_tokens": 630234588.0, + "step": 24353 + }, + { + "epoch": 2.674500329453108, + "grad_norm": 1.9433341026306152, + "learning_rate": 5e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.7665576338768005, + "num_tokens": 630263827.0, + "step": 24354 + }, + { + "epoch": 2.6746101471557218, + "grad_norm": 1.981215238571167, + "learning_rate": 5e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7564022541046143, + "num_tokens": 630293413.0, + "step": 24355 + }, + { + "epoch": 2.674719964858335, + "grad_norm": 1.561916708946228, + "learning_rate": 5e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7546737194061279, + "num_tokens": 630332918.0, + "step": 24356 + }, + { + "epoch": 2.674829782560949, + "grad_norm": 2.2096405029296875, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7598080039024353, + "num_tokens": 630355347.0, + "step": 24357 + }, + { + "epoch": 2.6749396002635626, + "grad_norm": 1.8660614490509033, + "learning_rate": 5e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7472444772720337, + "num_tokens": 630386854.0, + "step": 24358 + }, + { + "epoch": 2.675049417966176, + "grad_norm": 2.1364636421203613, + "learning_rate": 5e-06, + "loss": 0.7097, + "mean_token_accuracy": 0.7652182579040527, + "num_tokens": 630410498.0, + "step": 24359 + }, + { + "epoch": 2.6751592356687897, + "grad_norm": 2.0793559551239014, + "learning_rate": 5e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7427087426185608, + "num_tokens": 630438034.0, + "step": 24360 + }, + { + "epoch": 2.6752690533714034, + "grad_norm": 2.013685941696167, + "learning_rate": 5e-06, + "loss": 0.7415, + "mean_token_accuracy": 0.7638028264045715, + "num_tokens": 630464934.0, + "step": 24361 + }, + { + "epoch": 2.675378871074017, + "grad_norm": 2.0712532997131348, + "learning_rate": 5e-06, + "loss": 0.7308, + "mean_token_accuracy": 0.7625320553779602, + "num_tokens": 630492341.0, + "step": 24362 + }, + { + "epoch": 2.675488688776631, + "grad_norm": 2.2754383087158203, + "learning_rate": 5e-06, + "loss": 0.6775, + "mean_token_accuracy": 0.7813333868980408, + "num_tokens": 630515293.0, + "step": 24363 + }, + { + "epoch": 2.6755985064792442, + "grad_norm": 2.290203332901001, + "learning_rate": 5e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.7952682375907898, + "num_tokens": 630538121.0, + "step": 24364 + }, + { + "epoch": 2.675708324181858, + "grad_norm": 2.222367286682129, + "learning_rate": 5e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7710504531860352, + "num_tokens": 630562685.0, + "step": 24365 + }, + { + "epoch": 2.6758181418844718, + "grad_norm": 1.8557411432266235, + "learning_rate": 5e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7605703473091125, + "num_tokens": 630594824.0, + "step": 24366 + }, + { + "epoch": 2.6759279595870855, + "grad_norm": 1.9784609079360962, + "learning_rate": 5e-06, + "loss": 0.7007, + "mean_token_accuracy": 0.7648168802261353, + "num_tokens": 630620053.0, + "step": 24367 + }, + { + "epoch": 2.6760377772896993, + "grad_norm": 2.0021073818206787, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7675191164016724, + "num_tokens": 630646055.0, + "step": 24368 + }, + { + "epoch": 2.6761475949923126, + "grad_norm": 2.1636769771575928, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7683839797973633, + "num_tokens": 630669446.0, + "step": 24369 + }, + { + "epoch": 2.6762574126949263, + "grad_norm": 2.314877510070801, + "learning_rate": 5e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7541831135749817, + "num_tokens": 630694131.0, + "step": 24370 + }, + { + "epoch": 2.67636723039754, + "grad_norm": 2.2758829593658447, + "learning_rate": 5e-06, + "loss": 0.658, + "mean_token_accuracy": 0.7833578586578369, + "num_tokens": 630715308.0, + "step": 24371 + }, + { + "epoch": 2.676477048100154, + "grad_norm": 2.113799810409546, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.761101484298706, + "num_tokens": 630741180.0, + "step": 24372 + }, + { + "epoch": 2.6765868658027676, + "grad_norm": 2.1918816566467285, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7562129497528076, + "num_tokens": 630765378.0, + "step": 24373 + }, + { + "epoch": 2.676696683505381, + "grad_norm": 2.2054717540740967, + "learning_rate": 5e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7457871437072754, + "num_tokens": 630789309.0, + "step": 24374 + }, + { + "epoch": 2.6768065012079947, + "grad_norm": 2.2625858783721924, + "learning_rate": 5e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.7735702395439148, + "num_tokens": 630811642.0, + "step": 24375 + }, + { + "epoch": 2.6769163189106084, + "grad_norm": 2.03536057472229, + "learning_rate": 5e-06, + "loss": 0.6777, + "mean_token_accuracy": 0.7720455527305603, + "num_tokens": 630838437.0, + "step": 24376 + }, + { + "epoch": 2.677026136613222, + "grad_norm": 2.0188374519348145, + "learning_rate": 5e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7637909650802612, + "num_tokens": 630866194.0, + "step": 24377 + }, + { + "epoch": 2.677135954315836, + "grad_norm": 1.9655921459197998, + "learning_rate": 5e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7493191957473755, + "num_tokens": 630898459.0, + "step": 24378 + }, + { + "epoch": 2.6772457720184493, + "grad_norm": 2.635199785232544, + "learning_rate": 5e-06, + "loss": 0.6425, + "mean_token_accuracy": 0.7839353084564209, + "num_tokens": 630915728.0, + "step": 24379 + }, + { + "epoch": 2.677355589721063, + "grad_norm": 2.314758539199829, + "learning_rate": 5e-06, + "loss": 0.7645, + "mean_token_accuracy": 0.753529965877533, + "num_tokens": 630939538.0, + "step": 24380 + }, + { + "epoch": 2.677465407423677, + "grad_norm": 2.0104029178619385, + "learning_rate": 5e-06, + "loss": 0.6756, + "mean_token_accuracy": 0.7732528448104858, + "num_tokens": 630968230.0, + "step": 24381 + }, + { + "epoch": 2.67757522512629, + "grad_norm": 1.7914984226226807, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7485750913619995, + "num_tokens": 631003212.0, + "step": 24382 + }, + { + "epoch": 2.6776850428289043, + "grad_norm": 1.9921718835830688, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7632555961608887, + "num_tokens": 631031163.0, + "step": 24383 + }, + { + "epoch": 2.6777948605315176, + "grad_norm": 2.0313560962677, + "learning_rate": 5e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7666702270507812, + "num_tokens": 631057864.0, + "step": 24384 + }, + { + "epoch": 2.6779046782341314, + "grad_norm": 1.8940945863723755, + "learning_rate": 5e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7467333674430847, + "num_tokens": 631088391.0, + "step": 24385 + }, + { + "epoch": 2.678014495936745, + "grad_norm": 1.9796116352081299, + "learning_rate": 5e-06, + "loss": 0.649, + "mean_token_accuracy": 0.7793742418289185, + "num_tokens": 631115985.0, + "step": 24386 + }, + { + "epoch": 2.6781243136393584, + "grad_norm": 2.4024202823638916, + "learning_rate": 5e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7745611667633057, + "num_tokens": 631136650.0, + "step": 24387 + }, + { + "epoch": 2.678234131341972, + "grad_norm": 1.9146488904953003, + "learning_rate": 5e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.76529860496521, + "num_tokens": 631165476.0, + "step": 24388 + }, + { + "epoch": 2.678343949044586, + "grad_norm": 1.9625107049942017, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7596040368080139, + "num_tokens": 631193939.0, + "step": 24389 + }, + { + "epoch": 2.6784537667471997, + "grad_norm": 2.0954196453094482, + "learning_rate": 5e-06, + "loss": 0.7032, + "mean_token_accuracy": 0.7650459408760071, + "num_tokens": 631218643.0, + "step": 24390 + }, + { + "epoch": 2.6785635844498135, + "grad_norm": 1.8750630617141724, + "learning_rate": 5e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7724027633666992, + "num_tokens": 631247749.0, + "step": 24391 + }, + { + "epoch": 2.6786734021524268, + "grad_norm": 2.1296470165252686, + "learning_rate": 5e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7650213241577148, + "num_tokens": 631273650.0, + "step": 24392 + }, + { + "epoch": 2.6787832198550405, + "grad_norm": 1.9995061159133911, + "learning_rate": 5e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7684141993522644, + "num_tokens": 631303046.0, + "step": 24393 + }, + { + "epoch": 2.6788930375576543, + "grad_norm": 2.3176989555358887, + "learning_rate": 5e-06, + "loss": 0.6708, + "mean_token_accuracy": 0.7795009016990662, + "num_tokens": 631324757.0, + "step": 24394 + }, + { + "epoch": 2.679002855260268, + "grad_norm": 2.1108486652374268, + "learning_rate": 5e-06, + "loss": 0.6305, + "mean_token_accuracy": 0.7851310968399048, + "num_tokens": 631347845.0, + "step": 24395 + }, + { + "epoch": 2.679112672962882, + "grad_norm": 2.424866199493408, + "learning_rate": 5e-06, + "loss": 0.6068, + "mean_token_accuracy": 0.7974191904067993, + "num_tokens": 631366412.0, + "step": 24396 + }, + { + "epoch": 2.679222490665495, + "grad_norm": 1.8678340911865234, + "learning_rate": 5e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.73585045337677, + "num_tokens": 631398583.0, + "step": 24397 + }, + { + "epoch": 2.679332308368109, + "grad_norm": 2.015155076980591, + "learning_rate": 5e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.7670405507087708, + "num_tokens": 631426520.0, + "step": 24398 + }, + { + "epoch": 2.6794421260707226, + "grad_norm": 2.122034788131714, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7710996270179749, + "num_tokens": 631451231.0, + "step": 24399 + }, + { + "epoch": 2.6795519437733364, + "grad_norm": 2.0401785373687744, + "learning_rate": 5e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7685075998306274, + "num_tokens": 631477272.0, + "step": 24400 + }, + { + "epoch": 2.67966176147595, + "grad_norm": 2.2617111206054688, + "learning_rate": 5e-06, + "loss": 0.6659, + "mean_token_accuracy": 0.7789590954780579, + "num_tokens": 631497778.0, + "step": 24401 + }, + { + "epoch": 2.6797715791785635, + "grad_norm": 2.3151144981384277, + "learning_rate": 5e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.755207896232605, + "num_tokens": 631520797.0, + "step": 24402 + }, + { + "epoch": 2.679881396881177, + "grad_norm": 2.1772005558013916, + "learning_rate": 5e-06, + "loss": 0.7477, + "mean_token_accuracy": 0.7547460794448853, + "num_tokens": 631544686.0, + "step": 24403 + }, + { + "epoch": 2.679991214583791, + "grad_norm": 2.344045400619507, + "learning_rate": 5e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7667883038520813, + "num_tokens": 631566557.0, + "step": 24404 + }, + { + "epoch": 2.6801010322864047, + "grad_norm": 2.2840592861175537, + "learning_rate": 5e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7704504132270813, + "num_tokens": 631588093.0, + "step": 24405 + }, + { + "epoch": 2.6802108499890185, + "grad_norm": 2.1000261306762695, + "learning_rate": 5e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.756902277469635, + "num_tokens": 631615683.0, + "step": 24406 + }, + { + "epoch": 2.680320667691632, + "grad_norm": 2.3577752113342285, + "learning_rate": 5e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.7799742221832275, + "num_tokens": 631634715.0, + "step": 24407 + }, + { + "epoch": 2.6804304853942456, + "grad_norm": 1.9826878309249878, + "learning_rate": 5e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7478564381599426, + "num_tokens": 631662232.0, + "step": 24408 + }, + { + "epoch": 2.6805403030968593, + "grad_norm": 2.1369121074676514, + "learning_rate": 5e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7718825340270996, + "num_tokens": 631686384.0, + "step": 24409 + }, + { + "epoch": 2.6806501207994726, + "grad_norm": 1.9366180896759033, + "learning_rate": 5e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7681244611740112, + "num_tokens": 631714893.0, + "step": 24410 + }, + { + "epoch": 2.6807599385020864, + "grad_norm": 2.1073310375213623, + "learning_rate": 5e-06, + "loss": 0.766, + "mean_token_accuracy": 0.7520822286605835, + "num_tokens": 631739855.0, + "step": 24411 + }, + { + "epoch": 2.6808697562047, + "grad_norm": 2.1843302249908447, + "learning_rate": 5e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7638750076293945, + "num_tokens": 631763233.0, + "step": 24412 + }, + { + "epoch": 2.680979573907314, + "grad_norm": 1.909307837486267, + "learning_rate": 5e-06, + "loss": 0.719, + "mean_token_accuracy": 0.7628967761993408, + "num_tokens": 631792473.0, + "step": 24413 + }, + { + "epoch": 2.6810893916099277, + "grad_norm": 2.1037044525146484, + "learning_rate": 5e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7602490186691284, + "num_tokens": 631816748.0, + "step": 24414 + }, + { + "epoch": 2.681199209312541, + "grad_norm": 2.18426251411438, + "learning_rate": 5e-06, + "loss": 0.656, + "mean_token_accuracy": 0.7839211225509644, + "num_tokens": 631839310.0, + "step": 24415 + }, + { + "epoch": 2.6813090270151547, + "grad_norm": 1.9904054403305054, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7565301060676575, + "num_tokens": 631869379.0, + "step": 24416 + }, + { + "epoch": 2.6814188447177685, + "grad_norm": 2.0832271575927734, + "learning_rate": 5e-06, + "loss": 0.6583, + "mean_token_accuracy": 0.7881231307983398, + "num_tokens": 631894344.0, + "step": 24417 + }, + { + "epoch": 2.6815286624203822, + "grad_norm": 1.9904295206069946, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.760796070098877, + "num_tokens": 631925742.0, + "step": 24418 + }, + { + "epoch": 2.681638480122996, + "grad_norm": 2.4131696224212646, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.758391797542572, + "num_tokens": 631946936.0, + "step": 24419 + }, + { + "epoch": 2.6817482978256093, + "grad_norm": 2.3231465816497803, + "learning_rate": 5e-06, + "loss": 0.6672, + "mean_token_accuracy": 0.7773191928863525, + "num_tokens": 631967983.0, + "step": 24420 + }, + { + "epoch": 2.681858115528223, + "grad_norm": 2.0775575637817383, + "learning_rate": 5e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7586339712142944, + "num_tokens": 631992885.0, + "step": 24421 + }, + { + "epoch": 2.681967933230837, + "grad_norm": 2.2258918285369873, + "learning_rate": 5e-06, + "loss": 0.6866, + "mean_token_accuracy": 0.7679355144500732, + "num_tokens": 632013544.0, + "step": 24422 + }, + { + "epoch": 2.6820777509334506, + "grad_norm": 2.4525506496429443, + "learning_rate": 5e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.752845048904419, + "num_tokens": 632033973.0, + "step": 24423 + }, + { + "epoch": 2.6821875686360643, + "grad_norm": 2.2668018341064453, + "learning_rate": 5e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.7848680019378662, + "num_tokens": 632055123.0, + "step": 24424 + }, + { + "epoch": 2.6822973863386776, + "grad_norm": 1.9493601322174072, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7556940317153931, + "num_tokens": 632085870.0, + "step": 24425 + }, + { + "epoch": 2.6824072040412914, + "grad_norm": 2.2034780979156494, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7558606266975403, + "num_tokens": 632110648.0, + "step": 24426 + }, + { + "epoch": 2.682517021743905, + "grad_norm": 2.192329168319702, + "learning_rate": 5e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7529700994491577, + "num_tokens": 632134899.0, + "step": 24427 + }, + { + "epoch": 2.682626839446519, + "grad_norm": 2.127931594848633, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7765477299690247, + "num_tokens": 632159452.0, + "step": 24428 + }, + { + "epoch": 2.6827366571491327, + "grad_norm": 1.9684628248214722, + "learning_rate": 5e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7716763019561768, + "num_tokens": 632186209.0, + "step": 24429 + }, + { + "epoch": 2.682846474851746, + "grad_norm": 1.99979567527771, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7493636012077332, + "num_tokens": 632214073.0, + "step": 24430 + }, + { + "epoch": 2.6829562925543597, + "grad_norm": 2.065804958343506, + "learning_rate": 5e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7728030681610107, + "num_tokens": 632239640.0, + "step": 24431 + }, + { + "epoch": 2.6830661102569735, + "grad_norm": 1.945975422859192, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7521713972091675, + "num_tokens": 632266852.0, + "step": 24432 + }, + { + "epoch": 2.683175927959587, + "grad_norm": 1.9126453399658203, + "learning_rate": 5e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7453768253326416, + "num_tokens": 632295630.0, + "step": 24433 + }, + { + "epoch": 2.683285745662201, + "grad_norm": 2.0334267616271973, + "learning_rate": 5e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7670563459396362, + "num_tokens": 632322906.0, + "step": 24434 + }, + { + "epoch": 2.6833955633648143, + "grad_norm": 2.0346920490264893, + "learning_rate": 5e-06, + "loss": 0.631, + "mean_token_accuracy": 0.7896031141281128, + "num_tokens": 632346029.0, + "step": 24435 + }, + { + "epoch": 2.683505381067428, + "grad_norm": 2.403346538543701, + "learning_rate": 5e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.7843869924545288, + "num_tokens": 632365699.0, + "step": 24436 + }, + { + "epoch": 2.683615198770042, + "grad_norm": 2.2318942546844482, + "learning_rate": 5e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.758394718170166, + "num_tokens": 632389424.0, + "step": 24437 + }, + { + "epoch": 2.683725016472655, + "grad_norm": 2.4001593589782715, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7760312557220459, + "num_tokens": 632410497.0, + "step": 24438 + }, + { + "epoch": 2.683834834175269, + "grad_norm": 2.206132411956787, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7675269246101379, + "num_tokens": 632434601.0, + "step": 24439 + }, + { + "epoch": 2.6839446518778827, + "grad_norm": 1.9436101913452148, + "learning_rate": 5e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7502676844596863, + "num_tokens": 632464075.0, + "step": 24440 + }, + { + "epoch": 2.6840544695804964, + "grad_norm": 2.0976572036743164, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7659008502960205, + "num_tokens": 632490276.0, + "step": 24441 + }, + { + "epoch": 2.68416428728311, + "grad_norm": 2.0136892795562744, + "learning_rate": 5e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7691150903701782, + "num_tokens": 632517651.0, + "step": 24442 + }, + { + "epoch": 2.6842741049857235, + "grad_norm": 2.002589464187622, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7851492166519165, + "num_tokens": 632545064.0, + "step": 24443 + }, + { + "epoch": 2.6843839226883373, + "grad_norm": 2.0486700534820557, + "learning_rate": 5e-06, + "loss": 0.6527, + "mean_token_accuracy": 0.7832673788070679, + "num_tokens": 632569263.0, + "step": 24444 + }, + { + "epoch": 2.684493740390951, + "grad_norm": 2.3022944927215576, + "learning_rate": 5e-06, + "loss": 0.6358, + "mean_token_accuracy": 0.7851273417472839, + "num_tokens": 632590384.0, + "step": 24445 + }, + { + "epoch": 2.6846035580935648, + "grad_norm": 2.29681658744812, + "learning_rate": 5e-06, + "loss": 0.6757, + "mean_token_accuracy": 0.7826753854751587, + "num_tokens": 632612062.0, + "step": 24446 + }, + { + "epoch": 2.6847133757961785, + "grad_norm": 2.3113389015197754, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7700856924057007, + "num_tokens": 632636893.0, + "step": 24447 + }, + { + "epoch": 2.684823193498792, + "grad_norm": 2.3251843452453613, + "learning_rate": 5e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7664047479629517, + "num_tokens": 632661520.0, + "step": 24448 + }, + { + "epoch": 2.6849330112014056, + "grad_norm": 1.9918341636657715, + "learning_rate": 5e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7498176693916321, + "num_tokens": 632690167.0, + "step": 24449 + }, + { + "epoch": 2.6850428289040194, + "grad_norm": 2.213987112045288, + "learning_rate": 5e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7732452750205994, + "num_tokens": 632713729.0, + "step": 24450 + }, + { + "epoch": 2.685152646606633, + "grad_norm": 2.0560526847839355, + "learning_rate": 5e-06, + "loss": 0.7316, + "mean_token_accuracy": 0.7600763440132141, + "num_tokens": 632739111.0, + "step": 24451 + }, + { + "epoch": 2.685262464309247, + "grad_norm": 1.9096752405166626, + "learning_rate": 5e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.7931599020957947, + "num_tokens": 632765223.0, + "step": 24452 + }, + { + "epoch": 2.68537228201186, + "grad_norm": 2.137096643447876, + "learning_rate": 5e-06, + "loss": 0.6782, + "mean_token_accuracy": 0.7738310694694519, + "num_tokens": 632788945.0, + "step": 24453 + }, + { + "epoch": 2.685482099714474, + "grad_norm": 2.0602614879608154, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7601200938224792, + "num_tokens": 632815677.0, + "step": 24454 + }, + { + "epoch": 2.6855919174170877, + "grad_norm": 2.0225329399108887, + "learning_rate": 5e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7614544034004211, + "num_tokens": 632842490.0, + "step": 24455 + }, + { + "epoch": 2.6857017351197014, + "grad_norm": 2.097041130065918, + "learning_rate": 5e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7546387314796448, + "num_tokens": 632870466.0, + "step": 24456 + }, + { + "epoch": 2.685811552822315, + "grad_norm": 2.066239356994629, + "learning_rate": 5e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7372498512268066, + "num_tokens": 632894821.0, + "step": 24457 + }, + { + "epoch": 2.6859213705249285, + "grad_norm": 2.248561143875122, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.7827807664871216, + "num_tokens": 632916274.0, + "step": 24458 + }, + { + "epoch": 2.6860311882275423, + "grad_norm": 1.9928795099258423, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7621088027954102, + "num_tokens": 632943832.0, + "step": 24459 + }, + { + "epoch": 2.686141005930156, + "grad_norm": 2.057887554168701, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.769798994064331, + "num_tokens": 632969911.0, + "step": 24460 + }, + { + "epoch": 2.6862508236327693, + "grad_norm": 2.2023239135742188, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7737365961074829, + "num_tokens": 632991901.0, + "step": 24461 + }, + { + "epoch": 2.686360641335383, + "grad_norm": 1.8961971998214722, + "learning_rate": 5e-06, + "loss": 0.6459, + "mean_token_accuracy": 0.7875109314918518, + "num_tokens": 633020268.0, + "step": 24462 + }, + { + "epoch": 2.686470459037997, + "grad_norm": 1.9086509943008423, + "learning_rate": 5e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7609648108482361, + "num_tokens": 633051560.0, + "step": 24463 + }, + { + "epoch": 2.6865802767406106, + "grad_norm": 2.3306872844696045, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7505228519439697, + "num_tokens": 633074633.0, + "step": 24464 + }, + { + "epoch": 2.6866900944432244, + "grad_norm": 2.1578774452209473, + "learning_rate": 5e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7557404041290283, + "num_tokens": 633099238.0, + "step": 24465 + }, + { + "epoch": 2.6867999121458377, + "grad_norm": 2.2656102180480957, + "learning_rate": 5e-06, + "loss": 0.677, + "mean_token_accuracy": 0.7781696915626526, + "num_tokens": 633122755.0, + "step": 24466 + }, + { + "epoch": 2.6869097298484514, + "grad_norm": 1.970373272895813, + "learning_rate": 5e-06, + "loss": 0.6648, + "mean_token_accuracy": 0.7796480655670166, + "num_tokens": 633147149.0, + "step": 24467 + }, + { + "epoch": 2.687019547551065, + "grad_norm": 2.175373077392578, + "learning_rate": 5e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7615534663200378, + "num_tokens": 633169991.0, + "step": 24468 + }, + { + "epoch": 2.687129365253679, + "grad_norm": 1.9389592409133911, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7491475343704224, + "num_tokens": 633198806.0, + "step": 24469 + }, + { + "epoch": 2.6872391829562927, + "grad_norm": 1.9945975542068481, + "learning_rate": 5e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.7767177820205688, + "num_tokens": 633224986.0, + "step": 24470 + }, + { + "epoch": 2.687349000658906, + "grad_norm": 2.183091163635254, + "learning_rate": 5e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7670567035675049, + "num_tokens": 633247785.0, + "step": 24471 + }, + { + "epoch": 2.68745881836152, + "grad_norm": 1.9495536088943481, + "learning_rate": 5e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7532151937484741, + "num_tokens": 633276556.0, + "step": 24472 + }, + { + "epoch": 2.6875686360641335, + "grad_norm": 1.9381181001663208, + "learning_rate": 5e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7393709421157837, + "num_tokens": 633309807.0, + "step": 24473 + }, + { + "epoch": 2.6876784537667473, + "grad_norm": 2.1875481605529785, + "learning_rate": 5e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7591437101364136, + "num_tokens": 633337635.0, + "step": 24474 + }, + { + "epoch": 2.687788271469361, + "grad_norm": 2.0879507064819336, + "learning_rate": 5e-06, + "loss": 0.7227, + "mean_token_accuracy": 0.7727715969085693, + "num_tokens": 633361497.0, + "step": 24475 + }, + { + "epoch": 2.6878980891719744, + "grad_norm": 2.3738505840301514, + "learning_rate": 5e-06, + "loss": 0.713, + "mean_token_accuracy": 0.772851288318634, + "num_tokens": 633383420.0, + "step": 24476 + }, + { + "epoch": 2.688007906874588, + "grad_norm": 2.1764652729034424, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.748191773891449, + "num_tokens": 633408392.0, + "step": 24477 + }, + { + "epoch": 2.688117724577202, + "grad_norm": 2.0214080810546875, + "learning_rate": 5e-06, + "loss": 0.6742, + "mean_token_accuracy": 0.7852988839149475, + "num_tokens": 633433997.0, + "step": 24478 + }, + { + "epoch": 2.6882275422798156, + "grad_norm": 2.173644781112671, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.771823525428772, + "num_tokens": 633457476.0, + "step": 24479 + }, + { + "epoch": 2.6883373599824294, + "grad_norm": 2.1670384407043457, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7606109380722046, + "num_tokens": 633481977.0, + "step": 24480 + }, + { + "epoch": 2.6884471776850427, + "grad_norm": 2.2566473484039307, + "learning_rate": 5e-06, + "loss": 0.6516, + "mean_token_accuracy": 0.7796995639801025, + "num_tokens": 633501927.0, + "step": 24481 + }, + { + "epoch": 2.6885569953876565, + "grad_norm": 1.9183465242385864, + "learning_rate": 5e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7408217191696167, + "num_tokens": 633531635.0, + "step": 24482 + }, + { + "epoch": 2.6886668130902702, + "grad_norm": 2.292250156402588, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7690469622612, + "num_tokens": 633554171.0, + "step": 24483 + }, + { + "epoch": 2.6887766307928835, + "grad_norm": 2.0327913761138916, + "learning_rate": 5e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.772360622882843, + "num_tokens": 633578842.0, + "step": 24484 + }, + { + "epoch": 2.6888864484954977, + "grad_norm": 2.168671131134033, + "learning_rate": 5e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7660986185073853, + "num_tokens": 633600355.0, + "step": 24485 + }, + { + "epoch": 2.688996266198111, + "grad_norm": 2.214362621307373, + "learning_rate": 5e-06, + "loss": 0.6386, + "mean_token_accuracy": 0.7897247076034546, + "num_tokens": 633621205.0, + "step": 24486 + }, + { + "epoch": 2.689106083900725, + "grad_norm": 2.478191614151001, + "learning_rate": 5e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.7849563360214233, + "num_tokens": 633640600.0, + "step": 24487 + }, + { + "epoch": 2.6892159016033386, + "grad_norm": 2.188568115234375, + "learning_rate": 5e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.7818893790245056, + "num_tokens": 633667872.0, + "step": 24488 + }, + { + "epoch": 2.689325719305952, + "grad_norm": 2.323605537414551, + "learning_rate": 5e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7624822854995728, + "num_tokens": 633690693.0, + "step": 24489 + }, + { + "epoch": 2.6894355370085656, + "grad_norm": 1.968122959136963, + "learning_rate": 5e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7509055733680725, + "num_tokens": 633720974.0, + "step": 24490 + }, + { + "epoch": 2.6895453547111794, + "grad_norm": 2.05108904838562, + "learning_rate": 5e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7627150416374207, + "num_tokens": 633747696.0, + "step": 24491 + }, + { + "epoch": 2.689655172413793, + "grad_norm": 1.9653480052947998, + "learning_rate": 5e-06, + "loss": 0.6462, + "mean_token_accuracy": 0.7899677753448486, + "num_tokens": 633774778.0, + "step": 24492 + }, + { + "epoch": 2.689764990116407, + "grad_norm": 2.315657138824463, + "learning_rate": 5e-06, + "loss": 0.6864, + "mean_token_accuracy": 0.7722563743591309, + "num_tokens": 633796301.0, + "step": 24493 + }, + { + "epoch": 2.68987480781902, + "grad_norm": 2.233060836791992, + "learning_rate": 5e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7728426456451416, + "num_tokens": 633819317.0, + "step": 24494 + }, + { + "epoch": 2.689984625521634, + "grad_norm": 2.3683977127075195, + "learning_rate": 5e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7613992094993591, + "num_tokens": 633841769.0, + "step": 24495 + }, + { + "epoch": 2.6900944432242477, + "grad_norm": 2.030151605606079, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7559537887573242, + "num_tokens": 633868972.0, + "step": 24496 + }, + { + "epoch": 2.6902042609268615, + "grad_norm": 1.9001388549804688, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7543481588363647, + "num_tokens": 633899622.0, + "step": 24497 + }, + { + "epoch": 2.6903140786294752, + "grad_norm": 2.0230014324188232, + "learning_rate": 5e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7505968809127808, + "num_tokens": 633926981.0, + "step": 24498 + }, + { + "epoch": 2.6904238963320886, + "grad_norm": 2.098405361175537, + "learning_rate": 5e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7447775602340698, + "num_tokens": 633953749.0, + "step": 24499 + }, + { + "epoch": 2.6905337140347023, + "grad_norm": 2.4706125259399414, + "learning_rate": 5e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.7838900685310364, + "num_tokens": 633972828.0, + "step": 24500 + }, + { + "epoch": 2.690643531737316, + "grad_norm": 1.9809746742248535, + "learning_rate": 5e-06, + "loss": 0.6626, + "mean_token_accuracy": 0.7862308025360107, + "num_tokens": 633999896.0, + "step": 24501 + }, + { + "epoch": 2.69075334943993, + "grad_norm": 2.077700138092041, + "learning_rate": 5e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.7762935161590576, + "num_tokens": 634024154.0, + "step": 24502 + }, + { + "epoch": 2.6908631671425436, + "grad_norm": 2.0881359577178955, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7760566473007202, + "num_tokens": 634050805.0, + "step": 24503 + }, + { + "epoch": 2.690972984845157, + "grad_norm": 1.9500542879104614, + "learning_rate": 5e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7581321597099304, + "num_tokens": 634081013.0, + "step": 24504 + }, + { + "epoch": 2.6910828025477707, + "grad_norm": 1.9914069175720215, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7635146379470825, + "num_tokens": 634109892.0, + "step": 24505 + }, + { + "epoch": 2.6911926202503844, + "grad_norm": 2.4173531532287598, + "learning_rate": 5e-06, + "loss": 0.7524, + "mean_token_accuracy": 0.761411190032959, + "num_tokens": 634134474.0, + "step": 24506 + }, + { + "epoch": 2.691302437952998, + "grad_norm": 2.1558802127838135, + "learning_rate": 5e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.7733746767044067, + "num_tokens": 634160319.0, + "step": 24507 + }, + { + "epoch": 2.691412255655612, + "grad_norm": 2.0621347427368164, + "learning_rate": 5e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7571449279785156, + "num_tokens": 634189490.0, + "step": 24508 + }, + { + "epoch": 2.6915220733582252, + "grad_norm": 2.1557350158691406, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7617371678352356, + "num_tokens": 634215213.0, + "step": 24509 + }, + { + "epoch": 2.691631891060839, + "grad_norm": 1.910248875617981, + "learning_rate": 5e-06, + "loss": 0.767, + "mean_token_accuracy": 0.7608280777931213, + "num_tokens": 634245291.0, + "step": 24510 + }, + { + "epoch": 2.6917417087634528, + "grad_norm": 1.8691833019256592, + "learning_rate": 5e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.737290620803833, + "num_tokens": 634278245.0, + "step": 24511 + }, + { + "epoch": 2.691851526466066, + "grad_norm": 2.096212863922119, + "learning_rate": 5e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7673661708831787, + "num_tokens": 634304093.0, + "step": 24512 + }, + { + "epoch": 2.69196134416868, + "grad_norm": 1.9409940242767334, + "learning_rate": 5e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.7557060122489929, + "num_tokens": 634331401.0, + "step": 24513 + }, + { + "epoch": 2.6920711618712936, + "grad_norm": 1.9469892978668213, + "learning_rate": 5e-06, + "loss": 0.6418, + "mean_token_accuracy": 0.7874606847763062, + "num_tokens": 634358344.0, + "step": 24514 + }, + { + "epoch": 2.6921809795739073, + "grad_norm": 1.982324481010437, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7628669738769531, + "num_tokens": 634389359.0, + "step": 24515 + }, + { + "epoch": 2.692290797276521, + "grad_norm": 2.2174012660980225, + "learning_rate": 5e-06, + "loss": 0.6433, + "mean_token_accuracy": 0.7805365920066833, + "num_tokens": 634409938.0, + "step": 24516 + }, + { + "epoch": 2.6924006149791344, + "grad_norm": 2.13067889213562, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7578569054603577, + "num_tokens": 634435317.0, + "step": 24517 + }, + { + "epoch": 2.692510432681748, + "grad_norm": 2.5342977046966553, + "learning_rate": 5e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.7876952290534973, + "num_tokens": 634453784.0, + "step": 24518 + }, + { + "epoch": 2.692620250384362, + "grad_norm": 2.2237627506256104, + "learning_rate": 5e-06, + "loss": 0.6001, + "mean_token_accuracy": 0.7984049916267395, + "num_tokens": 634476667.0, + "step": 24519 + }, + { + "epoch": 2.6927300680869757, + "grad_norm": 2.049417495727539, + "learning_rate": 5e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7601631283760071, + "num_tokens": 634502516.0, + "step": 24520 + }, + { + "epoch": 2.6928398857895894, + "grad_norm": 2.0377392768859863, + "learning_rate": 5e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7409701347351074, + "num_tokens": 634529405.0, + "step": 24521 + }, + { + "epoch": 2.6929497034922028, + "grad_norm": 2.1426682472229004, + "learning_rate": 5e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7623255252838135, + "num_tokens": 634550908.0, + "step": 24522 + }, + { + "epoch": 2.6930595211948165, + "grad_norm": 2.5726444721221924, + "learning_rate": 5e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7523520588874817, + "num_tokens": 634573702.0, + "step": 24523 + }, + { + "epoch": 2.6931693388974303, + "grad_norm": 2.0825252532958984, + "learning_rate": 5e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7693582773208618, + "num_tokens": 634598642.0, + "step": 24524 + }, + { + "epoch": 2.693279156600044, + "grad_norm": 2.395482063293457, + "learning_rate": 5e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7637461423873901, + "num_tokens": 634619261.0, + "step": 24525 + }, + { + "epoch": 2.693388974302658, + "grad_norm": 2.0505857467651367, + "learning_rate": 5e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7632020711898804, + "num_tokens": 634645450.0, + "step": 24526 + }, + { + "epoch": 2.693498792005271, + "grad_norm": 2.137115716934204, + "learning_rate": 5e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7651821374893188, + "num_tokens": 634669595.0, + "step": 24527 + }, + { + "epoch": 2.693608609707885, + "grad_norm": 1.9695013761520386, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7559396624565125, + "num_tokens": 634697349.0, + "step": 24528 + }, + { + "epoch": 2.6937184274104986, + "grad_norm": 2.008359670639038, + "learning_rate": 5e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7362374067306519, + "num_tokens": 634728474.0, + "step": 24529 + }, + { + "epoch": 2.6938282451131124, + "grad_norm": 2.069873094558716, + "learning_rate": 5e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.7524682283401489, + "num_tokens": 634755682.0, + "step": 24530 + }, + { + "epoch": 2.693938062815726, + "grad_norm": 2.0312108993530273, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7579379081726074, + "num_tokens": 634784472.0, + "step": 24531 + }, + { + "epoch": 2.6940478805183394, + "grad_norm": 2.0303189754486084, + "learning_rate": 5e-06, + "loss": 0.7634, + "mean_token_accuracy": 0.7471121549606323, + "num_tokens": 634812348.0, + "step": 24532 + }, + { + "epoch": 2.694157698220953, + "grad_norm": 2.27248215675354, + "learning_rate": 5e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7762978076934814, + "num_tokens": 634832646.0, + "step": 24533 + }, + { + "epoch": 2.694267515923567, + "grad_norm": 2.0837011337280273, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7484339475631714, + "num_tokens": 634859092.0, + "step": 24534 + }, + { + "epoch": 2.6943773336261807, + "grad_norm": 1.910901665687561, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7436822056770325, + "num_tokens": 634891398.0, + "step": 24535 + }, + { + "epoch": 2.6944871513287945, + "grad_norm": 2.116265058517456, + "learning_rate": 5e-06, + "loss": 0.6981, + "mean_token_accuracy": 0.7637070417404175, + "num_tokens": 634913543.0, + "step": 24536 + }, + { + "epoch": 2.6945969690314078, + "grad_norm": 2.030730962753296, + "learning_rate": 5e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7588064074516296, + "num_tokens": 634941441.0, + "step": 24537 + }, + { + "epoch": 2.6947067867340215, + "grad_norm": 2.181507110595703, + "learning_rate": 5e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.7592720985412598, + "num_tokens": 634967120.0, + "step": 24538 + }, + { + "epoch": 2.6948166044366353, + "grad_norm": 2.1216378211975098, + "learning_rate": 5e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7719844579696655, + "num_tokens": 634991208.0, + "step": 24539 + }, + { + "epoch": 2.6949264221392486, + "grad_norm": 2.07180118560791, + "learning_rate": 5e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7289379239082336, + "num_tokens": 635019694.0, + "step": 24540 + }, + { + "epoch": 2.6950362398418624, + "grad_norm": 2.0263023376464844, + "learning_rate": 5e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7608571648597717, + "num_tokens": 635047987.0, + "step": 24541 + }, + { + "epoch": 2.695146057544476, + "grad_norm": 2.3306796550750732, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.7527415752410889, + "num_tokens": 635070627.0, + "step": 24542 + }, + { + "epoch": 2.69525587524709, + "grad_norm": 2.3723092079162598, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7692829966545105, + "num_tokens": 635090241.0, + "step": 24543 + }, + { + "epoch": 2.6953656929497036, + "grad_norm": 2.0972092151641846, + "learning_rate": 5e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7492407560348511, + "num_tokens": 635116398.0, + "step": 24544 + }, + { + "epoch": 2.695475510652317, + "grad_norm": 1.730483055114746, + "learning_rate": 5e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7716596722602844, + "num_tokens": 635150194.0, + "step": 24545 + }, + { + "epoch": 2.6955853283549307, + "grad_norm": 1.8523918390274048, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7606741189956665, + "num_tokens": 635181910.0, + "step": 24546 + }, + { + "epoch": 2.6956951460575445, + "grad_norm": 2.3245530128479004, + "learning_rate": 5e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7570105195045471, + "num_tokens": 635206812.0, + "step": 24547 + }, + { + "epoch": 2.695804963760158, + "grad_norm": 2.165952682495117, + "learning_rate": 5e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7565869688987732, + "num_tokens": 635233500.0, + "step": 24548 + }, + { + "epoch": 2.695914781462772, + "grad_norm": 2.054008960723877, + "learning_rate": 5e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.7712540626525879, + "num_tokens": 635258787.0, + "step": 24549 + }, + { + "epoch": 2.6960245991653853, + "grad_norm": 1.9525697231292725, + "learning_rate": 5e-06, + "loss": 0.7742, + "mean_token_accuracy": 0.7514994144439697, + "num_tokens": 635287878.0, + "step": 24550 + }, + { + "epoch": 2.696134416867999, + "grad_norm": 1.836395502090454, + "learning_rate": 5e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7514678239822388, + "num_tokens": 635323370.0, + "step": 24551 + }, + { + "epoch": 2.696244234570613, + "grad_norm": 1.9413995742797852, + "learning_rate": 5e-06, + "loss": 0.6703, + "mean_token_accuracy": 0.775564968585968, + "num_tokens": 635350718.0, + "step": 24552 + }, + { + "epoch": 2.6963540522732266, + "grad_norm": 2.0988175868988037, + "learning_rate": 5e-06, + "loss": 0.68, + "mean_token_accuracy": 0.7785472273826599, + "num_tokens": 635375067.0, + "step": 24553 + }, + { + "epoch": 2.6964638699758403, + "grad_norm": 1.9466193914413452, + "learning_rate": 5e-06, + "loss": 0.7886, + "mean_token_accuracy": 0.7495537400245667, + "num_tokens": 635404934.0, + "step": 24554 + }, + { + "epoch": 2.6965736876784536, + "grad_norm": 2.065995693206787, + "learning_rate": 5e-06, + "loss": 0.6501, + "mean_token_accuracy": 0.7838971614837646, + "num_tokens": 635428268.0, + "step": 24555 + }, + { + "epoch": 2.6966835053810674, + "grad_norm": 2.2507543563842773, + "learning_rate": 5e-06, + "loss": 0.6896, + "mean_token_accuracy": 0.7739455699920654, + "num_tokens": 635449316.0, + "step": 24556 + }, + { + "epoch": 2.696793323083681, + "grad_norm": 2.0963783264160156, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7677065134048462, + "num_tokens": 635474694.0, + "step": 24557 + }, + { + "epoch": 2.696903140786295, + "grad_norm": 2.452498435974121, + "learning_rate": 5e-06, + "loss": 0.646, + "mean_token_accuracy": 0.7789968252182007, + "num_tokens": 635494057.0, + "step": 24558 + }, + { + "epoch": 2.6970129584889087, + "grad_norm": 2.0169217586517334, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7781552672386169, + "num_tokens": 635521478.0, + "step": 24559 + }, + { + "epoch": 2.697122776191522, + "grad_norm": 2.044355630874634, + "learning_rate": 5e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.739541232585907, + "num_tokens": 635549836.0, + "step": 24560 + }, + { + "epoch": 2.6972325938941357, + "grad_norm": 2.0413572788238525, + "learning_rate": 5e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7590643167495728, + "num_tokens": 635575841.0, + "step": 24561 + }, + { + "epoch": 2.6973424115967495, + "grad_norm": 2.034351348876953, + "learning_rate": 5e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.770463228225708, + "num_tokens": 635601846.0, + "step": 24562 + }, + { + "epoch": 2.697452229299363, + "grad_norm": 2.1038060188293457, + "learning_rate": 5e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.7587578296661377, + "num_tokens": 635629420.0, + "step": 24563 + }, + { + "epoch": 2.697562047001977, + "grad_norm": 2.1454272270202637, + "learning_rate": 5e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7661417126655579, + "num_tokens": 635652930.0, + "step": 24564 + }, + { + "epoch": 2.6976718647045903, + "grad_norm": 1.9231133460998535, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7543284893035889, + "num_tokens": 635682592.0, + "step": 24565 + }, + { + "epoch": 2.697781682407204, + "grad_norm": 2.198633909225464, + "learning_rate": 5e-06, + "loss": 0.7021, + "mean_token_accuracy": 0.7706566452980042, + "num_tokens": 635705595.0, + "step": 24566 + }, + { + "epoch": 2.697891500109818, + "grad_norm": 2.4184815883636475, + "learning_rate": 5e-06, + "loss": 0.6417, + "mean_token_accuracy": 0.781550407409668, + "num_tokens": 635724746.0, + "step": 24567 + }, + { + "epoch": 2.698001317812431, + "grad_norm": 2.120586633682251, + "learning_rate": 5e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7464420795440674, + "num_tokens": 635749601.0, + "step": 24568 + }, + { + "epoch": 2.698111135515045, + "grad_norm": 2.304320812225342, + "learning_rate": 5e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.7788796424865723, + "num_tokens": 635770120.0, + "step": 24569 + }, + { + "epoch": 2.6982209532176586, + "grad_norm": 1.9749934673309326, + "learning_rate": 5e-06, + "loss": 0.6958, + "mean_token_accuracy": 0.7720590829849243, + "num_tokens": 635799984.0, + "step": 24570 + }, + { + "epoch": 2.6983307709202724, + "grad_norm": 2.4360079765319824, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7736630439758301, + "num_tokens": 635820468.0, + "step": 24571 + }, + { + "epoch": 2.698440588622886, + "grad_norm": 1.9648414850234985, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7686645984649658, + "num_tokens": 635845501.0, + "step": 24572 + }, + { + "epoch": 2.6985504063254995, + "grad_norm": 1.9579445123672485, + "learning_rate": 5e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7707940936088562, + "num_tokens": 635873992.0, + "step": 24573 + }, + { + "epoch": 2.6986602240281132, + "grad_norm": 2.2148666381835938, + "learning_rate": 5e-06, + "loss": 0.6649, + "mean_token_accuracy": 0.7770028114318848, + "num_tokens": 635894032.0, + "step": 24574 + }, + { + "epoch": 2.698770041730727, + "grad_norm": 1.9165234565734863, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7604444622993469, + "num_tokens": 635923541.0, + "step": 24575 + }, + { + "epoch": 2.6988798594333407, + "grad_norm": 2.1508688926696777, + "learning_rate": 5e-06, + "loss": 0.7276, + "mean_token_accuracy": 0.768067479133606, + "num_tokens": 635949076.0, + "step": 24576 + }, + { + "epoch": 2.6989896771359545, + "grad_norm": 2.1859397888183594, + "learning_rate": 5e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7553938627243042, + "num_tokens": 635973852.0, + "step": 24577 + }, + { + "epoch": 2.699099494838568, + "grad_norm": 2.073639392852783, + "learning_rate": 5e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.7715904712677002, + "num_tokens": 635997887.0, + "step": 24578 + }, + { + "epoch": 2.6992093125411816, + "grad_norm": 2.0823516845703125, + "learning_rate": 5e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7627547979354858, + "num_tokens": 636025365.0, + "step": 24579 + }, + { + "epoch": 2.6993191302437953, + "grad_norm": 2.1650707721710205, + "learning_rate": 5e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7667083740234375, + "num_tokens": 636047794.0, + "step": 24580 + }, + { + "epoch": 2.699428947946409, + "grad_norm": 2.0187771320343018, + "learning_rate": 5e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.767632246017456, + "num_tokens": 636074975.0, + "step": 24581 + }, + { + "epoch": 2.699538765649023, + "grad_norm": 2.158430337905884, + "learning_rate": 5e-06, + "loss": 0.626, + "mean_token_accuracy": 0.7885223627090454, + "num_tokens": 636097466.0, + "step": 24582 + }, + { + "epoch": 2.699648583351636, + "grad_norm": 2.084014415740967, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.751457154750824, + "num_tokens": 636124216.0, + "step": 24583 + }, + { + "epoch": 2.69975840105425, + "grad_norm": 2.160752534866333, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7575202584266663, + "num_tokens": 636149624.0, + "step": 24584 + }, + { + "epoch": 2.6998682187568637, + "grad_norm": 2.04465651512146, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7659779787063599, + "num_tokens": 636175137.0, + "step": 24585 + }, + { + "epoch": 2.6999780364594774, + "grad_norm": 2.105255126953125, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7500776052474976, + "num_tokens": 636201632.0, + "step": 24586 + }, + { + "epoch": 2.700087854162091, + "grad_norm": 2.2582147121429443, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7505620718002319, + "num_tokens": 636225948.0, + "step": 24587 + }, + { + "epoch": 2.7001976718647045, + "grad_norm": 2.2399373054504395, + "learning_rate": 5e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.7873581647872925, + "num_tokens": 636245723.0, + "step": 24588 + }, + { + "epoch": 2.7003074895673183, + "grad_norm": 2.0666332244873047, + "learning_rate": 5e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7609230279922485, + "num_tokens": 636271270.0, + "step": 24589 + }, + { + "epoch": 2.700417307269932, + "grad_norm": 1.9700615406036377, + "learning_rate": 5e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7582650184631348, + "num_tokens": 636300465.0, + "step": 24590 + }, + { + "epoch": 2.7005271249725453, + "grad_norm": 2.199803113937378, + "learning_rate": 5e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7615888714790344, + "num_tokens": 636325498.0, + "step": 24591 + }, + { + "epoch": 2.700636942675159, + "grad_norm": 2.0913631916046143, + "learning_rate": 5e-06, + "loss": 0.7017, + "mean_token_accuracy": 0.7687814235687256, + "num_tokens": 636351915.0, + "step": 24592 + }, + { + "epoch": 2.700746760377773, + "grad_norm": 1.9501296281814575, + "learning_rate": 5e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7593945860862732, + "num_tokens": 636380611.0, + "step": 24593 + }, + { + "epoch": 2.7008565780803866, + "grad_norm": 2.3536763191223145, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7694320678710938, + "num_tokens": 636400376.0, + "step": 24594 + }, + { + "epoch": 2.7009663957830004, + "grad_norm": 2.0990893840789795, + "learning_rate": 5e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.7704950571060181, + "num_tokens": 636423911.0, + "step": 24595 + }, + { + "epoch": 2.7010762134856137, + "grad_norm": 2.3113350868225098, + "learning_rate": 5e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.7784968614578247, + "num_tokens": 636443817.0, + "step": 24596 + }, + { + "epoch": 2.7011860311882274, + "grad_norm": 2.2132418155670166, + "learning_rate": 5e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.7821369171142578, + "num_tokens": 636464849.0, + "step": 24597 + }, + { + "epoch": 2.701295848890841, + "grad_norm": 2.089582920074463, + "learning_rate": 5e-06, + "loss": 0.7471, + "mean_token_accuracy": 0.7571740746498108, + "num_tokens": 636490715.0, + "step": 24598 + }, + { + "epoch": 2.701405666593455, + "grad_norm": 2.180838108062744, + "learning_rate": 5e-06, + "loss": 0.7353, + "mean_token_accuracy": 0.761042058467865, + "num_tokens": 636515218.0, + "step": 24599 + }, + { + "epoch": 2.7015154842960687, + "grad_norm": 1.8892568349838257, + "learning_rate": 5e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7583922743797302, + "num_tokens": 636544071.0, + "step": 24600 + }, + { + "epoch": 2.701625301998682, + "grad_norm": 2.2531518936157227, + "learning_rate": 5e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.7969671487808228, + "num_tokens": 636566977.0, + "step": 24601 + }, + { + "epoch": 2.7017351197012958, + "grad_norm": 2.1159183979034424, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7686117887496948, + "num_tokens": 636592363.0, + "step": 24602 + }, + { + "epoch": 2.7018449374039095, + "grad_norm": 1.9647161960601807, + "learning_rate": 5e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.762808620929718, + "num_tokens": 636621525.0, + "step": 24603 + }, + { + "epoch": 2.7019547551065233, + "grad_norm": 2.0263988971710205, + "learning_rate": 5e-06, + "loss": 0.663, + "mean_token_accuracy": 0.7794196009635925, + "num_tokens": 636646135.0, + "step": 24604 + }, + { + "epoch": 2.702064572809137, + "grad_norm": 1.8166369199752808, + "learning_rate": 5e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7484578490257263, + "num_tokens": 636677158.0, + "step": 24605 + }, + { + "epoch": 2.7021743905117503, + "grad_norm": 2.1632394790649414, + "learning_rate": 5e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.7788111567497253, + "num_tokens": 636699448.0, + "step": 24606 + }, + { + "epoch": 2.702284208214364, + "grad_norm": 2.222010374069214, + "learning_rate": 5e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7542717456817627, + "num_tokens": 636725261.0, + "step": 24607 + }, + { + "epoch": 2.702394025916978, + "grad_norm": 2.2880609035491943, + "learning_rate": 5e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7740503549575806, + "num_tokens": 636745581.0, + "step": 24608 + }, + { + "epoch": 2.7025038436195916, + "grad_norm": 2.022268056869507, + "learning_rate": 5e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7703229784965515, + "num_tokens": 636770787.0, + "step": 24609 + }, + { + "epoch": 2.7026136613222054, + "grad_norm": 1.9048850536346436, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.7577527761459351, + "num_tokens": 636801816.0, + "step": 24610 + }, + { + "epoch": 2.7027234790248187, + "grad_norm": 2.1217432022094727, + "learning_rate": 5e-06, + "loss": 0.7402, + "mean_token_accuracy": 0.7573169469833374, + "num_tokens": 636826636.0, + "step": 24611 + }, + { + "epoch": 2.7028332967274324, + "grad_norm": 2.083448886871338, + "learning_rate": 5e-06, + "loss": 0.7424, + "mean_token_accuracy": 0.7560948133468628, + "num_tokens": 636854204.0, + "step": 24612 + }, + { + "epoch": 2.702943114430046, + "grad_norm": 2.1489250659942627, + "learning_rate": 5e-06, + "loss": 0.7449, + "mean_token_accuracy": 0.7531880140304565, + "num_tokens": 636879590.0, + "step": 24613 + }, + { + "epoch": 2.7030529321326595, + "grad_norm": 2.0990748405456543, + "learning_rate": 5e-06, + "loss": 0.6907, + "mean_token_accuracy": 0.7717511653900146, + "num_tokens": 636904604.0, + "step": 24614 + }, + { + "epoch": 2.7031627498352737, + "grad_norm": 2.014094114303589, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7672469615936279, + "num_tokens": 636930737.0, + "step": 24615 + }, + { + "epoch": 2.703272567537887, + "grad_norm": 2.320230007171631, + "learning_rate": 5e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7623461484909058, + "num_tokens": 636954623.0, + "step": 24616 + }, + { + "epoch": 2.703382385240501, + "grad_norm": 2.0361852645874023, + "learning_rate": 5e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7412201166152954, + "num_tokens": 636984908.0, + "step": 24617 + }, + { + "epoch": 2.7034922029431145, + "grad_norm": 2.034053087234497, + "learning_rate": 5e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7740452885627747, + "num_tokens": 637011406.0, + "step": 24618 + }, + { + "epoch": 2.703602020645728, + "grad_norm": 2.322680950164795, + "learning_rate": 5e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7751505970954895, + "num_tokens": 637032009.0, + "step": 24619 + }, + { + "epoch": 2.7037118383483416, + "grad_norm": 2.160663604736328, + "learning_rate": 5e-06, + "loss": 0.6989, + "mean_token_accuracy": 0.7656404972076416, + "num_tokens": 637056441.0, + "step": 24620 + }, + { + "epoch": 2.7038216560509554, + "grad_norm": 2.1054227352142334, + "learning_rate": 5e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7771899700164795, + "num_tokens": 637080323.0, + "step": 24621 + }, + { + "epoch": 2.703931473753569, + "grad_norm": 2.0642833709716797, + "learning_rate": 5e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7449214458465576, + "num_tokens": 637107857.0, + "step": 24622 + }, + { + "epoch": 2.704041291456183, + "grad_norm": 2.15108060836792, + "learning_rate": 5e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7606720328330994, + "num_tokens": 637132409.0, + "step": 24623 + }, + { + "epoch": 2.704151109158796, + "grad_norm": 2.1979880332946777, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7655224800109863, + "num_tokens": 637156851.0, + "step": 24624 + }, + { + "epoch": 2.70426092686141, + "grad_norm": 2.185767889022827, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7697063684463501, + "num_tokens": 637180925.0, + "step": 24625 + }, + { + "epoch": 2.7043707445640237, + "grad_norm": 1.9844647645950317, + "learning_rate": 5e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7381560206413269, + "num_tokens": 637211116.0, + "step": 24626 + }, + { + "epoch": 2.7044805622666375, + "grad_norm": 2.1522481441497803, + "learning_rate": 5e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.7598721981048584, + "num_tokens": 637235605.0, + "step": 24627 + }, + { + "epoch": 2.7045903799692512, + "grad_norm": 2.1321592330932617, + "learning_rate": 5e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7569102644920349, + "num_tokens": 637260207.0, + "step": 24628 + }, + { + "epoch": 2.7047001976718645, + "grad_norm": 2.137037992477417, + "learning_rate": 5e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7547659873962402, + "num_tokens": 637284761.0, + "step": 24629 + }, + { + "epoch": 2.7048100153744783, + "grad_norm": 2.009216785430908, + "learning_rate": 5e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.7464779615402222, + "num_tokens": 637312859.0, + "step": 24630 + }, + { + "epoch": 2.704919833077092, + "grad_norm": 1.9918406009674072, + "learning_rate": 5e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7676786780357361, + "num_tokens": 637337622.0, + "step": 24631 + }, + { + "epoch": 2.705029650779706, + "grad_norm": 2.1105802059173584, + "learning_rate": 5e-06, + "loss": 0.6368, + "mean_token_accuracy": 0.7869842052459717, + "num_tokens": 637361222.0, + "step": 24632 + }, + { + "epoch": 2.7051394684823196, + "grad_norm": 1.9619024991989136, + "learning_rate": 5e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.7565742135047913, + "num_tokens": 637390994.0, + "step": 24633 + }, + { + "epoch": 2.705249286184933, + "grad_norm": 1.8939149379730225, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7612763047218323, + "num_tokens": 637419398.0, + "step": 24634 + }, + { + "epoch": 2.7053591038875466, + "grad_norm": 2.257704257965088, + "learning_rate": 5e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.7727615237236023, + "num_tokens": 637440549.0, + "step": 24635 + }, + { + "epoch": 2.7054689215901604, + "grad_norm": 2.347522497177124, + "learning_rate": 5e-06, + "loss": 0.6389, + "mean_token_accuracy": 0.7808808088302612, + "num_tokens": 637461712.0, + "step": 24636 + }, + { + "epoch": 2.705578739292774, + "grad_norm": 2.0029280185699463, + "learning_rate": 5e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7441703081130981, + "num_tokens": 637492004.0, + "step": 24637 + }, + { + "epoch": 2.705688556995388, + "grad_norm": 1.9653961658477783, + "learning_rate": 5e-06, + "loss": 0.6767, + "mean_token_accuracy": 0.7781441807746887, + "num_tokens": 637518722.0, + "step": 24638 + }, + { + "epoch": 2.705798374698001, + "grad_norm": 2.189011573791504, + "learning_rate": 5e-06, + "loss": 0.657, + "mean_token_accuracy": 0.7824772000312805, + "num_tokens": 637540278.0, + "step": 24639 + }, + { + "epoch": 2.705908192400615, + "grad_norm": 2.0741047859191895, + "learning_rate": 5e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7576401233673096, + "num_tokens": 637565462.0, + "step": 24640 + }, + { + "epoch": 2.7060180101032287, + "grad_norm": 1.9927444458007812, + "learning_rate": 5e-06, + "loss": 0.7279, + "mean_token_accuracy": 0.7608857154846191, + "num_tokens": 637597232.0, + "step": 24641 + }, + { + "epoch": 2.706127827805842, + "grad_norm": 2.0998499393463135, + "learning_rate": 5e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.7805845141410828, + "num_tokens": 637621995.0, + "step": 24642 + }, + { + "epoch": 2.706237645508456, + "grad_norm": 2.0301601886749268, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7554980516433716, + "num_tokens": 637649367.0, + "step": 24643 + }, + { + "epoch": 2.7063474632110696, + "grad_norm": 2.122342348098755, + "learning_rate": 5e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7624058723449707, + "num_tokens": 637673204.0, + "step": 24644 + }, + { + "epoch": 2.7064572809136833, + "grad_norm": 1.948662519454956, + "learning_rate": 5e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.7804129719734192, + "num_tokens": 637699574.0, + "step": 24645 + }, + { + "epoch": 2.706567098616297, + "grad_norm": 2.037649393081665, + "learning_rate": 5e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.7759192585945129, + "num_tokens": 637724889.0, + "step": 24646 + }, + { + "epoch": 2.7066769163189104, + "grad_norm": 2.111586093902588, + "learning_rate": 5e-06, + "loss": 0.7058, + "mean_token_accuracy": 0.7678737044334412, + "num_tokens": 637748383.0, + "step": 24647 + }, + { + "epoch": 2.706786734021524, + "grad_norm": 2.13041090965271, + "learning_rate": 5e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7644696235656738, + "num_tokens": 637772336.0, + "step": 24648 + }, + { + "epoch": 2.706896551724138, + "grad_norm": 1.9877598285675049, + "learning_rate": 5e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7636914253234863, + "num_tokens": 637800970.0, + "step": 24649 + }, + { + "epoch": 2.7070063694267517, + "grad_norm": 2.152073621749878, + "learning_rate": 5e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7774677276611328, + "num_tokens": 637823264.0, + "step": 24650 + }, + { + "epoch": 2.7071161871293654, + "grad_norm": 2.1854662895202637, + "learning_rate": 5e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7672455310821533, + "num_tokens": 637847645.0, + "step": 24651 + }, + { + "epoch": 2.7072260048319787, + "grad_norm": 2.0613114833831787, + "learning_rate": 5e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7492594718933105, + "num_tokens": 637874050.0, + "step": 24652 + }, + { + "epoch": 2.7073358225345925, + "grad_norm": 1.9297189712524414, + "learning_rate": 5e-06, + "loss": 0.613, + "mean_token_accuracy": 0.7943098545074463, + "num_tokens": 637900241.0, + "step": 24653 + }, + { + "epoch": 2.7074456402372062, + "grad_norm": 2.241159439086914, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7676249146461487, + "num_tokens": 637923715.0, + "step": 24654 + }, + { + "epoch": 2.70755545793982, + "grad_norm": 2.399225950241089, + "learning_rate": 5e-06, + "loss": 0.6574, + "mean_token_accuracy": 0.777334451675415, + "num_tokens": 637944487.0, + "step": 24655 + }, + { + "epoch": 2.7076652756424338, + "grad_norm": 2.05898118019104, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.744953989982605, + "num_tokens": 637972252.0, + "step": 24656 + }, + { + "epoch": 2.707775093345047, + "grad_norm": 2.250053644180298, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7651596069335938, + "num_tokens": 637995282.0, + "step": 24657 + }, + { + "epoch": 2.707884911047661, + "grad_norm": 2.102407217025757, + "learning_rate": 5e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.767989456653595, + "num_tokens": 638020321.0, + "step": 24658 + }, + { + "epoch": 2.7079947287502746, + "grad_norm": 2.1699142456054688, + "learning_rate": 5e-06, + "loss": 0.7232, + "mean_token_accuracy": 0.7606808543205261, + "num_tokens": 638044867.0, + "step": 24659 + }, + { + "epoch": 2.7081045464528883, + "grad_norm": 1.896572470664978, + "learning_rate": 5e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7453582882881165, + "num_tokens": 638078677.0, + "step": 24660 + }, + { + "epoch": 2.708214364155502, + "grad_norm": 1.9286531209945679, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7444474697113037, + "num_tokens": 638109277.0, + "step": 24661 + }, + { + "epoch": 2.7083241818581154, + "grad_norm": 2.3735764026641846, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7562166452407837, + "num_tokens": 638132343.0, + "step": 24662 + }, + { + "epoch": 2.708433999560729, + "grad_norm": 1.96005380153656, + "learning_rate": 5e-06, + "loss": 0.655, + "mean_token_accuracy": 0.7861118316650391, + "num_tokens": 638157799.0, + "step": 24663 + }, + { + "epoch": 2.708543817263343, + "grad_norm": 1.8689451217651367, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7421886324882507, + "num_tokens": 638188526.0, + "step": 24664 + }, + { + "epoch": 2.7086536349659562, + "grad_norm": 2.0618462562561035, + "learning_rate": 5e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7600351572036743, + "num_tokens": 638213877.0, + "step": 24665 + }, + { + "epoch": 2.7087634526685704, + "grad_norm": 2.025085687637329, + "learning_rate": 5e-06, + "loss": 0.6741, + "mean_token_accuracy": 0.7788326740264893, + "num_tokens": 638238173.0, + "step": 24666 + }, + { + "epoch": 2.7088732703711838, + "grad_norm": 1.933121681213379, + "learning_rate": 5e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7455031275749207, + "num_tokens": 638267784.0, + "step": 24667 + }, + { + "epoch": 2.7089830880737975, + "grad_norm": 1.909103274345398, + "learning_rate": 5e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.751008152961731, + "num_tokens": 638297666.0, + "step": 24668 + }, + { + "epoch": 2.7090929057764113, + "grad_norm": 2.440577507019043, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7655739784240723, + "num_tokens": 638317838.0, + "step": 24669 + }, + { + "epoch": 2.7092027234790246, + "grad_norm": 2.0171635150909424, + "learning_rate": 5e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7484565377235413, + "num_tokens": 638344895.0, + "step": 24670 + }, + { + "epoch": 2.7093125411816383, + "grad_norm": 2.1688828468322754, + "learning_rate": 5e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7650727033615112, + "num_tokens": 638368902.0, + "step": 24671 + }, + { + "epoch": 2.709422358884252, + "grad_norm": 1.9466627836227417, + "learning_rate": 5e-06, + "loss": 0.6175, + "mean_token_accuracy": 0.7903624773025513, + "num_tokens": 638395943.0, + "step": 24672 + }, + { + "epoch": 2.709532176586866, + "grad_norm": 2.497992992401123, + "learning_rate": 5e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7562742233276367, + "num_tokens": 638416703.0, + "step": 24673 + }, + { + "epoch": 2.7096419942894796, + "grad_norm": 2.4495115280151367, + "learning_rate": 5e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.7855688333511353, + "num_tokens": 638436653.0, + "step": 24674 + }, + { + "epoch": 2.709751811992093, + "grad_norm": 2.096353054046631, + "learning_rate": 5e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7620696425437927, + "num_tokens": 638461905.0, + "step": 24675 + }, + { + "epoch": 2.7098616296947067, + "grad_norm": 2.404188394546509, + "learning_rate": 5e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.741783082485199, + "num_tokens": 638487949.0, + "step": 24676 + }, + { + "epoch": 2.7099714473973204, + "grad_norm": 1.9486030340194702, + "learning_rate": 5e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7738993167877197, + "num_tokens": 638515603.0, + "step": 24677 + }, + { + "epoch": 2.710081265099934, + "grad_norm": 2.073007583618164, + "learning_rate": 5e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7528054118156433, + "num_tokens": 638542392.0, + "step": 24678 + }, + { + "epoch": 2.710191082802548, + "grad_norm": 2.045933485031128, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7712012529373169, + "num_tokens": 638568361.0, + "step": 24679 + }, + { + "epoch": 2.7103009005051613, + "grad_norm": 2.161754608154297, + "learning_rate": 5e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7615691423416138, + "num_tokens": 638594176.0, + "step": 24680 + }, + { + "epoch": 2.710410718207775, + "grad_norm": 2.0376970767974854, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7655798196792603, + "num_tokens": 638621167.0, + "step": 24681 + }, + { + "epoch": 2.7105205359103888, + "grad_norm": 1.9494404792785645, + "learning_rate": 5e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.753504753112793, + "num_tokens": 638649123.0, + "step": 24682 + }, + { + "epoch": 2.7106303536130025, + "grad_norm": 1.9704647064208984, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7367415428161621, + "num_tokens": 638678289.0, + "step": 24683 + }, + { + "epoch": 2.7107401713156163, + "grad_norm": 2.003000259399414, + "learning_rate": 5e-06, + "loss": 0.7262, + "mean_token_accuracy": 0.764624834060669, + "num_tokens": 638705409.0, + "step": 24684 + }, + { + "epoch": 2.7108499890182296, + "grad_norm": 2.005244493484497, + "learning_rate": 5e-06, + "loss": 0.7004, + "mean_token_accuracy": 0.7665033340454102, + "num_tokens": 638732290.0, + "step": 24685 + }, + { + "epoch": 2.7109598067208434, + "grad_norm": 2.6284165382385254, + "learning_rate": 5e-06, + "loss": 0.6137, + "mean_token_accuracy": 0.7961537837982178, + "num_tokens": 638749110.0, + "step": 24686 + }, + { + "epoch": 2.711069624423457, + "grad_norm": 1.980881690979004, + "learning_rate": 5e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.738056480884552, + "num_tokens": 638780250.0, + "step": 24687 + }, + { + "epoch": 2.711179442126071, + "grad_norm": 2.3181216716766357, + "learning_rate": 5e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7671537399291992, + "num_tokens": 638802763.0, + "step": 24688 + }, + { + "epoch": 2.7112892598286846, + "grad_norm": 2.3579554557800293, + "learning_rate": 5e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.7692543268203735, + "num_tokens": 638824638.0, + "step": 24689 + }, + { + "epoch": 2.711399077531298, + "grad_norm": 2.063359022140503, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7559245228767395, + "num_tokens": 638852440.0, + "step": 24690 + }, + { + "epoch": 2.7115088952339117, + "grad_norm": 2.3849616050720215, + "learning_rate": 5e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.7584781646728516, + "num_tokens": 638870843.0, + "step": 24691 + }, + { + "epoch": 2.7116187129365255, + "grad_norm": 2.012568712234497, + "learning_rate": 5e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7718181610107422, + "num_tokens": 638897532.0, + "step": 24692 + }, + { + "epoch": 2.7117285306391388, + "grad_norm": 1.9830855131149292, + "learning_rate": 5e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7365577816963196, + "num_tokens": 638929481.0, + "step": 24693 + }, + { + "epoch": 2.7118383483417525, + "grad_norm": 1.9974712133407593, + "learning_rate": 5e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7391314506530762, + "num_tokens": 638959731.0, + "step": 24694 + }, + { + "epoch": 2.7119481660443663, + "grad_norm": 1.899552583694458, + "learning_rate": 5e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7417596578598022, + "num_tokens": 638987910.0, + "step": 24695 + }, + { + "epoch": 2.71205798374698, + "grad_norm": 1.8837296962738037, + "learning_rate": 5e-06, + "loss": 0.6575, + "mean_token_accuracy": 0.7812440991401672, + "num_tokens": 639016674.0, + "step": 24696 + }, + { + "epoch": 2.712167801449594, + "grad_norm": 1.8616118431091309, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.758913516998291, + "num_tokens": 639049040.0, + "step": 24697 + }, + { + "epoch": 2.712277619152207, + "grad_norm": 1.8091256618499756, + "learning_rate": 5e-06, + "loss": 0.6232, + "mean_token_accuracy": 0.7970805764198303, + "num_tokens": 639078608.0, + "step": 24698 + }, + { + "epoch": 2.712387436854821, + "grad_norm": 2.156881809234619, + "learning_rate": 5e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7510208487510681, + "num_tokens": 639101718.0, + "step": 24699 + }, + { + "epoch": 2.7124972545574346, + "grad_norm": 1.9377690553665161, + "learning_rate": 5e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7559494972229004, + "num_tokens": 639129679.0, + "step": 24700 + }, + { + "epoch": 2.7126070722600484, + "grad_norm": 2.375389337539673, + "learning_rate": 5e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7601906657218933, + "num_tokens": 639152927.0, + "step": 24701 + }, + { + "epoch": 2.712716889962662, + "grad_norm": 2.313809871673584, + "learning_rate": 5e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.756547212600708, + "num_tokens": 639177321.0, + "step": 24702 + }, + { + "epoch": 2.7128267076652754, + "grad_norm": 2.1265788078308105, + "learning_rate": 5e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7640864253044128, + "num_tokens": 639201121.0, + "step": 24703 + }, + { + "epoch": 2.712936525367889, + "grad_norm": 2.1238481998443604, + "learning_rate": 5e-06, + "loss": 0.7455, + "mean_token_accuracy": 0.752814531326294, + "num_tokens": 639227160.0, + "step": 24704 + }, + { + "epoch": 2.713046343070503, + "grad_norm": 1.849812626838684, + "learning_rate": 5e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7514093518257141, + "num_tokens": 639259351.0, + "step": 24705 + }, + { + "epoch": 2.7131561607731167, + "grad_norm": 2.153496265411377, + "learning_rate": 5e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.759854793548584, + "num_tokens": 639283874.0, + "step": 24706 + }, + { + "epoch": 2.7132659784757305, + "grad_norm": 2.1086997985839844, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7536295652389526, + "num_tokens": 639309447.0, + "step": 24707 + }, + { + "epoch": 2.713375796178344, + "grad_norm": 2.0648601055145264, + "learning_rate": 5e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7726608514785767, + "num_tokens": 639334696.0, + "step": 24708 + }, + { + "epoch": 2.7134856138809575, + "grad_norm": 2.157257556915283, + "learning_rate": 5e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.7635247707366943, + "num_tokens": 639359507.0, + "step": 24709 + }, + { + "epoch": 2.7135954315835713, + "grad_norm": 2.031831979751587, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7533442974090576, + "num_tokens": 639383618.0, + "step": 24710 + }, + { + "epoch": 2.713705249286185, + "grad_norm": 2.3586323261260986, + "learning_rate": 5e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7677135467529297, + "num_tokens": 639406556.0, + "step": 24711 + }, + { + "epoch": 2.713815066988799, + "grad_norm": 1.9849828481674194, + "learning_rate": 5e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.7662786245346069, + "num_tokens": 639432988.0, + "step": 24712 + }, + { + "epoch": 2.713924884691412, + "grad_norm": 2.183288097381592, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7587788105010986, + "num_tokens": 639455018.0, + "step": 24713 + }, + { + "epoch": 2.714034702394026, + "grad_norm": 1.9309626817703247, + "learning_rate": 5e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7489757537841797, + "num_tokens": 639485684.0, + "step": 24714 + }, + { + "epoch": 2.7141445200966396, + "grad_norm": 2.051064968109131, + "learning_rate": 5e-06, + "loss": 0.7298, + "mean_token_accuracy": 0.7585709691047668, + "num_tokens": 639511330.0, + "step": 24715 + }, + { + "epoch": 2.7142543377992534, + "grad_norm": 1.9184563159942627, + "learning_rate": 5e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.7812014222145081, + "num_tokens": 639538055.0, + "step": 24716 + }, + { + "epoch": 2.714364155501867, + "grad_norm": 1.7962990999221802, + "learning_rate": 5e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7615168690681458, + "num_tokens": 639570658.0, + "step": 24717 + }, + { + "epoch": 2.7144739732044805, + "grad_norm": 2.3076534271240234, + "learning_rate": 5e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7650554180145264, + "num_tokens": 639593776.0, + "step": 24718 + }, + { + "epoch": 2.7145837909070942, + "grad_norm": 2.069056510925293, + "learning_rate": 5e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7344075441360474, + "num_tokens": 639619783.0, + "step": 24719 + }, + { + "epoch": 2.714693608609708, + "grad_norm": 2.1385414600372314, + "learning_rate": 5e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7664589285850525, + "num_tokens": 639642151.0, + "step": 24720 + }, + { + "epoch": 2.7148034263123213, + "grad_norm": 1.7881051301956177, + "learning_rate": 5e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7526421546936035, + "num_tokens": 639676806.0, + "step": 24721 + }, + { + "epoch": 2.714913244014935, + "grad_norm": 1.9659780263900757, + "learning_rate": 5e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7491099238395691, + "num_tokens": 639705195.0, + "step": 24722 + }, + { + "epoch": 2.715023061717549, + "grad_norm": 1.837952733039856, + "learning_rate": 5e-06, + "loss": 0.7656, + "mean_token_accuracy": 0.7513543963432312, + "num_tokens": 639735544.0, + "step": 24723 + }, + { + "epoch": 2.7151328794201626, + "grad_norm": 2.1683189868927, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.736996054649353, + "num_tokens": 639759668.0, + "step": 24724 + }, + { + "epoch": 2.7152426971227763, + "grad_norm": 2.232635736465454, + "learning_rate": 5e-06, + "loss": 0.6731, + "mean_token_accuracy": 0.778867244720459, + "num_tokens": 639782157.0, + "step": 24725 + }, + { + "epoch": 2.7153525148253896, + "grad_norm": 1.8945128917694092, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7528441548347473, + "num_tokens": 639814674.0, + "step": 24726 + }, + { + "epoch": 2.7154623325280034, + "grad_norm": 2.3822147846221924, + "learning_rate": 5e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.7838048934936523, + "num_tokens": 639834842.0, + "step": 24727 + }, + { + "epoch": 2.715572150230617, + "grad_norm": 2.143996238708496, + "learning_rate": 5e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7587912082672119, + "num_tokens": 639859147.0, + "step": 24728 + }, + { + "epoch": 2.715681967933231, + "grad_norm": 2.071096897125244, + "learning_rate": 5e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7650326490402222, + "num_tokens": 639884883.0, + "step": 24729 + }, + { + "epoch": 2.7157917856358447, + "grad_norm": 2.0747897624969482, + "learning_rate": 5e-06, + "loss": 0.7013, + "mean_token_accuracy": 0.7656106948852539, + "num_tokens": 639910558.0, + "step": 24730 + }, + { + "epoch": 2.715901603338458, + "grad_norm": 1.940946340560913, + "learning_rate": 5e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.754736065864563, + "num_tokens": 639940062.0, + "step": 24731 + }, + { + "epoch": 2.7160114210410717, + "grad_norm": 1.8869980573654175, + "learning_rate": 5e-06, + "loss": 0.634, + "mean_token_accuracy": 0.7867996692657471, + "num_tokens": 639968340.0, + "step": 24732 + }, + { + "epoch": 2.7161212387436855, + "grad_norm": 2.286832094192505, + "learning_rate": 5e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7649532556533813, + "num_tokens": 639990287.0, + "step": 24733 + }, + { + "epoch": 2.7162310564462993, + "grad_norm": 1.9312535524368286, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.773324191570282, + "num_tokens": 640015797.0, + "step": 24734 + }, + { + "epoch": 2.716340874148913, + "grad_norm": 1.8708254098892212, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7649171352386475, + "num_tokens": 640046061.0, + "step": 24735 + }, + { + "epoch": 2.7164506918515263, + "grad_norm": 2.025510549545288, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7439028024673462, + "num_tokens": 640072815.0, + "step": 24736 + }, + { + "epoch": 2.71656050955414, + "grad_norm": 1.9899086952209473, + "learning_rate": 5e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7460198402404785, + "num_tokens": 640101941.0, + "step": 24737 + }, + { + "epoch": 2.716670327256754, + "grad_norm": 2.1477317810058594, + "learning_rate": 5e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7702928185462952, + "num_tokens": 640125857.0, + "step": 24738 + }, + { + "epoch": 2.7167801449593676, + "grad_norm": 2.0378916263580322, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7663127779960632, + "num_tokens": 640155020.0, + "step": 24739 + }, + { + "epoch": 2.7168899626619814, + "grad_norm": 1.9607056379318237, + "learning_rate": 5e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.771422803401947, + "num_tokens": 640182579.0, + "step": 24740 + }, + { + "epoch": 2.7169997803645947, + "grad_norm": 1.8949283361434937, + "learning_rate": 5e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7484383583068848, + "num_tokens": 640212674.0, + "step": 24741 + }, + { + "epoch": 2.7171095980672084, + "grad_norm": 2.4393856525421143, + "learning_rate": 5e-06, + "loss": 0.6309, + "mean_token_accuracy": 0.7834041118621826, + "num_tokens": 640231073.0, + "step": 24742 + }, + { + "epoch": 2.717219415769822, + "grad_norm": 1.9732544422149658, + "learning_rate": 5e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7409908771514893, + "num_tokens": 640260553.0, + "step": 24743 + }, + { + "epoch": 2.7173292334724355, + "grad_norm": 2.253552198410034, + "learning_rate": 5e-06, + "loss": 0.6509, + "mean_token_accuracy": 0.7804293632507324, + "num_tokens": 640283130.0, + "step": 24744 + }, + { + "epoch": 2.7174390511750497, + "grad_norm": 2.270172595977783, + "learning_rate": 5e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7717187404632568, + "num_tokens": 640307480.0, + "step": 24745 + }, + { + "epoch": 2.717548868877663, + "grad_norm": 2.2137298583984375, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7606551647186279, + "num_tokens": 640332638.0, + "step": 24746 + }, + { + "epoch": 2.7176586865802768, + "grad_norm": 1.9306349754333496, + "learning_rate": 5e-06, + "loss": 0.6563, + "mean_token_accuracy": 0.7787891626358032, + "num_tokens": 640360529.0, + "step": 24747 + }, + { + "epoch": 2.7177685042828905, + "grad_norm": 2.3751628398895264, + "learning_rate": 5e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7716615200042725, + "num_tokens": 640381985.0, + "step": 24748 + }, + { + "epoch": 2.717878321985504, + "grad_norm": 2.0272128582000732, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7506917715072632, + "num_tokens": 640412518.0, + "step": 24749 + }, + { + "epoch": 2.7179881396881176, + "grad_norm": 2.1775004863739014, + "learning_rate": 5e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.7619035243988037, + "num_tokens": 640436793.0, + "step": 24750 + }, + { + "epoch": 2.7180979573907313, + "grad_norm": 1.9704594612121582, + "learning_rate": 5e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7641806602478027, + "num_tokens": 640462370.0, + "step": 24751 + }, + { + "epoch": 2.718207775093345, + "grad_norm": 1.8775579929351807, + "learning_rate": 5e-06, + "loss": 0.6894, + "mean_token_accuracy": 0.7732183933258057, + "num_tokens": 640491733.0, + "step": 24752 + }, + { + "epoch": 2.718317592795959, + "grad_norm": 2.0169153213500977, + "learning_rate": 5e-06, + "loss": 0.6904, + "mean_token_accuracy": 0.7724161148071289, + "num_tokens": 640517565.0, + "step": 24753 + }, + { + "epoch": 2.718427410498572, + "grad_norm": 2.0334579944610596, + "learning_rate": 5e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.7726151347160339, + "num_tokens": 640543268.0, + "step": 24754 + }, + { + "epoch": 2.718537228201186, + "grad_norm": 2.2462830543518066, + "learning_rate": 5e-06, + "loss": 0.6728, + "mean_token_accuracy": 0.7755844593048096, + "num_tokens": 640564450.0, + "step": 24755 + }, + { + "epoch": 2.7186470459037997, + "grad_norm": 1.9511690139770508, + "learning_rate": 5e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7742148637771606, + "num_tokens": 640592959.0, + "step": 24756 + }, + { + "epoch": 2.7187568636064134, + "grad_norm": 1.8429818153381348, + "learning_rate": 5e-06, + "loss": 0.696, + "mean_token_accuracy": 0.7743023633956909, + "num_tokens": 640621073.0, + "step": 24757 + }, + { + "epoch": 2.718866681309027, + "grad_norm": 1.9901955127716064, + "learning_rate": 5e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7485978603363037, + "num_tokens": 640648686.0, + "step": 24758 + }, + { + "epoch": 2.7189764990116405, + "grad_norm": 1.7539881467819214, + "learning_rate": 5e-06, + "loss": 0.6527, + "mean_token_accuracy": 0.7875065803527832, + "num_tokens": 640683540.0, + "step": 24759 + }, + { + "epoch": 2.7190863167142543, + "grad_norm": 1.8118500709533691, + "learning_rate": 5e-06, + "loss": 0.6644, + "mean_token_accuracy": 0.776532769203186, + "num_tokens": 640715222.0, + "step": 24760 + }, + { + "epoch": 2.719196134416868, + "grad_norm": 2.1137895584106445, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7774917483329773, + "num_tokens": 640738320.0, + "step": 24761 + }, + { + "epoch": 2.719305952119482, + "grad_norm": 2.041438579559326, + "learning_rate": 5e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7470231056213379, + "num_tokens": 640764524.0, + "step": 24762 + }, + { + "epoch": 2.7194157698220955, + "grad_norm": 2.2787816524505615, + "learning_rate": 5e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7530984878540039, + "num_tokens": 640787972.0, + "step": 24763 + }, + { + "epoch": 2.719525587524709, + "grad_norm": 2.1553473472595215, + "learning_rate": 5e-06, + "loss": 0.6706, + "mean_token_accuracy": 0.7824310660362244, + "num_tokens": 640809726.0, + "step": 24764 + }, + { + "epoch": 2.7196354052273226, + "grad_norm": 2.389415740966797, + "learning_rate": 5e-06, + "loss": 0.689, + "mean_token_accuracy": 0.7814978361129761, + "num_tokens": 640830447.0, + "step": 24765 + }, + { + "epoch": 2.7197452229299364, + "grad_norm": 2.0795907974243164, + "learning_rate": 5e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7401127815246582, + "num_tokens": 640858708.0, + "step": 24766 + }, + { + "epoch": 2.71985504063255, + "grad_norm": 2.089407444000244, + "learning_rate": 5e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7553874850273132, + "num_tokens": 640884641.0, + "step": 24767 + }, + { + "epoch": 2.719964858335164, + "grad_norm": 1.981753945350647, + "learning_rate": 5e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.7717831134796143, + "num_tokens": 640911473.0, + "step": 24768 + }, + { + "epoch": 2.720074676037777, + "grad_norm": 1.7827786207199097, + "learning_rate": 5e-06, + "loss": 0.6779, + "mean_token_accuracy": 0.7689805626869202, + "num_tokens": 640943587.0, + "step": 24769 + }, + { + "epoch": 2.720184493740391, + "grad_norm": 2.1746339797973633, + "learning_rate": 5e-06, + "loss": 0.7203, + "mean_token_accuracy": 0.7642536759376526, + "num_tokens": 640966853.0, + "step": 24770 + }, + { + "epoch": 2.7202943114430047, + "grad_norm": 2.2799596786499023, + "learning_rate": 5e-06, + "loss": 0.6321, + "mean_token_accuracy": 0.7864265441894531, + "num_tokens": 640988081.0, + "step": 24771 + }, + { + "epoch": 2.720404129145618, + "grad_norm": 1.9123985767364502, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.755474328994751, + "num_tokens": 641018461.0, + "step": 24772 + }, + { + "epoch": 2.720513946848232, + "grad_norm": 2.0236966609954834, + "learning_rate": 5e-06, + "loss": 0.6637, + "mean_token_accuracy": 0.7867296934127808, + "num_tokens": 641043509.0, + "step": 24773 + }, + { + "epoch": 2.7206237645508455, + "grad_norm": 2.295572519302368, + "learning_rate": 5e-06, + "loss": 0.6565, + "mean_token_accuracy": 0.7845290303230286, + "num_tokens": 641064619.0, + "step": 24774 + }, + { + "epoch": 2.7207335822534593, + "grad_norm": 2.4426262378692627, + "learning_rate": 5e-06, + "loss": 0.661, + "mean_token_accuracy": 0.7796187400817871, + "num_tokens": 641083837.0, + "step": 24775 + }, + { + "epoch": 2.720843399956073, + "grad_norm": 2.000136375427246, + "learning_rate": 5e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.764601469039917, + "num_tokens": 641110532.0, + "step": 24776 + }, + { + "epoch": 2.7209532176586864, + "grad_norm": 2.133415460586548, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7589197754859924, + "num_tokens": 641135760.0, + "step": 24777 + }, + { + "epoch": 2.7210630353613, + "grad_norm": 1.9233874082565308, + "learning_rate": 5e-06, + "loss": 0.6588, + "mean_token_accuracy": 0.7778968811035156, + "num_tokens": 641163500.0, + "step": 24778 + }, + { + "epoch": 2.721172853063914, + "grad_norm": 2.156406879425049, + "learning_rate": 5e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7599083781242371, + "num_tokens": 641189754.0, + "step": 24779 + }, + { + "epoch": 2.7212826707665276, + "grad_norm": 2.135486364364624, + "learning_rate": 5e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.76398104429245, + "num_tokens": 641213203.0, + "step": 24780 + }, + { + "epoch": 2.7213924884691414, + "grad_norm": 2.2298600673675537, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7686269283294678, + "num_tokens": 641234509.0, + "step": 24781 + }, + { + "epoch": 2.7215023061717547, + "grad_norm": 2.108987808227539, + "learning_rate": 5e-06, + "loss": 0.6838, + "mean_token_accuracy": 0.7712429761886597, + "num_tokens": 641256920.0, + "step": 24782 + }, + { + "epoch": 2.7216121238743685, + "grad_norm": 2.1580440998077393, + "learning_rate": 5e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7584112882614136, + "num_tokens": 641280840.0, + "step": 24783 + }, + { + "epoch": 2.721721941576982, + "grad_norm": 1.9624685049057007, + "learning_rate": 5e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7502375841140747, + "num_tokens": 641310460.0, + "step": 24784 + }, + { + "epoch": 2.721831759279596, + "grad_norm": 1.8991074562072754, + "learning_rate": 5e-06, + "loss": 0.6518, + "mean_token_accuracy": 0.7851129174232483, + "num_tokens": 641337768.0, + "step": 24785 + }, + { + "epoch": 2.7219415769822097, + "grad_norm": 2.2564098834991455, + "learning_rate": 5e-06, + "loss": 0.6393, + "mean_token_accuracy": 0.7847698330879211, + "num_tokens": 641359400.0, + "step": 24786 + }, + { + "epoch": 2.722051394684823, + "grad_norm": 1.9634299278259277, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7514997720718384, + "num_tokens": 641386771.0, + "step": 24787 + }, + { + "epoch": 2.722161212387437, + "grad_norm": 1.8388895988464355, + "learning_rate": 5e-06, + "loss": 0.7215, + "mean_token_accuracy": 0.7584666013717651, + "num_tokens": 641416775.0, + "step": 24788 + }, + { + "epoch": 2.7222710300900506, + "grad_norm": 1.9936952590942383, + "learning_rate": 5e-06, + "loss": 0.7191, + "mean_token_accuracy": 0.7605098485946655, + "num_tokens": 641444485.0, + "step": 24789 + }, + { + "epoch": 2.7223808477926643, + "grad_norm": 2.0118184089660645, + "learning_rate": 5e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7453054785728455, + "num_tokens": 641473218.0, + "step": 24790 + }, + { + "epoch": 2.722490665495278, + "grad_norm": 1.8874388933181763, + "learning_rate": 5e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7661449909210205, + "num_tokens": 641502613.0, + "step": 24791 + }, + { + "epoch": 2.7226004831978914, + "grad_norm": 2.1315510272979736, + "learning_rate": 5e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7567994594573975, + "num_tokens": 641526981.0, + "step": 24792 + }, + { + "epoch": 2.722710300900505, + "grad_norm": 2.1080052852630615, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7742559313774109, + "num_tokens": 641548927.0, + "step": 24793 + }, + { + "epoch": 2.722820118603119, + "grad_norm": 1.87800931930542, + "learning_rate": 5e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.7517865300178528, + "num_tokens": 641577881.0, + "step": 24794 + }, + { + "epoch": 2.722929936305732, + "grad_norm": 2.0100221633911133, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7667170166969299, + "num_tokens": 641605162.0, + "step": 24795 + }, + { + "epoch": 2.7230397540083464, + "grad_norm": 2.20864200592041, + "learning_rate": 5e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.763152539730072, + "num_tokens": 641630691.0, + "step": 24796 + }, + { + "epoch": 2.7231495717109597, + "grad_norm": 2.193098783493042, + "learning_rate": 5e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.7857388854026794, + "num_tokens": 641652482.0, + "step": 24797 + }, + { + "epoch": 2.7232593894135735, + "grad_norm": 2.1882705688476562, + "learning_rate": 5e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.775830864906311, + "num_tokens": 641673671.0, + "step": 24798 + }, + { + "epoch": 2.7233692071161872, + "grad_norm": 2.178588390350342, + "learning_rate": 5e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.7740542888641357, + "num_tokens": 641696619.0, + "step": 24799 + }, + { + "epoch": 2.7234790248188006, + "grad_norm": 2.2049431800842285, + "learning_rate": 5e-06, + "loss": 0.6616, + "mean_token_accuracy": 0.7747208476066589, + "num_tokens": 641720688.0, + "step": 24800 + }, + { + "epoch": 2.7235888425214143, + "grad_norm": 1.9733948707580566, + "learning_rate": 5e-06, + "loss": 0.6302, + "mean_token_accuracy": 0.7870388627052307, + "num_tokens": 641748734.0, + "step": 24801 + }, + { + "epoch": 2.723698660224028, + "grad_norm": 1.9390565156936646, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7680627703666687, + "num_tokens": 641776618.0, + "step": 24802 + }, + { + "epoch": 2.723808477926642, + "grad_norm": 1.9727509021759033, + "learning_rate": 5e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7736660242080688, + "num_tokens": 641803057.0, + "step": 24803 + }, + { + "epoch": 2.7239182956292556, + "grad_norm": 2.2495036125183105, + "learning_rate": 5e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7391571998596191, + "num_tokens": 641829099.0, + "step": 24804 + }, + { + "epoch": 2.724028113331869, + "grad_norm": 2.2460708618164062, + "learning_rate": 5e-06, + "loss": 0.6658, + "mean_token_accuracy": 0.7819219827651978, + "num_tokens": 641851205.0, + "step": 24805 + }, + { + "epoch": 2.7241379310344827, + "grad_norm": 1.9846943616867065, + "learning_rate": 5e-06, + "loss": 0.599, + "mean_token_accuracy": 0.7984461784362793, + "num_tokens": 641876088.0, + "step": 24806 + }, + { + "epoch": 2.7242477487370964, + "grad_norm": 2.0797643661499023, + "learning_rate": 5e-06, + "loss": 0.6354, + "mean_token_accuracy": 0.783836841583252, + "num_tokens": 641900391.0, + "step": 24807 + }, + { + "epoch": 2.72435756643971, + "grad_norm": 2.109311580657959, + "learning_rate": 5e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7489482760429382, + "num_tokens": 641926301.0, + "step": 24808 + }, + { + "epoch": 2.724467384142324, + "grad_norm": 2.2441651821136475, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7580698728561401, + "num_tokens": 641955843.0, + "step": 24809 + }, + { + "epoch": 2.7245772018449372, + "grad_norm": 2.1692419052124023, + "learning_rate": 5e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.7704217433929443, + "num_tokens": 641978160.0, + "step": 24810 + }, + { + "epoch": 2.724687019547551, + "grad_norm": 1.924647569656372, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7579946517944336, + "num_tokens": 642007824.0, + "step": 24811 + }, + { + "epoch": 2.7247968372501647, + "grad_norm": 2.0947160720825195, + "learning_rate": 5e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7554301619529724, + "num_tokens": 642034571.0, + "step": 24812 + }, + { + "epoch": 2.7249066549527785, + "grad_norm": 2.141183614730835, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7448952198028564, + "num_tokens": 642062192.0, + "step": 24813 + }, + { + "epoch": 2.7250164726553923, + "grad_norm": 1.9365360736846924, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7567216157913208, + "num_tokens": 642093630.0, + "step": 24814 + }, + { + "epoch": 2.7251262903580056, + "grad_norm": 1.9761481285095215, + "learning_rate": 5e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.7695009112358093, + "num_tokens": 642120571.0, + "step": 24815 + }, + { + "epoch": 2.7252361080606193, + "grad_norm": 1.9145342111587524, + "learning_rate": 5e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7626439929008484, + "num_tokens": 642150761.0, + "step": 24816 + }, + { + "epoch": 2.725345925763233, + "grad_norm": 2.2011754512786865, + "learning_rate": 5e-06, + "loss": 0.64, + "mean_token_accuracy": 0.7820965051651001, + "num_tokens": 642173555.0, + "step": 24817 + }, + { + "epoch": 2.725455743465847, + "grad_norm": 1.9618310928344727, + "learning_rate": 5e-06, + "loss": 0.7105, + "mean_token_accuracy": 0.7695476412773132, + "num_tokens": 642200077.0, + "step": 24818 + }, + { + "epoch": 2.7255655611684606, + "grad_norm": 2.229823589324951, + "learning_rate": 5e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.7755874991416931, + "num_tokens": 642222911.0, + "step": 24819 + }, + { + "epoch": 2.725675378871074, + "grad_norm": 1.9427356719970703, + "learning_rate": 5e-06, + "loss": 0.6422, + "mean_token_accuracy": 0.7868654727935791, + "num_tokens": 642249030.0, + "step": 24820 + }, + { + "epoch": 2.7257851965736877, + "grad_norm": 2.0079450607299805, + "learning_rate": 5e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7624438405036926, + "num_tokens": 642275322.0, + "step": 24821 + }, + { + "epoch": 2.7258950142763014, + "grad_norm": 1.9707914590835571, + "learning_rate": 5e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.7439062595367432, + "num_tokens": 642305568.0, + "step": 24822 + }, + { + "epoch": 2.7260048319789147, + "grad_norm": 2.2222626209259033, + "learning_rate": 5e-06, + "loss": 0.6903, + "mean_token_accuracy": 0.7700141668319702, + "num_tokens": 642331572.0, + "step": 24823 + }, + { + "epoch": 2.7261146496815285, + "grad_norm": 1.994923710823059, + "learning_rate": 5e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7374638319015503, + "num_tokens": 642360713.0, + "step": 24824 + }, + { + "epoch": 2.7262244673841423, + "grad_norm": 2.185695171356201, + "learning_rate": 5e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.7748842239379883, + "num_tokens": 642384679.0, + "step": 24825 + }, + { + "epoch": 2.726334285086756, + "grad_norm": 2.2194764614105225, + "learning_rate": 5e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7747828960418701, + "num_tokens": 642405411.0, + "step": 24826 + }, + { + "epoch": 2.7264441027893698, + "grad_norm": 2.2559235095977783, + "learning_rate": 5e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.7813950777053833, + "num_tokens": 642428000.0, + "step": 24827 + }, + { + "epoch": 2.726553920491983, + "grad_norm": 2.2399117946624756, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.765568733215332, + "num_tokens": 642450916.0, + "step": 24828 + }, + { + "epoch": 2.726663738194597, + "grad_norm": 2.0409677028656006, + "learning_rate": 5e-06, + "loss": 0.7359, + "mean_token_accuracy": 0.7596232891082764, + "num_tokens": 642479871.0, + "step": 24829 + }, + { + "epoch": 2.7267735558972106, + "grad_norm": 2.0178442001342773, + "learning_rate": 5e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.777332603931427, + "num_tokens": 642505188.0, + "step": 24830 + }, + { + "epoch": 2.7268833735998244, + "grad_norm": 2.132448673248291, + "learning_rate": 5e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7705576419830322, + "num_tokens": 642528576.0, + "step": 24831 + }, + { + "epoch": 2.726993191302438, + "grad_norm": 1.9616920948028564, + "learning_rate": 5e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7526830434799194, + "num_tokens": 642556319.0, + "step": 24832 + }, + { + "epoch": 2.7271030090050514, + "grad_norm": 2.1343958377838135, + "learning_rate": 5e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7409624457359314, + "num_tokens": 642584986.0, + "step": 24833 + }, + { + "epoch": 2.727212826707665, + "grad_norm": 1.8855360746383667, + "learning_rate": 5e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7518119812011719, + "num_tokens": 642615336.0, + "step": 24834 + }, + { + "epoch": 2.727322644410279, + "grad_norm": 2.075296640396118, + "learning_rate": 5e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7689735889434814, + "num_tokens": 642640275.0, + "step": 24835 + }, + { + "epoch": 2.7274324621128927, + "grad_norm": 2.0578367710113525, + "learning_rate": 5e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7443728446960449, + "num_tokens": 642666940.0, + "step": 24836 + }, + { + "epoch": 2.7275422798155065, + "grad_norm": 2.1863276958465576, + "learning_rate": 5e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7613173723220825, + "num_tokens": 642692919.0, + "step": 24837 + }, + { + "epoch": 2.7276520975181198, + "grad_norm": 1.863038420677185, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7471559643745422, + "num_tokens": 642722984.0, + "step": 24838 + }, + { + "epoch": 2.7277619152207335, + "grad_norm": 2.0568013191223145, + "learning_rate": 5e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7670393586158752, + "num_tokens": 642750167.0, + "step": 24839 + }, + { + "epoch": 2.7278717329233473, + "grad_norm": 2.0452823638916016, + "learning_rate": 5e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.764010488986969, + "num_tokens": 642775369.0, + "step": 24840 + }, + { + "epoch": 2.727981550625961, + "grad_norm": 2.1878936290740967, + "learning_rate": 5e-06, + "loss": 0.6688, + "mean_token_accuracy": 0.7753630876541138, + "num_tokens": 642798579.0, + "step": 24841 + }, + { + "epoch": 2.728091368328575, + "grad_norm": 1.9847112894058228, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7515673637390137, + "num_tokens": 642829756.0, + "step": 24842 + }, + { + "epoch": 2.728201186031188, + "grad_norm": 1.9838978052139282, + "learning_rate": 5e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7478500604629517, + "num_tokens": 642860059.0, + "step": 24843 + }, + { + "epoch": 2.728311003733802, + "grad_norm": 1.9121456146240234, + "learning_rate": 5e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7436631917953491, + "num_tokens": 642894389.0, + "step": 24844 + }, + { + "epoch": 2.7284208214364156, + "grad_norm": 1.9855364561080933, + "learning_rate": 5e-06, + "loss": 0.7293, + "mean_token_accuracy": 0.7599719166755676, + "num_tokens": 642924168.0, + "step": 24845 + }, + { + "epoch": 2.728530639139029, + "grad_norm": 2.1592395305633545, + "learning_rate": 5e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.7767786979675293, + "num_tokens": 642948948.0, + "step": 24846 + }, + { + "epoch": 2.728640456841643, + "grad_norm": 2.184415102005005, + "learning_rate": 5e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.776549220085144, + "num_tokens": 642971072.0, + "step": 24847 + }, + { + "epoch": 2.7287502745442564, + "grad_norm": 1.915872573852539, + "learning_rate": 5e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7488534450531006, + "num_tokens": 643000409.0, + "step": 24848 + }, + { + "epoch": 2.72886009224687, + "grad_norm": 2.03631591796875, + "learning_rate": 5e-06, + "loss": 0.7542, + "mean_token_accuracy": 0.7612448334693909, + "num_tokens": 643027527.0, + "step": 24849 + }, + { + "epoch": 2.728969909949484, + "grad_norm": 2.1138224601745605, + "learning_rate": 5e-06, + "loss": 0.5763, + "mean_token_accuracy": 0.8070007562637329, + "num_tokens": 643048023.0, + "step": 24850 + }, + { + "epoch": 2.7290797276520973, + "grad_norm": 2.1005640029907227, + "learning_rate": 5e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.7450841665267944, + "num_tokens": 643074941.0, + "step": 24851 + }, + { + "epoch": 2.729189545354711, + "grad_norm": 1.9603102207183838, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7641626000404358, + "num_tokens": 643104671.0, + "step": 24852 + }, + { + "epoch": 2.729299363057325, + "grad_norm": 2.0714104175567627, + "learning_rate": 5e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.7827647924423218, + "num_tokens": 643129333.0, + "step": 24853 + }, + { + "epoch": 2.7294091807599385, + "grad_norm": 2.0499818325042725, + "learning_rate": 5e-06, + "loss": 0.608, + "mean_token_accuracy": 0.7889727354049683, + "num_tokens": 643154233.0, + "step": 24854 + }, + { + "epoch": 2.7295189984625523, + "grad_norm": 2.0413389205932617, + "learning_rate": 5e-06, + "loss": 0.6716, + "mean_token_accuracy": 0.7725939750671387, + "num_tokens": 643178672.0, + "step": 24855 + }, + { + "epoch": 2.7296288161651656, + "grad_norm": 1.9966089725494385, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7490123510360718, + "num_tokens": 643204729.0, + "step": 24856 + }, + { + "epoch": 2.7297386338677794, + "grad_norm": 1.8273249864578247, + "learning_rate": 5e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.749026358127594, + "num_tokens": 643237882.0, + "step": 24857 + }, + { + "epoch": 2.729848451570393, + "grad_norm": 1.920366883277893, + "learning_rate": 5e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.7477749586105347, + "num_tokens": 643267318.0, + "step": 24858 + }, + { + "epoch": 2.729958269273007, + "grad_norm": 2.2791435718536377, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.760798454284668, + "num_tokens": 643290013.0, + "step": 24859 + }, + { + "epoch": 2.7300680869756206, + "grad_norm": 2.0269253253936768, + "learning_rate": 5e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.746753454208374, + "num_tokens": 643315496.0, + "step": 24860 + }, + { + "epoch": 2.730177904678234, + "grad_norm": 2.0577986240386963, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.7686648368835449, + "num_tokens": 643339178.0, + "step": 24861 + }, + { + "epoch": 2.7302877223808477, + "grad_norm": 1.986984372138977, + "learning_rate": 5e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7619007229804993, + "num_tokens": 643365969.0, + "step": 24862 + }, + { + "epoch": 2.7303975400834615, + "grad_norm": 2.1500351428985596, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7741869688034058, + "num_tokens": 643387071.0, + "step": 24863 + }, + { + "epoch": 2.7305073577860752, + "grad_norm": 2.2052407264709473, + "learning_rate": 5e-06, + "loss": 0.6767, + "mean_token_accuracy": 0.7762548327445984, + "num_tokens": 643409348.0, + "step": 24864 + }, + { + "epoch": 2.730617175488689, + "grad_norm": 2.3300211429595947, + "learning_rate": 5e-06, + "loss": 0.6482, + "mean_token_accuracy": 0.7849709987640381, + "num_tokens": 643428228.0, + "step": 24865 + }, + { + "epoch": 2.7307269931913023, + "grad_norm": 2.0966477394104004, + "learning_rate": 5e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7535550594329834, + "num_tokens": 643454583.0, + "step": 24866 + }, + { + "epoch": 2.730836810893916, + "grad_norm": 1.9846711158752441, + "learning_rate": 5e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7595182657241821, + "num_tokens": 643482087.0, + "step": 24867 + }, + { + "epoch": 2.73094662859653, + "grad_norm": 2.0165743827819824, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7658922672271729, + "num_tokens": 643509985.0, + "step": 24868 + }, + { + "epoch": 2.7310564462991436, + "grad_norm": 2.138036012649536, + "learning_rate": 5e-06, + "loss": 0.6866, + "mean_token_accuracy": 0.7705912590026855, + "num_tokens": 643534023.0, + "step": 24869 + }, + { + "epoch": 2.7311662640017573, + "grad_norm": 2.3341224193573, + "learning_rate": 5e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7663654088973999, + "num_tokens": 643556119.0, + "step": 24870 + }, + { + "epoch": 2.7312760817043706, + "grad_norm": 2.1102616786956787, + "learning_rate": 5e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7752530574798584, + "num_tokens": 643581912.0, + "step": 24871 + }, + { + "epoch": 2.7313858994069844, + "grad_norm": 2.150984525680542, + "learning_rate": 5e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7628977298736572, + "num_tokens": 643604803.0, + "step": 24872 + }, + { + "epoch": 2.731495717109598, + "grad_norm": 2.081779718399048, + "learning_rate": 5e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7518810629844666, + "num_tokens": 643630928.0, + "step": 24873 + }, + { + "epoch": 2.7316055348122115, + "grad_norm": 1.9366816282272339, + "learning_rate": 5e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7485884428024292, + "num_tokens": 643663308.0, + "step": 24874 + }, + { + "epoch": 2.7317153525148252, + "grad_norm": 1.9709110260009766, + "learning_rate": 5e-06, + "loss": 0.767, + "mean_token_accuracy": 0.7516282796859741, + "num_tokens": 643692240.0, + "step": 24875 + }, + { + "epoch": 2.731825170217439, + "grad_norm": 2.040628433227539, + "learning_rate": 5e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7698708176612854, + "num_tokens": 643717640.0, + "step": 24876 + }, + { + "epoch": 2.7319349879200527, + "grad_norm": 2.1407485008239746, + "learning_rate": 5e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7575680017471313, + "num_tokens": 643742338.0, + "step": 24877 + }, + { + "epoch": 2.7320448056226665, + "grad_norm": 2.067715883255005, + "learning_rate": 5e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7521153688430786, + "num_tokens": 643768931.0, + "step": 24878 + }, + { + "epoch": 2.73215462332528, + "grad_norm": 2.012357711791992, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7507414817810059, + "num_tokens": 643796868.0, + "step": 24879 + }, + { + "epoch": 2.7322644410278936, + "grad_norm": 2.070387840270996, + "learning_rate": 5e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7363545298576355, + "num_tokens": 643826953.0, + "step": 24880 + }, + { + "epoch": 2.7323742587305073, + "grad_norm": 1.9499647617340088, + "learning_rate": 5e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7684106826782227, + "num_tokens": 643857936.0, + "step": 24881 + }, + { + "epoch": 2.732484076433121, + "grad_norm": 2.114323854446411, + "learning_rate": 5e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.7770078182220459, + "num_tokens": 643881993.0, + "step": 24882 + }, + { + "epoch": 2.732593894135735, + "grad_norm": 2.08111572265625, + "learning_rate": 5e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7737911343574524, + "num_tokens": 643906856.0, + "step": 24883 + }, + { + "epoch": 2.732703711838348, + "grad_norm": 2.216707468032837, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7472863793373108, + "num_tokens": 643931302.0, + "step": 24884 + }, + { + "epoch": 2.732813529540962, + "grad_norm": 2.4092636108398438, + "learning_rate": 5e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7667131423950195, + "num_tokens": 643951279.0, + "step": 24885 + }, + { + "epoch": 2.7329233472435757, + "grad_norm": 2.4338440895080566, + "learning_rate": 5e-06, + "loss": 0.6964, + "mean_token_accuracy": 0.7788865566253662, + "num_tokens": 643972401.0, + "step": 24886 + }, + { + "epoch": 2.7330331649461894, + "grad_norm": 2.363497018814087, + "learning_rate": 5e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7647863626480103, + "num_tokens": 643995293.0, + "step": 24887 + }, + { + "epoch": 2.733142982648803, + "grad_norm": 2.1113386154174805, + "learning_rate": 5e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7340115904808044, + "num_tokens": 644023206.0, + "step": 24888 + }, + { + "epoch": 2.7332528003514165, + "grad_norm": 2.1882917881011963, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7727070450782776, + "num_tokens": 644046261.0, + "step": 24889 + }, + { + "epoch": 2.7333626180540302, + "grad_norm": 2.232116222381592, + "learning_rate": 5e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.7573908567428589, + "num_tokens": 644070088.0, + "step": 24890 + }, + { + "epoch": 2.733472435756644, + "grad_norm": 1.9550714492797852, + "learning_rate": 5e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7530666589736938, + "num_tokens": 644096164.0, + "step": 24891 + }, + { + "epoch": 2.7335822534592578, + "grad_norm": 1.997342824935913, + "learning_rate": 5e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7814639210700989, + "num_tokens": 644124330.0, + "step": 24892 + }, + { + "epoch": 2.7336920711618715, + "grad_norm": 1.9018019437789917, + "learning_rate": 5e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7506323456764221, + "num_tokens": 644152199.0, + "step": 24893 + }, + { + "epoch": 2.733801888864485, + "grad_norm": 1.999934196472168, + "learning_rate": 5e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7439895868301392, + "num_tokens": 644178543.0, + "step": 24894 + }, + { + "epoch": 2.7339117065670986, + "grad_norm": 1.9653346538543701, + "learning_rate": 5e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7485281229019165, + "num_tokens": 644206372.0, + "step": 24895 + }, + { + "epoch": 2.7340215242697123, + "grad_norm": 2.264909267425537, + "learning_rate": 5e-06, + "loss": 0.6392, + "mean_token_accuracy": 0.7917343378067017, + "num_tokens": 644225548.0, + "step": 24896 + }, + { + "epoch": 2.734131341972326, + "grad_norm": 2.225954055786133, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7729241251945496, + "num_tokens": 644247204.0, + "step": 24897 + }, + { + "epoch": 2.73424115967494, + "grad_norm": 1.977260947227478, + "learning_rate": 5e-06, + "loss": 0.785, + "mean_token_accuracy": 0.755746603012085, + "num_tokens": 644275340.0, + "step": 24898 + }, + { + "epoch": 2.734350977377553, + "grad_norm": 2.0717227458953857, + "learning_rate": 5e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7460607290267944, + "num_tokens": 644298570.0, + "step": 24899 + }, + { + "epoch": 2.734460795080167, + "grad_norm": 2.165416955947876, + "learning_rate": 5e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.764988124370575, + "num_tokens": 644323048.0, + "step": 24900 + }, + { + "epoch": 2.7345706127827807, + "grad_norm": 2.1418919563293457, + "learning_rate": 5e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.7779874205589294, + "num_tokens": 644346214.0, + "step": 24901 + }, + { + "epoch": 2.734680430485394, + "grad_norm": 2.088881015777588, + "learning_rate": 5e-06, + "loss": 0.6879, + "mean_token_accuracy": 0.7793007493019104, + "num_tokens": 644369902.0, + "step": 24902 + }, + { + "epoch": 2.7347902481880078, + "grad_norm": 1.713675856590271, + "learning_rate": 5e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7334272265434265, + "num_tokens": 644407412.0, + "step": 24903 + }, + { + "epoch": 2.7349000658906215, + "grad_norm": 2.178205728530884, + "learning_rate": 5e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7665771245956421, + "num_tokens": 644432643.0, + "step": 24904 + }, + { + "epoch": 2.7350098835932353, + "grad_norm": 2.1095900535583496, + "learning_rate": 5e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.7570146322250366, + "num_tokens": 644459084.0, + "step": 24905 + }, + { + "epoch": 2.735119701295849, + "grad_norm": 1.8892608880996704, + "learning_rate": 5e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7341946959495544, + "num_tokens": 644490196.0, + "step": 24906 + }, + { + "epoch": 2.7352295189984623, + "grad_norm": 2.029527187347412, + "learning_rate": 5e-06, + "loss": 0.725, + "mean_token_accuracy": 0.758002519607544, + "num_tokens": 644515677.0, + "step": 24907 + }, + { + "epoch": 2.735339336701076, + "grad_norm": 2.0036497116088867, + "learning_rate": 5e-06, + "loss": 0.742, + "mean_token_accuracy": 0.7564622163772583, + "num_tokens": 644543454.0, + "step": 24908 + }, + { + "epoch": 2.73544915440369, + "grad_norm": 2.0787808895111084, + "learning_rate": 5e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7753555178642273, + "num_tokens": 644566643.0, + "step": 24909 + }, + { + "epoch": 2.7355589721063036, + "grad_norm": 2.0908234119415283, + "learning_rate": 5e-06, + "loss": 0.6823, + "mean_token_accuracy": 0.7724581956863403, + "num_tokens": 644591212.0, + "step": 24910 + }, + { + "epoch": 2.7356687898089174, + "grad_norm": 1.970679521560669, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.756171703338623, + "num_tokens": 644621923.0, + "step": 24911 + }, + { + "epoch": 2.7357786075115307, + "grad_norm": 1.958554983139038, + "learning_rate": 5e-06, + "loss": 0.6709, + "mean_token_accuracy": 0.7804301977157593, + "num_tokens": 644647186.0, + "step": 24912 + }, + { + "epoch": 2.7358884252141444, + "grad_norm": 2.0793299674987793, + "learning_rate": 5e-06, + "loss": 0.6771, + "mean_token_accuracy": 0.7754305601119995, + "num_tokens": 644670942.0, + "step": 24913 + }, + { + "epoch": 2.735998242916758, + "grad_norm": 2.0457327365875244, + "learning_rate": 5e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7685114145278931, + "num_tokens": 644696263.0, + "step": 24914 + }, + { + "epoch": 2.736108060619372, + "grad_norm": 2.1610946655273438, + "learning_rate": 5e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7571659684181213, + "num_tokens": 644721452.0, + "step": 24915 + }, + { + "epoch": 2.7362178783219857, + "grad_norm": 1.9538975954055786, + "learning_rate": 5e-06, + "loss": 0.6823, + "mean_token_accuracy": 0.7761862277984619, + "num_tokens": 644748354.0, + "step": 24916 + }, + { + "epoch": 2.736327696024599, + "grad_norm": 1.9189097881317139, + "learning_rate": 5e-06, + "loss": 0.6846, + "mean_token_accuracy": 0.7711399793624878, + "num_tokens": 644776215.0, + "step": 24917 + }, + { + "epoch": 2.736437513727213, + "grad_norm": 2.073188304901123, + "learning_rate": 5e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7590852379798889, + "num_tokens": 644801673.0, + "step": 24918 + }, + { + "epoch": 2.7365473314298265, + "grad_norm": 2.342904806137085, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7648752927780151, + "num_tokens": 644821901.0, + "step": 24919 + }, + { + "epoch": 2.7366571491324403, + "grad_norm": 1.778259038925171, + "learning_rate": 5e-06, + "loss": 0.7719, + "mean_token_accuracy": 0.7450056076049805, + "num_tokens": 644857396.0, + "step": 24920 + }, + { + "epoch": 2.736766966835054, + "grad_norm": 1.956421136856079, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7602546215057373, + "num_tokens": 644886417.0, + "step": 24921 + }, + { + "epoch": 2.7368767845376674, + "grad_norm": 2.158216714859009, + "learning_rate": 5e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7560059428215027, + "num_tokens": 644911121.0, + "step": 24922 + }, + { + "epoch": 2.736986602240281, + "grad_norm": 1.9302738904953003, + "learning_rate": 5e-06, + "loss": 0.7898, + "mean_token_accuracy": 0.7495191097259521, + "num_tokens": 644938797.0, + "step": 24923 + }, + { + "epoch": 2.737096419942895, + "grad_norm": 2.0465734004974365, + "learning_rate": 5e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.7815443873405457, + "num_tokens": 644964792.0, + "step": 24924 + }, + { + "epoch": 2.737206237645508, + "grad_norm": 1.961493730545044, + "learning_rate": 5e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7543737888336182, + "num_tokens": 644991427.0, + "step": 24925 + }, + { + "epoch": 2.7373160553481224, + "grad_norm": 2.164445161819458, + "learning_rate": 5e-06, + "loss": 0.7273, + "mean_token_accuracy": 0.7647824883460999, + "num_tokens": 645014054.0, + "step": 24926 + }, + { + "epoch": 2.7374258730507357, + "grad_norm": 2.2339820861816406, + "learning_rate": 5e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7740670442581177, + "num_tokens": 645036630.0, + "step": 24927 + }, + { + "epoch": 2.7375356907533495, + "grad_norm": 2.2885921001434326, + "learning_rate": 5e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.753096878528595, + "num_tokens": 645059554.0, + "step": 24928 + }, + { + "epoch": 2.737645508455963, + "grad_norm": 2.1574196815490723, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7625036239624023, + "num_tokens": 645084512.0, + "step": 24929 + }, + { + "epoch": 2.7377553261585765, + "grad_norm": 2.116422176361084, + "learning_rate": 5e-06, + "loss": 0.7778, + "mean_token_accuracy": 0.744801938533783, + "num_tokens": 645113144.0, + "step": 24930 + }, + { + "epoch": 2.7378651438611903, + "grad_norm": 2.267672538757324, + "learning_rate": 5e-06, + "loss": 0.6761, + "mean_token_accuracy": 0.7772359848022461, + "num_tokens": 645134587.0, + "step": 24931 + }, + { + "epoch": 2.737974961563804, + "grad_norm": 1.9746485948562622, + "learning_rate": 5e-06, + "loss": 0.6718, + "mean_token_accuracy": 0.7771650552749634, + "num_tokens": 645161335.0, + "step": 24932 + }, + { + "epoch": 2.738084779266418, + "grad_norm": 1.8603956699371338, + "learning_rate": 5e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7494536638259888, + "num_tokens": 645196093.0, + "step": 24933 + }, + { + "epoch": 2.7381945969690316, + "grad_norm": 2.1268041133880615, + "learning_rate": 5e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7601063847541809, + "num_tokens": 645220170.0, + "step": 24934 + }, + { + "epoch": 2.738304414671645, + "grad_norm": 1.9454126358032227, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7544504404067993, + "num_tokens": 645250107.0, + "step": 24935 + }, + { + "epoch": 2.7384142323742586, + "grad_norm": 2.003696918487549, + "learning_rate": 5e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7437617778778076, + "num_tokens": 645277947.0, + "step": 24936 + }, + { + "epoch": 2.7385240500768724, + "grad_norm": 2.0019969940185547, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7641513347625732, + "num_tokens": 645305390.0, + "step": 24937 + }, + { + "epoch": 2.738633867779486, + "grad_norm": 1.791539192199707, + "learning_rate": 5e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7329105138778687, + "num_tokens": 645337634.0, + "step": 24938 + }, + { + "epoch": 2.7387436854821, + "grad_norm": 1.9579493999481201, + "learning_rate": 5e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7553057670593262, + "num_tokens": 645367962.0, + "step": 24939 + }, + { + "epoch": 2.738853503184713, + "grad_norm": 2.2062442302703857, + "learning_rate": 5e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7496607899665833, + "num_tokens": 645391871.0, + "step": 24940 + }, + { + "epoch": 2.738963320887327, + "grad_norm": 2.0394978523254395, + "learning_rate": 5e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7602527141571045, + "num_tokens": 645417916.0, + "step": 24941 + }, + { + "epoch": 2.7390731385899407, + "grad_norm": 1.940586805343628, + "learning_rate": 5e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7668969035148621, + "num_tokens": 645445663.0, + "step": 24942 + }, + { + "epoch": 2.7391829562925545, + "grad_norm": 2.119121789932251, + "learning_rate": 5e-06, + "loss": 0.6849, + "mean_token_accuracy": 0.7725105285644531, + "num_tokens": 645470980.0, + "step": 24943 + }, + { + "epoch": 2.7392927739951682, + "grad_norm": 2.0380783081054688, + "learning_rate": 5e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7721513509750366, + "num_tokens": 645497939.0, + "step": 24944 + }, + { + "epoch": 2.7394025916977816, + "grad_norm": 1.9572484493255615, + "learning_rate": 5e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7410900592803955, + "num_tokens": 645531023.0, + "step": 24945 + }, + { + "epoch": 2.7395124094003953, + "grad_norm": 2.273197650909424, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7667596340179443, + "num_tokens": 645556799.0, + "step": 24946 + }, + { + "epoch": 2.739622227103009, + "grad_norm": 2.5578396320343018, + "learning_rate": 5e-06, + "loss": 0.6493, + "mean_token_accuracy": 0.7831698656082153, + "num_tokens": 645574284.0, + "step": 24947 + }, + { + "epoch": 2.739732044805623, + "grad_norm": 2.1806747913360596, + "learning_rate": 5e-06, + "loss": 0.6641, + "mean_token_accuracy": 0.7845234870910645, + "num_tokens": 645601363.0, + "step": 24948 + }, + { + "epoch": 2.7398418625082366, + "grad_norm": 1.808447241783142, + "learning_rate": 5e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7344957590103149, + "num_tokens": 645639334.0, + "step": 24949 + }, + { + "epoch": 2.73995168021085, + "grad_norm": 2.360591411590576, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.761493444442749, + "num_tokens": 645663096.0, + "step": 24950 + }, + { + "epoch": 2.7400614979134637, + "grad_norm": 1.9633979797363281, + "learning_rate": 5e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.7673274278640747, + "num_tokens": 645691170.0, + "step": 24951 + }, + { + "epoch": 2.7401713156160774, + "grad_norm": 1.9246405363082886, + "learning_rate": 5e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7680925130844116, + "num_tokens": 645718975.0, + "step": 24952 + }, + { + "epoch": 2.7402811333186907, + "grad_norm": 1.970886468887329, + "learning_rate": 5e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.7490581274032593, + "num_tokens": 645748035.0, + "step": 24953 + }, + { + "epoch": 2.7403909510213045, + "grad_norm": 1.9562551975250244, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7663424015045166, + "num_tokens": 645774910.0, + "step": 24954 + }, + { + "epoch": 2.7405007687239182, + "grad_norm": 1.93759286403656, + "learning_rate": 5e-06, + "loss": 0.6131, + "mean_token_accuracy": 0.7899165153503418, + "num_tokens": 645800433.0, + "step": 24955 + }, + { + "epoch": 2.740610586426532, + "grad_norm": 2.393932819366455, + "learning_rate": 5e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7552696466445923, + "num_tokens": 645822325.0, + "step": 24956 + }, + { + "epoch": 2.7407204041291457, + "grad_norm": 2.1629624366760254, + "learning_rate": 5e-06, + "loss": 0.6632, + "mean_token_accuracy": 0.7795248031616211, + "num_tokens": 645843032.0, + "step": 24957 + }, + { + "epoch": 2.740830221831759, + "grad_norm": 2.107327938079834, + "learning_rate": 5e-06, + "loss": 0.6448, + "mean_token_accuracy": 0.7869058847427368, + "num_tokens": 645864823.0, + "step": 24958 + }, + { + "epoch": 2.740940039534373, + "grad_norm": 1.9747697114944458, + "learning_rate": 5e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.7626227140426636, + "num_tokens": 645890275.0, + "step": 24959 + }, + { + "epoch": 2.7410498572369866, + "grad_norm": 2.176405668258667, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7677440643310547, + "num_tokens": 645913053.0, + "step": 24960 + }, + { + "epoch": 2.7411596749396003, + "grad_norm": 1.8769606351852417, + "learning_rate": 5e-06, + "loss": 0.6566, + "mean_token_accuracy": 0.7804822325706482, + "num_tokens": 645942147.0, + "step": 24961 + }, + { + "epoch": 2.741269492642214, + "grad_norm": 2.2480759620666504, + "learning_rate": 5e-06, + "loss": 0.6704, + "mean_token_accuracy": 0.7788517475128174, + "num_tokens": 645965139.0, + "step": 24962 + }, + { + "epoch": 2.7413793103448274, + "grad_norm": 2.075624465942383, + "learning_rate": 5e-06, + "loss": 0.6469, + "mean_token_accuracy": 0.7872070670127869, + "num_tokens": 645989166.0, + "step": 24963 + }, + { + "epoch": 2.741489128047441, + "grad_norm": 1.8937548398971558, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7681329250335693, + "num_tokens": 646017579.0, + "step": 24964 + }, + { + "epoch": 2.741598945750055, + "grad_norm": 1.9880284070968628, + "learning_rate": 5e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7599755525588989, + "num_tokens": 646044288.0, + "step": 24965 + }, + { + "epoch": 2.7417087634526687, + "grad_norm": 2.454958200454712, + "learning_rate": 5e-06, + "loss": 0.7241, + "mean_token_accuracy": 0.7619232535362244, + "num_tokens": 646065910.0, + "step": 24966 + }, + { + "epoch": 2.7418185811552824, + "grad_norm": 2.0642268657684326, + "learning_rate": 5e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7606235146522522, + "num_tokens": 646093567.0, + "step": 24967 + }, + { + "epoch": 2.7419283988578957, + "grad_norm": 2.503787040710449, + "learning_rate": 5e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7560433149337769, + "num_tokens": 646114314.0, + "step": 24968 + }, + { + "epoch": 2.7420382165605095, + "grad_norm": 2.1263644695281982, + "learning_rate": 5e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.7581924796104431, + "num_tokens": 646140906.0, + "step": 24969 + }, + { + "epoch": 2.7421480342631233, + "grad_norm": 1.859273910522461, + "learning_rate": 5e-06, + "loss": 0.7339, + "mean_token_accuracy": 0.7620142698287964, + "num_tokens": 646172092.0, + "step": 24970 + }, + { + "epoch": 2.742257851965737, + "grad_norm": 1.753179907798767, + "learning_rate": 5e-06, + "loss": 0.6829, + "mean_token_accuracy": 0.7749345302581787, + "num_tokens": 646203772.0, + "step": 24971 + }, + { + "epoch": 2.7423676696683508, + "grad_norm": 2.123068332672119, + "learning_rate": 5e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7612178921699524, + "num_tokens": 646228992.0, + "step": 24972 + }, + { + "epoch": 2.742477487370964, + "grad_norm": 2.131664752960205, + "learning_rate": 5e-06, + "loss": 0.673, + "mean_token_accuracy": 0.7715445160865784, + "num_tokens": 646254481.0, + "step": 24973 + }, + { + "epoch": 2.742587305073578, + "grad_norm": 2.1487672328948975, + "learning_rate": 5e-06, + "loss": 0.7619, + "mean_token_accuracy": 0.7666525840759277, + "num_tokens": 646279197.0, + "step": 24974 + }, + { + "epoch": 2.7426971227761916, + "grad_norm": 1.9188774824142456, + "learning_rate": 5e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.7788131237030029, + "num_tokens": 646305099.0, + "step": 24975 + }, + { + "epoch": 2.742806940478805, + "grad_norm": 2.172621011734009, + "learning_rate": 5e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7570369243621826, + "num_tokens": 646330808.0, + "step": 24976 + }, + { + "epoch": 2.742916758181419, + "grad_norm": 2.112565279006958, + "learning_rate": 5e-06, + "loss": 0.6754, + "mean_token_accuracy": 0.776869535446167, + "num_tokens": 646355554.0, + "step": 24977 + }, + { + "epoch": 2.7430265758840324, + "grad_norm": 1.9529861211776733, + "learning_rate": 5e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7713969945907593, + "num_tokens": 646382482.0, + "step": 24978 + }, + { + "epoch": 2.743136393586646, + "grad_norm": 2.2495431900024414, + "learning_rate": 5e-06, + "loss": 0.6689, + "mean_token_accuracy": 0.7807340025901794, + "num_tokens": 646404527.0, + "step": 24979 + }, + { + "epoch": 2.74324621128926, + "grad_norm": 1.9581589698791504, + "learning_rate": 5e-06, + "loss": 0.6973, + "mean_token_accuracy": 0.7696142792701721, + "num_tokens": 646431227.0, + "step": 24980 + }, + { + "epoch": 2.7433560289918733, + "grad_norm": 2.0908148288726807, + "learning_rate": 5e-06, + "loss": 0.7836, + "mean_token_accuracy": 0.7461316585540771, + "num_tokens": 646458765.0, + "step": 24981 + }, + { + "epoch": 2.743465846694487, + "grad_norm": 2.1279685497283936, + "learning_rate": 5e-06, + "loss": 0.7047, + "mean_token_accuracy": 0.7645349502563477, + "num_tokens": 646484141.0, + "step": 24982 + }, + { + "epoch": 2.7435756643971008, + "grad_norm": 2.3237931728363037, + "learning_rate": 5e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7469578981399536, + "num_tokens": 646506971.0, + "step": 24983 + }, + { + "epoch": 2.7436854820997145, + "grad_norm": 1.9840877056121826, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.761898398399353, + "num_tokens": 646535347.0, + "step": 24984 + }, + { + "epoch": 2.7437952998023283, + "grad_norm": 2.2123310565948486, + "learning_rate": 5e-06, + "loss": 0.7645, + "mean_token_accuracy": 0.7458509206771851, + "num_tokens": 646560823.0, + "step": 24985 + }, + { + "epoch": 2.7439051175049416, + "grad_norm": 1.9488954544067383, + "learning_rate": 5e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7464962601661682, + "num_tokens": 646591651.0, + "step": 24986 + }, + { + "epoch": 2.7440149352075554, + "grad_norm": 2.0784945487976074, + "learning_rate": 5e-06, + "loss": 0.7066, + "mean_token_accuracy": 0.7672569155693054, + "num_tokens": 646616448.0, + "step": 24987 + }, + { + "epoch": 2.744124752910169, + "grad_norm": 2.1034531593322754, + "learning_rate": 5e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.7724016904830933, + "num_tokens": 646640860.0, + "step": 24988 + }, + { + "epoch": 2.744234570612783, + "grad_norm": 2.1900224685668945, + "learning_rate": 5e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7694505453109741, + "num_tokens": 646665423.0, + "step": 24989 + }, + { + "epoch": 2.7443443883153966, + "grad_norm": 2.1289939880371094, + "learning_rate": 5e-06, + "loss": 0.707, + "mean_token_accuracy": 0.7627378702163696, + "num_tokens": 646689906.0, + "step": 24990 + }, + { + "epoch": 2.74445420601801, + "grad_norm": 2.082895040512085, + "learning_rate": 5e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7628207206726074, + "num_tokens": 646714720.0, + "step": 24991 + }, + { + "epoch": 2.7445640237206237, + "grad_norm": 1.914957880973816, + "learning_rate": 5e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7611258029937744, + "num_tokens": 646746120.0, + "step": 24992 + }, + { + "epoch": 2.7446738414232374, + "grad_norm": 2.034898042678833, + "learning_rate": 5e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7300968766212463, + "num_tokens": 646775635.0, + "step": 24993 + }, + { + "epoch": 2.744783659125851, + "grad_norm": 2.289900064468384, + "learning_rate": 5e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7638605833053589, + "num_tokens": 646796239.0, + "step": 24994 + }, + { + "epoch": 2.744893476828465, + "grad_norm": 2.2091634273529053, + "learning_rate": 5e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7475826740264893, + "num_tokens": 646818115.0, + "step": 24995 + }, + { + "epoch": 2.7450032945310783, + "grad_norm": 1.958549976348877, + "learning_rate": 5e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7448782920837402, + "num_tokens": 646848676.0, + "step": 24996 + }, + { + "epoch": 2.745113112233692, + "grad_norm": 2.0431067943573, + "learning_rate": 5e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7848736643791199, + "num_tokens": 646874782.0, + "step": 24997 + }, + { + "epoch": 2.745222929936306, + "grad_norm": 2.267171621322632, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7494786381721497, + "num_tokens": 646900405.0, + "step": 24998 + }, + { + "epoch": 2.7453327476389195, + "grad_norm": 2.1858160495758057, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7678865194320679, + "num_tokens": 646922872.0, + "step": 24999 + }, + { + "epoch": 2.7454425653415333, + "grad_norm": 2.048323154449463, + "learning_rate": 5e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7756778597831726, + "num_tokens": 646947156.0, + "step": 25000 + }, + { + "epoch": 2.7455523830441466, + "grad_norm": 2.5403573513031006, + "learning_rate": 5e-06, + "loss": 0.5994, + "mean_token_accuracy": 0.7965449094772339, + "num_tokens": 646964840.0, + "step": 25001 + }, + { + "epoch": 2.7456622007467604, + "grad_norm": 2.1391074657440186, + "learning_rate": 5e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.748670220375061, + "num_tokens": 646989967.0, + "step": 25002 + }, + { + "epoch": 2.745772018449374, + "grad_norm": 2.1719746589660645, + "learning_rate": 5e-06, + "loss": 0.7751, + "mean_token_accuracy": 0.7523882985115051, + "num_tokens": 647013412.0, + "step": 25003 + }, + { + "epoch": 2.7458818361519874, + "grad_norm": 2.058239459991455, + "learning_rate": 5e-06, + "loss": 0.6728, + "mean_token_accuracy": 0.7745954990386963, + "num_tokens": 647038950.0, + "step": 25004 + }, + { + "epoch": 2.745991653854601, + "grad_norm": 2.036811113357544, + "learning_rate": 5e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7596210837364197, + "num_tokens": 647067509.0, + "step": 25005 + }, + { + "epoch": 2.746101471557215, + "grad_norm": 1.9934983253479004, + "learning_rate": 5e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7532565593719482, + "num_tokens": 647095401.0, + "step": 25006 + }, + { + "epoch": 2.7462112892598287, + "grad_norm": 2.2713167667388916, + "learning_rate": 5e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7762503623962402, + "num_tokens": 647118116.0, + "step": 25007 + }, + { + "epoch": 2.7463211069624425, + "grad_norm": 2.165179967880249, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7597004175186157, + "num_tokens": 647141902.0, + "step": 25008 + }, + { + "epoch": 2.746430924665056, + "grad_norm": 1.7526637315750122, + "learning_rate": 5e-06, + "loss": 0.7376, + "mean_token_accuracy": 0.7557539343833923, + "num_tokens": 647174528.0, + "step": 25009 + }, + { + "epoch": 2.7465407423676695, + "grad_norm": 2.115711212158203, + "learning_rate": 5e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.7813451290130615, + "num_tokens": 647198049.0, + "step": 25010 + }, + { + "epoch": 2.7466505600702833, + "grad_norm": 1.9933216571807861, + "learning_rate": 5e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7742979526519775, + "num_tokens": 647224048.0, + "step": 25011 + }, + { + "epoch": 2.746760377772897, + "grad_norm": 2.082070827484131, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7598187923431396, + "num_tokens": 647250764.0, + "step": 25012 + }, + { + "epoch": 2.746870195475511, + "grad_norm": 1.979812502861023, + "learning_rate": 5e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.7731302976608276, + "num_tokens": 647275709.0, + "step": 25013 + }, + { + "epoch": 2.746980013178124, + "grad_norm": 2.200058698654175, + "learning_rate": 5e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.7733292579650879, + "num_tokens": 647298151.0, + "step": 25014 + }, + { + "epoch": 2.747089830880738, + "grad_norm": 2.2788939476013184, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7552577257156372, + "num_tokens": 647321045.0, + "step": 25015 + }, + { + "epoch": 2.7471996485833516, + "grad_norm": 2.0326485633850098, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7661668062210083, + "num_tokens": 647346533.0, + "step": 25016 + }, + { + "epoch": 2.7473094662859654, + "grad_norm": 2.1494064331054688, + "learning_rate": 5e-06, + "loss": 0.7712, + "mean_token_accuracy": 0.7479555606842041, + "num_tokens": 647371497.0, + "step": 25017 + }, + { + "epoch": 2.747419283988579, + "grad_norm": 2.1179943084716797, + "learning_rate": 5e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7413196563720703, + "num_tokens": 647400315.0, + "step": 25018 + }, + { + "epoch": 2.7475291016911925, + "grad_norm": 2.0893173217773438, + "learning_rate": 5e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7554436922073364, + "num_tokens": 647427370.0, + "step": 25019 + }, + { + "epoch": 2.7476389193938062, + "grad_norm": 2.2631473541259766, + "learning_rate": 5e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7634820938110352, + "num_tokens": 647448846.0, + "step": 25020 + }, + { + "epoch": 2.74774873709642, + "grad_norm": 1.9473910331726074, + "learning_rate": 5e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7612665891647339, + "num_tokens": 647477820.0, + "step": 25021 + }, + { + "epoch": 2.7478585547990337, + "grad_norm": 2.2020606994628906, + "learning_rate": 5e-06, + "loss": 0.777, + "mean_token_accuracy": 0.7562674880027771, + "num_tokens": 647503687.0, + "step": 25022 + }, + { + "epoch": 2.7479683725016475, + "grad_norm": 2.1095728874206543, + "learning_rate": 5e-06, + "loss": 0.6878, + "mean_token_accuracy": 0.775591254234314, + "num_tokens": 647528883.0, + "step": 25023 + }, + { + "epoch": 2.748078190204261, + "grad_norm": 2.176697254180908, + "learning_rate": 5e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7637220621109009, + "num_tokens": 647554369.0, + "step": 25024 + }, + { + "epoch": 2.7481880079068746, + "grad_norm": 2.022200107574463, + "learning_rate": 5e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7576311230659485, + "num_tokens": 647581862.0, + "step": 25025 + }, + { + "epoch": 2.7482978256094883, + "grad_norm": 2.029724359512329, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7724551558494568, + "num_tokens": 647605841.0, + "step": 25026 + }, + { + "epoch": 2.7484076433121016, + "grad_norm": 2.1253345012664795, + "learning_rate": 5e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7642298936843872, + "num_tokens": 647630490.0, + "step": 25027 + }, + { + "epoch": 2.748517461014716, + "grad_norm": 1.9411331415176392, + "learning_rate": 5e-06, + "loss": 0.571, + "mean_token_accuracy": 0.8041951060295105, + "num_tokens": 647658204.0, + "step": 25028 + }, + { + "epoch": 2.748627278717329, + "grad_norm": 1.953230857849121, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7638059854507446, + "num_tokens": 647683609.0, + "step": 25029 + }, + { + "epoch": 2.748737096419943, + "grad_norm": 2.138265609741211, + "learning_rate": 5e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.7487460374832153, + "num_tokens": 647710257.0, + "step": 25030 + }, + { + "epoch": 2.7488469141225567, + "grad_norm": 1.8777538537979126, + "learning_rate": 5e-06, + "loss": 0.7922, + "mean_token_accuracy": 0.7537516355514526, + "num_tokens": 647742484.0, + "step": 25031 + }, + { + "epoch": 2.74895673182517, + "grad_norm": 2.5802104473114014, + "learning_rate": 5e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.764176607131958, + "num_tokens": 647761423.0, + "step": 25032 + }, + { + "epoch": 2.7490665495277837, + "grad_norm": 1.930201768875122, + "learning_rate": 5e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.7650339007377625, + "num_tokens": 647791108.0, + "step": 25033 + }, + { + "epoch": 2.7491763672303975, + "grad_norm": 1.9823182821273804, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7567175626754761, + "num_tokens": 647820148.0, + "step": 25034 + }, + { + "epoch": 2.7492861849330112, + "grad_norm": 1.93562912940979, + "learning_rate": 5e-06, + "loss": 0.7738, + "mean_token_accuracy": 0.7483962774276733, + "num_tokens": 647850968.0, + "step": 25035 + }, + { + "epoch": 2.749396002635625, + "grad_norm": 2.4803028106689453, + "learning_rate": 5e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7564697861671448, + "num_tokens": 647872126.0, + "step": 25036 + }, + { + "epoch": 2.7495058203382383, + "grad_norm": 2.105631113052368, + "learning_rate": 5e-06, + "loss": 0.6854, + "mean_token_accuracy": 0.7732337713241577, + "num_tokens": 647898545.0, + "step": 25037 + }, + { + "epoch": 2.749615638040852, + "grad_norm": 1.974807620048523, + "learning_rate": 5e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7602870464324951, + "num_tokens": 647927974.0, + "step": 25038 + }, + { + "epoch": 2.749725455743466, + "grad_norm": 2.1862289905548096, + "learning_rate": 5e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7438747882843018, + "num_tokens": 647955027.0, + "step": 25039 + }, + { + "epoch": 2.7498352734460796, + "grad_norm": 2.3431406021118164, + "learning_rate": 5e-06, + "loss": 0.6754, + "mean_token_accuracy": 0.7756147980690002, + "num_tokens": 647975223.0, + "step": 25040 + }, + { + "epoch": 2.7499450911486933, + "grad_norm": 2.714874744415283, + "learning_rate": 5e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.7688518166542053, + "num_tokens": 647991329.0, + "step": 25041 + }, + { + "epoch": 2.7500549088513067, + "grad_norm": 1.9995743036270142, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7586033940315247, + "num_tokens": 648017189.0, + "step": 25042 + }, + { + "epoch": 2.7501647265539204, + "grad_norm": 2.151951313018799, + "learning_rate": 5e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7626650929450989, + "num_tokens": 648043202.0, + "step": 25043 + }, + { + "epoch": 2.750274544256534, + "grad_norm": 2.282443046569824, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.7704827189445496, + "num_tokens": 648065633.0, + "step": 25044 + }, + { + "epoch": 2.750384361959148, + "grad_norm": 2.0045230388641357, + "learning_rate": 5e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7631657123565674, + "num_tokens": 648093531.0, + "step": 25045 + }, + { + "epoch": 2.7504941796617617, + "grad_norm": 2.1736817359924316, + "learning_rate": 5e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7548996210098267, + "num_tokens": 648120674.0, + "step": 25046 + }, + { + "epoch": 2.750603997364375, + "grad_norm": 2.2734367847442627, + "learning_rate": 5e-06, + "loss": 0.6311, + "mean_token_accuracy": 0.7829467058181763, + "num_tokens": 648140908.0, + "step": 25047 + }, + { + "epoch": 2.7507138150669888, + "grad_norm": 2.0064470767974854, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7750270366668701, + "num_tokens": 648165890.0, + "step": 25048 + }, + { + "epoch": 2.7508236327696025, + "grad_norm": 2.043896436691284, + "learning_rate": 5e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7469847202301025, + "num_tokens": 648193389.0, + "step": 25049 + }, + { + "epoch": 2.7509334504722163, + "grad_norm": 1.82498300075531, + "learning_rate": 5e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7461984157562256, + "num_tokens": 648226184.0, + "step": 25050 + }, + { + "epoch": 2.75104326817483, + "grad_norm": 1.82999849319458, + "learning_rate": 5e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7408807277679443, + "num_tokens": 648260110.0, + "step": 25051 + }, + { + "epoch": 2.7511530858774433, + "grad_norm": 2.4994759559631348, + "learning_rate": 5e-06, + "loss": 0.6431, + "mean_token_accuracy": 0.7837467193603516, + "num_tokens": 648278051.0, + "step": 25052 + }, + { + "epoch": 2.751262903580057, + "grad_norm": 2.298304557800293, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7594796419143677, + "num_tokens": 648300945.0, + "step": 25053 + }, + { + "epoch": 2.751372721282671, + "grad_norm": 1.8154487609863281, + "learning_rate": 5e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7573050260543823, + "num_tokens": 648330353.0, + "step": 25054 + }, + { + "epoch": 2.751482538985284, + "grad_norm": 1.9819587469100952, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.761751115322113, + "num_tokens": 648356131.0, + "step": 25055 + }, + { + "epoch": 2.7515923566878984, + "grad_norm": 1.9680439233779907, + "learning_rate": 5e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7454972267150879, + "num_tokens": 648386262.0, + "step": 25056 + }, + { + "epoch": 2.7517021743905117, + "grad_norm": 2.179917812347412, + "learning_rate": 5e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7705163955688477, + "num_tokens": 648407691.0, + "step": 25057 + }, + { + "epoch": 2.7518119920931254, + "grad_norm": 1.84148108959198, + "learning_rate": 5e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7566317319869995, + "num_tokens": 648436812.0, + "step": 25058 + }, + { + "epoch": 2.751921809795739, + "grad_norm": 2.1370737552642822, + "learning_rate": 5e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.7730140089988708, + "num_tokens": 648461393.0, + "step": 25059 + }, + { + "epoch": 2.7520316274983525, + "grad_norm": 1.99283766746521, + "learning_rate": 5e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.7793492078781128, + "num_tokens": 648488003.0, + "step": 25060 + }, + { + "epoch": 2.7521414452009663, + "grad_norm": 2.6130971908569336, + "learning_rate": 5e-06, + "loss": 0.5919, + "mean_token_accuracy": 0.8013864755630493, + "num_tokens": 648505089.0, + "step": 25061 + }, + { + "epoch": 2.75225126290358, + "grad_norm": 2.155494451522827, + "learning_rate": 5e-06, + "loss": 0.692, + "mean_token_accuracy": 0.7668983936309814, + "num_tokens": 648527723.0, + "step": 25062 + }, + { + "epoch": 2.7523610806061938, + "grad_norm": 2.0166616439819336, + "learning_rate": 5e-06, + "loss": 0.6589, + "mean_token_accuracy": 0.7846497893333435, + "num_tokens": 648552783.0, + "step": 25063 + }, + { + "epoch": 2.7524708983088075, + "grad_norm": 2.2026307582855225, + "learning_rate": 5e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7731444835662842, + "num_tokens": 648577089.0, + "step": 25064 + }, + { + "epoch": 2.752580716011421, + "grad_norm": 2.0850162506103516, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7575908899307251, + "num_tokens": 648602915.0, + "step": 25065 + }, + { + "epoch": 2.7526905337140346, + "grad_norm": 2.2336838245391846, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7709749937057495, + "num_tokens": 648626593.0, + "step": 25066 + }, + { + "epoch": 2.7528003514166484, + "grad_norm": 2.1368398666381836, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.745110273361206, + "num_tokens": 648652813.0, + "step": 25067 + }, + { + "epoch": 2.752910169119262, + "grad_norm": 2.2115392684936523, + "learning_rate": 5e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7522262930870056, + "num_tokens": 648676884.0, + "step": 25068 + }, + { + "epoch": 2.753019986821876, + "grad_norm": 2.06704044342041, + "learning_rate": 5e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.7744827270507812, + "num_tokens": 648700830.0, + "step": 25069 + }, + { + "epoch": 2.753129804524489, + "grad_norm": 2.193713665008545, + "learning_rate": 5e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7379796504974365, + "num_tokens": 648725867.0, + "step": 25070 + }, + { + "epoch": 2.753239622227103, + "grad_norm": 1.9572359323501587, + "learning_rate": 5e-06, + "loss": 0.7732, + "mean_token_accuracy": 0.7563651204109192, + "num_tokens": 648754948.0, + "step": 25071 + }, + { + "epoch": 2.7533494399297167, + "grad_norm": 2.079378128051758, + "learning_rate": 5e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7611505389213562, + "num_tokens": 648780012.0, + "step": 25072 + }, + { + "epoch": 2.7534592576323305, + "grad_norm": 2.1467626094818115, + "learning_rate": 5e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7668260335922241, + "num_tokens": 648804879.0, + "step": 25073 + }, + { + "epoch": 2.753569075334944, + "grad_norm": 1.9486894607543945, + "learning_rate": 5e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7538021802902222, + "num_tokens": 648834626.0, + "step": 25074 + }, + { + "epoch": 2.7536788930375575, + "grad_norm": 1.840316653251648, + "learning_rate": 5e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7681264877319336, + "num_tokens": 648865704.0, + "step": 25075 + }, + { + "epoch": 2.7537887107401713, + "grad_norm": 1.7796334028244019, + "learning_rate": 5e-06, + "loss": 0.759, + "mean_token_accuracy": 0.753757119178772, + "num_tokens": 648900547.0, + "step": 25076 + }, + { + "epoch": 2.753898528442785, + "grad_norm": 2.0248730182647705, + "learning_rate": 5e-06, + "loss": 0.6244, + "mean_token_accuracy": 0.7954816818237305, + "num_tokens": 648926197.0, + "step": 25077 + }, + { + "epoch": 2.754008346145399, + "grad_norm": 1.9921244382858276, + "learning_rate": 5e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7683906555175781, + "num_tokens": 648955252.0, + "step": 25078 + }, + { + "epoch": 2.7541181638480126, + "grad_norm": 2.152787446975708, + "learning_rate": 5e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7551530599594116, + "num_tokens": 648981873.0, + "step": 25079 + }, + { + "epoch": 2.754227981550626, + "grad_norm": 1.9139667749404907, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7604579925537109, + "num_tokens": 649012465.0, + "step": 25080 + }, + { + "epoch": 2.7543377992532396, + "grad_norm": 2.362586259841919, + "learning_rate": 5e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.7842345237731934, + "num_tokens": 649035489.0, + "step": 25081 + }, + { + "epoch": 2.7544476169558534, + "grad_norm": 2.047985792160034, + "learning_rate": 5e-06, + "loss": 0.6612, + "mean_token_accuracy": 0.7795658707618713, + "num_tokens": 649058906.0, + "step": 25082 + }, + { + "epoch": 2.7545574346584667, + "grad_norm": 2.1711487770080566, + "learning_rate": 5e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7458574175834656, + "num_tokens": 649084346.0, + "step": 25083 + }, + { + "epoch": 2.7546672523610805, + "grad_norm": 1.8784205913543701, + "learning_rate": 5e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7505815029144287, + "num_tokens": 649116737.0, + "step": 25084 + }, + { + "epoch": 2.754777070063694, + "grad_norm": 1.9226224422454834, + "learning_rate": 5e-06, + "loss": 0.7658, + "mean_token_accuracy": 0.7474490404129028, + "num_tokens": 649147269.0, + "step": 25085 + }, + { + "epoch": 2.754886887766308, + "grad_norm": 2.042156219482422, + "learning_rate": 5e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.739189863204956, + "num_tokens": 649175559.0, + "step": 25086 + }, + { + "epoch": 2.7549967054689217, + "grad_norm": 1.7661248445510864, + "learning_rate": 5e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7503697872161865, + "num_tokens": 649206930.0, + "step": 25087 + }, + { + "epoch": 2.755106523171535, + "grad_norm": 2.0327506065368652, + "learning_rate": 5e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.7843528985977173, + "num_tokens": 649231676.0, + "step": 25088 + }, + { + "epoch": 2.755216340874149, + "grad_norm": 2.431145429611206, + "learning_rate": 5e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.7823328971862793, + "num_tokens": 649251697.0, + "step": 25089 + }, + { + "epoch": 2.7553261585767626, + "grad_norm": 1.8635401725769043, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.763067901134491, + "num_tokens": 649282537.0, + "step": 25090 + }, + { + "epoch": 2.7554359762793763, + "grad_norm": 2.0850601196289062, + "learning_rate": 5e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.7749311923980713, + "num_tokens": 649307177.0, + "step": 25091 + }, + { + "epoch": 2.75554579398199, + "grad_norm": 2.288210153579712, + "learning_rate": 5e-06, + "loss": 0.6486, + "mean_token_accuracy": 0.7830672264099121, + "num_tokens": 649327368.0, + "step": 25092 + }, + { + "epoch": 2.7556556116846034, + "grad_norm": 2.1079514026641846, + "learning_rate": 5e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7375277876853943, + "num_tokens": 649353976.0, + "step": 25093 + }, + { + "epoch": 2.755765429387217, + "grad_norm": 2.0568721294403076, + "learning_rate": 5e-06, + "loss": 0.6359, + "mean_token_accuracy": 0.7811871767044067, + "num_tokens": 649379331.0, + "step": 25094 + }, + { + "epoch": 2.755875247089831, + "grad_norm": 2.252678155899048, + "learning_rate": 5e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7737147212028503, + "num_tokens": 649406169.0, + "step": 25095 + }, + { + "epoch": 2.7559850647924446, + "grad_norm": 2.1046817302703857, + "learning_rate": 5e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.748092770576477, + "num_tokens": 649432025.0, + "step": 25096 + }, + { + "epoch": 2.7560948824950584, + "grad_norm": 2.186475992202759, + "learning_rate": 5e-06, + "loss": 0.6567, + "mean_token_accuracy": 0.7803283929824829, + "num_tokens": 649455556.0, + "step": 25097 + }, + { + "epoch": 2.7562047001976717, + "grad_norm": 1.9364680051803589, + "learning_rate": 5e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.761053204536438, + "num_tokens": 649484426.0, + "step": 25098 + }, + { + "epoch": 2.7563145179002855, + "grad_norm": 1.8575997352600098, + "learning_rate": 5e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7496342658996582, + "num_tokens": 649517586.0, + "step": 25099 + }, + { + "epoch": 2.7564243356028992, + "grad_norm": 2.0412018299102783, + "learning_rate": 5e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7517074346542358, + "num_tokens": 649543148.0, + "step": 25100 + }, + { + "epoch": 2.756534153305513, + "grad_norm": 2.0776779651641846, + "learning_rate": 5e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7529665231704712, + "num_tokens": 649569399.0, + "step": 25101 + }, + { + "epoch": 2.7566439710081267, + "grad_norm": 2.106323719024658, + "learning_rate": 5e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7556029558181763, + "num_tokens": 649595971.0, + "step": 25102 + }, + { + "epoch": 2.75675378871074, + "grad_norm": 1.893081545829773, + "learning_rate": 5e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7586541771888733, + "num_tokens": 649627258.0, + "step": 25103 + }, + { + "epoch": 2.756863606413354, + "grad_norm": 1.9572570323944092, + "learning_rate": 5e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.7661411762237549, + "num_tokens": 649653918.0, + "step": 25104 + }, + { + "epoch": 2.7569734241159676, + "grad_norm": 2.122253179550171, + "learning_rate": 5e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7624672651290894, + "num_tokens": 649678093.0, + "step": 25105 + }, + { + "epoch": 2.757083241818581, + "grad_norm": 2.0932469367980957, + "learning_rate": 5e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7490798234939575, + "num_tokens": 649705792.0, + "step": 25106 + }, + { + "epoch": 2.757193059521195, + "grad_norm": 2.030339241027832, + "learning_rate": 5e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7482149004936218, + "num_tokens": 649737655.0, + "step": 25107 + }, + { + "epoch": 2.7573028772238084, + "grad_norm": 1.9069833755493164, + "learning_rate": 5e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7528489828109741, + "num_tokens": 649767222.0, + "step": 25108 + }, + { + "epoch": 2.757412694926422, + "grad_norm": 1.9147430658340454, + "learning_rate": 5e-06, + "loss": 0.6925, + "mean_token_accuracy": 0.7679806351661682, + "num_tokens": 649795841.0, + "step": 25109 + }, + { + "epoch": 2.757522512629036, + "grad_norm": 2.0113413333892822, + "learning_rate": 5e-06, + "loss": 0.7335, + "mean_token_accuracy": 0.7576447129249573, + "num_tokens": 649824585.0, + "step": 25110 + }, + { + "epoch": 2.7576323303316492, + "grad_norm": 2.225170373916626, + "learning_rate": 5e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.769680380821228, + "num_tokens": 649847361.0, + "step": 25111 + }, + { + "epoch": 2.757742148034263, + "grad_norm": 2.157487630844116, + "learning_rate": 5e-06, + "loss": 0.6798, + "mean_token_accuracy": 0.7865262031555176, + "num_tokens": 649870507.0, + "step": 25112 + }, + { + "epoch": 2.7578519657368767, + "grad_norm": 2.241764783859253, + "learning_rate": 5e-06, + "loss": 0.6871, + "mean_token_accuracy": 0.778801679611206, + "num_tokens": 649894116.0, + "step": 25113 + }, + { + "epoch": 2.7579617834394905, + "grad_norm": 2.4917891025543213, + "learning_rate": 5e-06, + "loss": 0.6475, + "mean_token_accuracy": 0.7847549915313721, + "num_tokens": 649912784.0, + "step": 25114 + }, + { + "epoch": 2.7580716011421043, + "grad_norm": 2.1222448348999023, + "learning_rate": 5e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.776429295539856, + "num_tokens": 649934971.0, + "step": 25115 + }, + { + "epoch": 2.7581814188447176, + "grad_norm": 2.2209558486938477, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7750444412231445, + "num_tokens": 649957315.0, + "step": 25116 + }, + { + "epoch": 2.7582912365473313, + "grad_norm": 1.9931070804595947, + "learning_rate": 5e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7469474077224731, + "num_tokens": 649983584.0, + "step": 25117 + }, + { + "epoch": 2.758401054249945, + "grad_norm": 1.946775197982788, + "learning_rate": 5e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.742504894733429, + "num_tokens": 650011565.0, + "step": 25118 + }, + { + "epoch": 2.758510871952559, + "grad_norm": 1.9072717428207397, + "learning_rate": 5e-06, + "loss": 0.6823, + "mean_token_accuracy": 0.766279399394989, + "num_tokens": 650041395.0, + "step": 25119 + }, + { + "epoch": 2.7586206896551726, + "grad_norm": 2.01027774810791, + "learning_rate": 5e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7569939494132996, + "num_tokens": 650067797.0, + "step": 25120 + }, + { + "epoch": 2.758730507357786, + "grad_norm": 1.9111019372940063, + "learning_rate": 5e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7627198696136475, + "num_tokens": 650096872.0, + "step": 25121 + }, + { + "epoch": 2.7588403250603997, + "grad_norm": 2.393704652786255, + "learning_rate": 5e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7486288547515869, + "num_tokens": 650120011.0, + "step": 25122 + }, + { + "epoch": 2.7589501427630134, + "grad_norm": 2.150029420852661, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7593465447425842, + "num_tokens": 650143657.0, + "step": 25123 + }, + { + "epoch": 2.759059960465627, + "grad_norm": 1.8002761602401733, + "learning_rate": 5e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7728135585784912, + "num_tokens": 650176585.0, + "step": 25124 + }, + { + "epoch": 2.759169778168241, + "grad_norm": 2.1183435916900635, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7528759241104126, + "num_tokens": 650206238.0, + "step": 25125 + }, + { + "epoch": 2.7592795958708543, + "grad_norm": 2.0194554328918457, + "learning_rate": 5e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7584333419799805, + "num_tokens": 650232958.0, + "step": 25126 + }, + { + "epoch": 2.759389413573468, + "grad_norm": 2.128141403198242, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7660776376724243, + "num_tokens": 650255802.0, + "step": 25127 + }, + { + "epoch": 2.7594992312760818, + "grad_norm": 1.8094263076782227, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7750089764595032, + "num_tokens": 650288008.0, + "step": 25128 + }, + { + "epoch": 2.7596090489786955, + "grad_norm": 1.950507640838623, + "learning_rate": 5e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7706637382507324, + "num_tokens": 650316503.0, + "step": 25129 + }, + { + "epoch": 2.7597188666813093, + "grad_norm": 2.0649101734161377, + "learning_rate": 5e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.752239465713501, + "num_tokens": 650342585.0, + "step": 25130 + }, + { + "epoch": 2.7598286843839226, + "grad_norm": 2.1809186935424805, + "learning_rate": 5e-06, + "loss": 0.6327, + "mean_token_accuracy": 0.7876275777816772, + "num_tokens": 650366000.0, + "step": 25131 + }, + { + "epoch": 2.7599385020865363, + "grad_norm": 2.242175340652466, + "learning_rate": 5e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7705051898956299, + "num_tokens": 650389394.0, + "step": 25132 + }, + { + "epoch": 2.76004831978915, + "grad_norm": 2.1538164615631104, + "learning_rate": 5e-06, + "loss": 0.6308, + "mean_token_accuracy": 0.7849316000938416, + "num_tokens": 650413853.0, + "step": 25133 + }, + { + "epoch": 2.7601581374917634, + "grad_norm": 2.201725721359253, + "learning_rate": 5e-06, + "loss": 0.6603, + "mean_token_accuracy": 0.7832737565040588, + "num_tokens": 650436887.0, + "step": 25134 + }, + { + "epoch": 2.760267955194377, + "grad_norm": 2.196664571762085, + "learning_rate": 5e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7508074045181274, + "num_tokens": 650464741.0, + "step": 25135 + }, + { + "epoch": 2.760377772896991, + "grad_norm": 2.1788218021392822, + "learning_rate": 5e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7668671607971191, + "num_tokens": 650487974.0, + "step": 25136 + }, + { + "epoch": 2.7604875905996047, + "grad_norm": 2.028595447540283, + "learning_rate": 5e-06, + "loss": 0.6273, + "mean_token_accuracy": 0.7866324186325073, + "num_tokens": 650513735.0, + "step": 25137 + }, + { + "epoch": 2.7605974083022184, + "grad_norm": 2.0286498069763184, + "learning_rate": 5e-06, + "loss": 0.6461, + "mean_token_accuracy": 0.7861154079437256, + "num_tokens": 650538702.0, + "step": 25138 + }, + { + "epoch": 2.7607072260048318, + "grad_norm": 2.1101255416870117, + "learning_rate": 5e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7454124689102173, + "num_tokens": 650565799.0, + "step": 25139 + }, + { + "epoch": 2.7608170437074455, + "grad_norm": 2.153754949569702, + "learning_rate": 5e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.749646008014679, + "num_tokens": 650592048.0, + "step": 25140 + }, + { + "epoch": 2.7609268614100593, + "grad_norm": 2.0601818561553955, + "learning_rate": 5e-06, + "loss": 0.7347, + "mean_token_accuracy": 0.752788782119751, + "num_tokens": 650618822.0, + "step": 25141 + }, + { + "epoch": 2.761036679112673, + "grad_norm": 2.3446576595306396, + "learning_rate": 5e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7596434950828552, + "num_tokens": 650640432.0, + "step": 25142 + }, + { + "epoch": 2.761146496815287, + "grad_norm": 2.301968574523926, + "learning_rate": 5e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7655273675918579, + "num_tokens": 650661295.0, + "step": 25143 + }, + { + "epoch": 2.7612563145179, + "grad_norm": 2.106457471847534, + "learning_rate": 5e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7592002153396606, + "num_tokens": 650687730.0, + "step": 25144 + }, + { + "epoch": 2.761366132220514, + "grad_norm": 1.8361616134643555, + "learning_rate": 5e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.767635703086853, + "num_tokens": 650717252.0, + "step": 25145 + }, + { + "epoch": 2.7614759499231276, + "grad_norm": 2.0691795349121094, + "learning_rate": 5e-06, + "loss": 0.6842, + "mean_token_accuracy": 0.7742434740066528, + "num_tokens": 650743698.0, + "step": 25146 + }, + { + "epoch": 2.7615857676257414, + "grad_norm": 2.4172682762145996, + "learning_rate": 5e-06, + "loss": 0.6372, + "mean_token_accuracy": 0.7870585918426514, + "num_tokens": 650762796.0, + "step": 25147 + }, + { + "epoch": 2.761695585328355, + "grad_norm": 2.2886955738067627, + "learning_rate": 5e-06, + "loss": 0.6707, + "mean_token_accuracy": 0.7788193821907043, + "num_tokens": 650783452.0, + "step": 25148 + }, + { + "epoch": 2.7618054030309684, + "grad_norm": 2.2474164962768555, + "learning_rate": 5e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7637267112731934, + "num_tokens": 650805527.0, + "step": 25149 + }, + { + "epoch": 2.761915220733582, + "grad_norm": 2.1916375160217285, + "learning_rate": 5e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7664210200309753, + "num_tokens": 650828281.0, + "step": 25150 + }, + { + "epoch": 2.762025038436196, + "grad_norm": 2.1116631031036377, + "learning_rate": 5e-06, + "loss": 0.6329, + "mean_token_accuracy": 0.7916187644004822, + "num_tokens": 650850796.0, + "step": 25151 + }, + { + "epoch": 2.7621348561388097, + "grad_norm": 2.3828036785125732, + "learning_rate": 5e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.7756642699241638, + "num_tokens": 650870843.0, + "step": 25152 + }, + { + "epoch": 2.7622446738414235, + "grad_norm": 1.898686408996582, + "learning_rate": 5e-06, + "loss": 0.7518, + "mean_token_accuracy": 0.7529304027557373, + "num_tokens": 650902203.0, + "step": 25153 + }, + { + "epoch": 2.762354491544037, + "grad_norm": 2.30835223197937, + "learning_rate": 5e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.7944767475128174, + "num_tokens": 650922474.0, + "step": 25154 + }, + { + "epoch": 2.7624643092466505, + "grad_norm": 2.117929697036743, + "learning_rate": 5e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7721472978591919, + "num_tokens": 650950377.0, + "step": 25155 + }, + { + "epoch": 2.7625741269492643, + "grad_norm": 2.0142478942871094, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7361186146736145, + "num_tokens": 650980092.0, + "step": 25156 + }, + { + "epoch": 2.7626839446518776, + "grad_norm": 2.1377127170562744, + "learning_rate": 5e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.778064489364624, + "num_tokens": 651002375.0, + "step": 25157 + }, + { + "epoch": 2.762793762354492, + "grad_norm": 1.9103021621704102, + "learning_rate": 5e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7461925745010376, + "num_tokens": 651031401.0, + "step": 25158 + }, + { + "epoch": 2.762903580057105, + "grad_norm": 2.062319755554199, + "learning_rate": 5e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7561437487602234, + "num_tokens": 651058675.0, + "step": 25159 + }, + { + "epoch": 2.763013397759719, + "grad_norm": 2.2259786128997803, + "learning_rate": 5e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7628312110900879, + "num_tokens": 651082292.0, + "step": 25160 + }, + { + "epoch": 2.7631232154623326, + "grad_norm": 1.9910962581634521, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7436258792877197, + "num_tokens": 651110645.0, + "step": 25161 + }, + { + "epoch": 2.763233033164946, + "grad_norm": 2.167409658432007, + "learning_rate": 5e-06, + "loss": 0.6693, + "mean_token_accuracy": 0.7754172086715698, + "num_tokens": 651134663.0, + "step": 25162 + }, + { + "epoch": 2.7633428508675597, + "grad_norm": 2.1339874267578125, + "learning_rate": 5e-06, + "loss": 0.7621, + "mean_token_accuracy": 0.7514229416847229, + "num_tokens": 651159799.0, + "step": 25163 + }, + { + "epoch": 2.7634526685701735, + "grad_norm": 2.039769411087036, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7664818167686462, + "num_tokens": 651186867.0, + "step": 25164 + }, + { + "epoch": 2.763562486272787, + "grad_norm": 2.216919422149658, + "learning_rate": 5e-06, + "loss": 0.6931, + "mean_token_accuracy": 0.7679184079170227, + "num_tokens": 651208419.0, + "step": 25165 + }, + { + "epoch": 2.763672303975401, + "grad_norm": 1.9413228034973145, + "learning_rate": 5e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7542609572410583, + "num_tokens": 651239426.0, + "step": 25166 + }, + { + "epoch": 2.7637821216780143, + "grad_norm": 1.950007677078247, + "learning_rate": 5e-06, + "loss": 0.6092, + "mean_token_accuracy": 0.7936975359916687, + "num_tokens": 651265743.0, + "step": 25167 + }, + { + "epoch": 2.763891939380628, + "grad_norm": 2.3546857833862305, + "learning_rate": 5e-06, + "loss": 0.5931, + "mean_token_accuracy": 0.7981246113777161, + "num_tokens": 651283969.0, + "step": 25168 + }, + { + "epoch": 2.764001757083242, + "grad_norm": 2.052253007888794, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7518184781074524, + "num_tokens": 651311598.0, + "step": 25169 + }, + { + "epoch": 2.7641115747858556, + "grad_norm": 2.268644094467163, + "learning_rate": 5e-06, + "loss": 0.714, + "mean_token_accuracy": 0.768160343170166, + "num_tokens": 651333063.0, + "step": 25170 + }, + { + "epoch": 2.7642213924884693, + "grad_norm": 2.019116163253784, + "learning_rate": 5e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7760409116744995, + "num_tokens": 651359496.0, + "step": 25171 + }, + { + "epoch": 2.7643312101910826, + "grad_norm": 1.8611924648284912, + "learning_rate": 5e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.7846025824546814, + "num_tokens": 651390618.0, + "step": 25172 + }, + { + "epoch": 2.7644410278936964, + "grad_norm": 2.5193076133728027, + "learning_rate": 5e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.7769114375114441, + "num_tokens": 651409386.0, + "step": 25173 + }, + { + "epoch": 2.76455084559631, + "grad_norm": 2.333085060119629, + "learning_rate": 5e-06, + "loss": 0.637, + "mean_token_accuracy": 0.783324658870697, + "num_tokens": 651430110.0, + "step": 25174 + }, + { + "epoch": 2.764660663298924, + "grad_norm": 2.4402272701263428, + "learning_rate": 5e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7653892040252686, + "num_tokens": 651451562.0, + "step": 25175 + }, + { + "epoch": 2.7647704810015377, + "grad_norm": 2.0920820236206055, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7829892635345459, + "num_tokens": 651477291.0, + "step": 25176 + }, + { + "epoch": 2.764880298704151, + "grad_norm": 2.018458127975464, + "learning_rate": 5e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7657497525215149, + "num_tokens": 651505713.0, + "step": 25177 + }, + { + "epoch": 2.7649901164067647, + "grad_norm": 2.215668201446533, + "learning_rate": 5e-06, + "loss": 0.5745, + "mean_token_accuracy": 0.8046863079071045, + "num_tokens": 651527790.0, + "step": 25178 + }, + { + "epoch": 2.7650999341093785, + "grad_norm": 1.9643961191177368, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.770565927028656, + "num_tokens": 651557542.0, + "step": 25179 + }, + { + "epoch": 2.7652097518119922, + "grad_norm": 2.1592562198638916, + "learning_rate": 5e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7598303556442261, + "num_tokens": 651581100.0, + "step": 25180 + }, + { + "epoch": 2.765319569514606, + "grad_norm": 2.1599535942077637, + "learning_rate": 5e-06, + "loss": 0.715, + "mean_token_accuracy": 0.7688430547714233, + "num_tokens": 651608072.0, + "step": 25181 + }, + { + "epoch": 2.7654293872172193, + "grad_norm": 2.1979963779449463, + "learning_rate": 5e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7535785436630249, + "num_tokens": 651632802.0, + "step": 25182 + }, + { + "epoch": 2.765539204919833, + "grad_norm": 2.1617488861083984, + "learning_rate": 5e-06, + "loss": 0.7198, + "mean_token_accuracy": 0.7669132947921753, + "num_tokens": 651655333.0, + "step": 25183 + }, + { + "epoch": 2.765649022622447, + "grad_norm": 2.022723436355591, + "learning_rate": 5e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7415102124214172, + "num_tokens": 651682168.0, + "step": 25184 + }, + { + "epoch": 2.76575884032506, + "grad_norm": 1.7564330101013184, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7324358224868774, + "num_tokens": 651717447.0, + "step": 25185 + }, + { + "epoch": 2.765868658027674, + "grad_norm": 2.1189396381378174, + "learning_rate": 5e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7614421844482422, + "num_tokens": 651744230.0, + "step": 25186 + }, + { + "epoch": 2.7659784757302877, + "grad_norm": 2.099816083908081, + "learning_rate": 5e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7706092596054077, + "num_tokens": 651768439.0, + "step": 25187 + }, + { + "epoch": 2.7660882934329014, + "grad_norm": 1.9099981784820557, + "learning_rate": 5e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7657703757286072, + "num_tokens": 651798327.0, + "step": 25188 + }, + { + "epoch": 2.766198111135515, + "grad_norm": 1.9524028301239014, + "learning_rate": 5e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7613351345062256, + "num_tokens": 651827749.0, + "step": 25189 + }, + { + "epoch": 2.7663079288381285, + "grad_norm": 2.2622628211975098, + "learning_rate": 5e-06, + "loss": 0.6566, + "mean_token_accuracy": 0.7772325277328491, + "num_tokens": 651848546.0, + "step": 25190 + }, + { + "epoch": 2.7664177465407422, + "grad_norm": 2.012817621231079, + "learning_rate": 5e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7598685622215271, + "num_tokens": 651877918.0, + "step": 25191 + }, + { + "epoch": 2.766527564243356, + "grad_norm": 2.209026336669922, + "learning_rate": 5e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7430778741836548, + "num_tokens": 651905953.0, + "step": 25192 + }, + { + "epoch": 2.7666373819459698, + "grad_norm": 1.964996099472046, + "learning_rate": 5e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7529752850532532, + "num_tokens": 651934202.0, + "step": 25193 + }, + { + "epoch": 2.7667471996485835, + "grad_norm": 2.1039187908172607, + "learning_rate": 5e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.7905370593070984, + "num_tokens": 651957559.0, + "step": 25194 + }, + { + "epoch": 2.766857017351197, + "grad_norm": 2.055375576019287, + "learning_rate": 5e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.7964649200439453, + "num_tokens": 651981881.0, + "step": 25195 + }, + { + "epoch": 2.7669668350538106, + "grad_norm": 1.946670413017273, + "learning_rate": 5e-06, + "loss": 0.6814, + "mean_token_accuracy": 0.7727705836296082, + "num_tokens": 652009695.0, + "step": 25196 + }, + { + "epoch": 2.7670766527564243, + "grad_norm": 1.8739033937454224, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7806833386421204, + "num_tokens": 652037958.0, + "step": 25197 + }, + { + "epoch": 2.767186470459038, + "grad_norm": 1.9724414348602295, + "learning_rate": 5e-06, + "loss": 0.733, + "mean_token_accuracy": 0.7651166915893555, + "num_tokens": 652065162.0, + "step": 25198 + }, + { + "epoch": 2.767296288161652, + "grad_norm": 2.066734790802002, + "learning_rate": 5e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.7733136415481567, + "num_tokens": 652090266.0, + "step": 25199 + }, + { + "epoch": 2.767406105864265, + "grad_norm": 2.0302250385284424, + "learning_rate": 5e-06, + "loss": 0.6572, + "mean_token_accuracy": 0.7783560156822205, + "num_tokens": 652116411.0, + "step": 25200 + }, + { + "epoch": 2.767515923566879, + "grad_norm": 2.127688407897949, + "learning_rate": 5e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.78277587890625, + "num_tokens": 652140770.0, + "step": 25201 + }, + { + "epoch": 2.7676257412694927, + "grad_norm": 2.4631662368774414, + "learning_rate": 5e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7546662092208862, + "num_tokens": 652159918.0, + "step": 25202 + }, + { + "epoch": 2.7677355589721064, + "grad_norm": 2.071674108505249, + "learning_rate": 5e-06, + "loss": 0.7111, + "mean_token_accuracy": 0.7687252163887024, + "num_tokens": 652183828.0, + "step": 25203 + }, + { + "epoch": 2.76784537667472, + "grad_norm": 1.8340425491333008, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.741400420665741, + "num_tokens": 652217632.0, + "step": 25204 + }, + { + "epoch": 2.7679551943773335, + "grad_norm": 1.9210014343261719, + "learning_rate": 5e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7444010972976685, + "num_tokens": 652247427.0, + "step": 25205 + }, + { + "epoch": 2.7680650120799473, + "grad_norm": 2.1249032020568848, + "learning_rate": 5e-06, + "loss": 0.6659, + "mean_token_accuracy": 0.7748651504516602, + "num_tokens": 652273222.0, + "step": 25206 + }, + { + "epoch": 2.768174829782561, + "grad_norm": 2.096431255340576, + "learning_rate": 5e-06, + "loss": 0.6742, + "mean_token_accuracy": 0.7725712656974792, + "num_tokens": 652298082.0, + "step": 25207 + }, + { + "epoch": 2.7682846474851748, + "grad_norm": 1.821136236190796, + "learning_rate": 5e-06, + "loss": 0.6338, + "mean_token_accuracy": 0.7878326773643494, + "num_tokens": 652328158.0, + "step": 25208 + }, + { + "epoch": 2.7683944651877885, + "grad_norm": 2.1571547985076904, + "learning_rate": 5e-06, + "loss": 0.6385, + "mean_token_accuracy": 0.7869430780410767, + "num_tokens": 652349560.0, + "step": 25209 + }, + { + "epoch": 2.768504282890402, + "grad_norm": 2.0056910514831543, + "learning_rate": 5e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7509174346923828, + "num_tokens": 652378784.0, + "step": 25210 + }, + { + "epoch": 2.7686141005930156, + "grad_norm": 2.0211946964263916, + "learning_rate": 5e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7580703496932983, + "num_tokens": 652408016.0, + "step": 25211 + }, + { + "epoch": 2.7687239182956294, + "grad_norm": 2.387950897216797, + "learning_rate": 5e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7546417713165283, + "num_tokens": 652429397.0, + "step": 25212 + }, + { + "epoch": 2.7688337359982427, + "grad_norm": 1.9657398462295532, + "learning_rate": 5e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7534867525100708, + "num_tokens": 652458109.0, + "step": 25213 + }, + { + "epoch": 2.7689435537008564, + "grad_norm": 2.0748438835144043, + "learning_rate": 5e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.7453894019126892, + "num_tokens": 652485314.0, + "step": 25214 + }, + { + "epoch": 2.76905337140347, + "grad_norm": 1.7561005353927612, + "learning_rate": 5e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7259653806686401, + "num_tokens": 652524716.0, + "step": 25215 + }, + { + "epoch": 2.769163189106084, + "grad_norm": 2.184375047683716, + "learning_rate": 5e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.7804067134857178, + "num_tokens": 652551939.0, + "step": 25216 + }, + { + "epoch": 2.7692730068086977, + "grad_norm": 2.0920193195343018, + "learning_rate": 5e-06, + "loss": 0.6621, + "mean_token_accuracy": 0.7827540636062622, + "num_tokens": 652576260.0, + "step": 25217 + }, + { + "epoch": 2.769382824511311, + "grad_norm": 2.208026647567749, + "learning_rate": 5e-06, + "loss": 0.6318, + "mean_token_accuracy": 0.7885687351226807, + "num_tokens": 652599091.0, + "step": 25218 + }, + { + "epoch": 2.7694926422139248, + "grad_norm": 2.144040107727051, + "learning_rate": 5e-06, + "loss": 0.6425, + "mean_token_accuracy": 0.7855788469314575, + "num_tokens": 652622249.0, + "step": 25219 + }, + { + "epoch": 2.7696024599165385, + "grad_norm": 2.0848026275634766, + "learning_rate": 5e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.760103166103363, + "num_tokens": 652650260.0, + "step": 25220 + }, + { + "epoch": 2.7697122776191523, + "grad_norm": 1.8123332262039185, + "learning_rate": 5e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.747531533241272, + "num_tokens": 652682825.0, + "step": 25221 + }, + { + "epoch": 2.769822095321766, + "grad_norm": 2.2507498264312744, + "learning_rate": 5e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7669212818145752, + "num_tokens": 652703953.0, + "step": 25222 + }, + { + "epoch": 2.7699319130243794, + "grad_norm": 2.1604788303375244, + "learning_rate": 5e-06, + "loss": 0.6609, + "mean_token_accuracy": 0.7882723808288574, + "num_tokens": 652725876.0, + "step": 25223 + }, + { + "epoch": 2.770041730726993, + "grad_norm": 2.2175745964050293, + "learning_rate": 5e-06, + "loss": 0.699, + "mean_token_accuracy": 0.7792841792106628, + "num_tokens": 652750225.0, + "step": 25224 + }, + { + "epoch": 2.770151548429607, + "grad_norm": 1.921895980834961, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7603212594985962, + "num_tokens": 652781108.0, + "step": 25225 + }, + { + "epoch": 2.7702613661322206, + "grad_norm": 2.190932273864746, + "learning_rate": 5e-06, + "loss": 0.7645, + "mean_token_accuracy": 0.7529970407485962, + "num_tokens": 652806640.0, + "step": 25226 + }, + { + "epoch": 2.7703711838348344, + "grad_norm": 1.9432806968688965, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7562556266784668, + "num_tokens": 652834088.0, + "step": 25227 + }, + { + "epoch": 2.7704810015374477, + "grad_norm": 2.2581303119659424, + "learning_rate": 5e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7319959402084351, + "num_tokens": 652858253.0, + "step": 25228 + }, + { + "epoch": 2.7705908192400615, + "grad_norm": 2.110682249069214, + "learning_rate": 5e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7509592175483704, + "num_tokens": 652883037.0, + "step": 25229 + }, + { + "epoch": 2.770700636942675, + "grad_norm": 1.8573405742645264, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7620378136634827, + "num_tokens": 652912882.0, + "step": 25230 + }, + { + "epoch": 2.770810454645289, + "grad_norm": 1.8964327573776245, + "learning_rate": 5e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.7734767198562622, + "num_tokens": 652939867.0, + "step": 25231 + }, + { + "epoch": 2.7709202723479027, + "grad_norm": 1.9949204921722412, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7614835500717163, + "num_tokens": 652967592.0, + "step": 25232 + }, + { + "epoch": 2.771030090050516, + "grad_norm": 2.322587490081787, + "learning_rate": 5e-06, + "loss": 0.6281, + "mean_token_accuracy": 0.7880670428276062, + "num_tokens": 652988240.0, + "step": 25233 + }, + { + "epoch": 2.77113990775313, + "grad_norm": 1.8805081844329834, + "learning_rate": 5e-06, + "loss": 0.77, + "mean_token_accuracy": 0.747125506401062, + "num_tokens": 653018846.0, + "step": 25234 + }, + { + "epoch": 2.7712497254557436, + "grad_norm": 2.3666391372680664, + "learning_rate": 5e-06, + "loss": 0.6737, + "mean_token_accuracy": 0.7770570516586304, + "num_tokens": 653039751.0, + "step": 25235 + }, + { + "epoch": 2.771359543158357, + "grad_norm": 2.1022841930389404, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7511147260665894, + "num_tokens": 653064384.0, + "step": 25236 + }, + { + "epoch": 2.771469360860971, + "grad_norm": 1.839148759841919, + "learning_rate": 5e-06, + "loss": 0.6769, + "mean_token_accuracy": 0.785043478012085, + "num_tokens": 653092597.0, + "step": 25237 + }, + { + "epoch": 2.7715791785635844, + "grad_norm": 2.034569263458252, + "learning_rate": 5e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7364429235458374, + "num_tokens": 653120969.0, + "step": 25238 + }, + { + "epoch": 2.771688996266198, + "grad_norm": 2.212510347366333, + "learning_rate": 5e-06, + "loss": 0.7244, + "mean_token_accuracy": 0.7607336044311523, + "num_tokens": 653144110.0, + "step": 25239 + }, + { + "epoch": 2.771798813968812, + "grad_norm": 2.5079843997955322, + "learning_rate": 5e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.7523854970932007, + "num_tokens": 653165503.0, + "step": 25240 + }, + { + "epoch": 2.771908631671425, + "grad_norm": 1.9584572315216064, + "learning_rate": 5e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7428977489471436, + "num_tokens": 653193380.0, + "step": 25241 + }, + { + "epoch": 2.772018449374039, + "grad_norm": 2.0786545276641846, + "learning_rate": 5e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7689729928970337, + "num_tokens": 653219180.0, + "step": 25242 + }, + { + "epoch": 2.7721282670766527, + "grad_norm": 2.2227728366851807, + "learning_rate": 5e-06, + "loss": 0.6265, + "mean_token_accuracy": 0.7907384634017944, + "num_tokens": 653238841.0, + "step": 25243 + }, + { + "epoch": 2.7722380847792665, + "grad_norm": 2.175400972366333, + "learning_rate": 5e-06, + "loss": 0.6181, + "mean_token_accuracy": 0.7907253503799438, + "num_tokens": 653262319.0, + "step": 25244 + }, + { + "epoch": 2.7723479024818802, + "grad_norm": 1.9795371294021606, + "learning_rate": 5e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7572377920150757, + "num_tokens": 653289504.0, + "step": 25245 + }, + { + "epoch": 2.7724577201844935, + "grad_norm": 2.0531375408172607, + "learning_rate": 5e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7652750015258789, + "num_tokens": 653316251.0, + "step": 25246 + }, + { + "epoch": 2.7725675378871073, + "grad_norm": 2.1699886322021484, + "learning_rate": 5e-06, + "loss": 0.6383, + "mean_token_accuracy": 0.7850384712219238, + "num_tokens": 653338996.0, + "step": 25247 + }, + { + "epoch": 2.772677355589721, + "grad_norm": 2.00287127494812, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.75803542137146, + "num_tokens": 653371517.0, + "step": 25248 + }, + { + "epoch": 2.772787173292335, + "grad_norm": 2.1925878524780273, + "learning_rate": 5e-06, + "loss": 0.7132, + "mean_token_accuracy": 0.7693479061126709, + "num_tokens": 653394755.0, + "step": 25249 + }, + { + "epoch": 2.7728969909949486, + "grad_norm": 2.0125510692596436, + "learning_rate": 5e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7766389846801758, + "num_tokens": 653420867.0, + "step": 25250 + }, + { + "epoch": 2.773006808697562, + "grad_norm": 2.241764783859253, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7727056741714478, + "num_tokens": 653443613.0, + "step": 25251 + }, + { + "epoch": 2.7731166264001756, + "grad_norm": 2.1648998260498047, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7653698325157166, + "num_tokens": 653467035.0, + "step": 25252 + }, + { + "epoch": 2.7732264441027894, + "grad_norm": 2.1801397800445557, + "learning_rate": 5e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7746629118919373, + "num_tokens": 653489736.0, + "step": 25253 + }, + { + "epoch": 2.773336261805403, + "grad_norm": 1.9103790521621704, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7709746360778809, + "num_tokens": 653518357.0, + "step": 25254 + }, + { + "epoch": 2.773446079508017, + "grad_norm": 2.1489999294281006, + "learning_rate": 5e-06, + "loss": 0.666, + "mean_token_accuracy": 0.7792873978614807, + "num_tokens": 653542331.0, + "step": 25255 + }, + { + "epoch": 2.7735558972106302, + "grad_norm": 2.001434087753296, + "learning_rate": 5e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.753836989402771, + "num_tokens": 653570572.0, + "step": 25256 + }, + { + "epoch": 2.773665714913244, + "grad_norm": 1.9618092775344849, + "learning_rate": 5e-06, + "loss": 0.6924, + "mean_token_accuracy": 0.7688672542572021, + "num_tokens": 653595790.0, + "step": 25257 + }, + { + "epoch": 2.7737755326158577, + "grad_norm": 1.9646607637405396, + "learning_rate": 5e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.757590651512146, + "num_tokens": 653622489.0, + "step": 25258 + }, + { + "epoch": 2.7738853503184715, + "grad_norm": 2.0391814708709717, + "learning_rate": 5e-06, + "loss": 0.7011, + "mean_token_accuracy": 0.764237105846405, + "num_tokens": 653649027.0, + "step": 25259 + }, + { + "epoch": 2.7739951680210853, + "grad_norm": 2.0121724605560303, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7485177516937256, + "num_tokens": 653675525.0, + "step": 25260 + }, + { + "epoch": 2.7741049857236986, + "grad_norm": 2.239447832107544, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7614094018936157, + "num_tokens": 653697306.0, + "step": 25261 + }, + { + "epoch": 2.7742148034263123, + "grad_norm": 2.1666500568389893, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7678440809249878, + "num_tokens": 653718511.0, + "step": 25262 + }, + { + "epoch": 2.774324621128926, + "grad_norm": 2.2623839378356934, + "learning_rate": 5e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.7758398056030273, + "num_tokens": 653742524.0, + "step": 25263 + }, + { + "epoch": 2.7744344388315394, + "grad_norm": 1.7862039804458618, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7600435018539429, + "num_tokens": 653777222.0, + "step": 25264 + }, + { + "epoch": 2.774544256534153, + "grad_norm": 2.317625045776367, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7754783630371094, + "num_tokens": 653798206.0, + "step": 25265 + }, + { + "epoch": 2.774654074236767, + "grad_norm": 1.974892258644104, + "learning_rate": 5e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7560529708862305, + "num_tokens": 653825988.0, + "step": 25266 + }, + { + "epoch": 2.7747638919393807, + "grad_norm": 1.8475409746170044, + "learning_rate": 5e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7556283473968506, + "num_tokens": 653855865.0, + "step": 25267 + }, + { + "epoch": 2.7748737096419944, + "grad_norm": 2.284198522567749, + "learning_rate": 5e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7687255144119263, + "num_tokens": 653877621.0, + "step": 25268 + }, + { + "epoch": 2.7749835273446077, + "grad_norm": 1.8756426572799683, + "learning_rate": 5e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.7670817375183105, + "num_tokens": 653906559.0, + "step": 25269 + }, + { + "epoch": 2.7750933450472215, + "grad_norm": 1.91641104221344, + "learning_rate": 5e-06, + "loss": 0.7343, + "mean_token_accuracy": 0.7585007548332214, + "num_tokens": 653936170.0, + "step": 25270 + }, + { + "epoch": 2.7752031627498353, + "grad_norm": 2.025465726852417, + "learning_rate": 5e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7433422803878784, + "num_tokens": 653965914.0, + "step": 25271 + }, + { + "epoch": 2.775312980452449, + "grad_norm": 2.051384210586548, + "learning_rate": 5e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7656543254852295, + "num_tokens": 653992198.0, + "step": 25272 + }, + { + "epoch": 2.7754227981550628, + "grad_norm": 2.119659185409546, + "learning_rate": 5e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.7584536075592041, + "num_tokens": 654019027.0, + "step": 25273 + }, + { + "epoch": 2.775532615857676, + "grad_norm": 2.279197931289673, + "learning_rate": 5e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7709447145462036, + "num_tokens": 654041838.0, + "step": 25274 + }, + { + "epoch": 2.77564243356029, + "grad_norm": 1.814002513885498, + "learning_rate": 5e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7430720329284668, + "num_tokens": 654073673.0, + "step": 25275 + }, + { + "epoch": 2.7757522512629036, + "grad_norm": 1.8076668977737427, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7526282072067261, + "num_tokens": 654106892.0, + "step": 25276 + }, + { + "epoch": 2.7758620689655173, + "grad_norm": 2.212425947189331, + "learning_rate": 5e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7636045217514038, + "num_tokens": 654130109.0, + "step": 25277 + }, + { + "epoch": 2.775971886668131, + "grad_norm": 1.822182297706604, + "learning_rate": 5e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.754173994064331, + "num_tokens": 654162325.0, + "step": 25278 + }, + { + "epoch": 2.7760817043707444, + "grad_norm": 2.2447409629821777, + "learning_rate": 5e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7814109325408936, + "num_tokens": 654184435.0, + "step": 25279 + }, + { + "epoch": 2.776191522073358, + "grad_norm": 2.393052577972412, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7621204853057861, + "num_tokens": 654206691.0, + "step": 25280 + }, + { + "epoch": 2.776301339775972, + "grad_norm": 1.9962477684020996, + "learning_rate": 5e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7681652307510376, + "num_tokens": 654236427.0, + "step": 25281 + }, + { + "epoch": 2.7764111574785857, + "grad_norm": 2.167839527130127, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7558771967887878, + "num_tokens": 654260999.0, + "step": 25282 + }, + { + "epoch": 2.7765209751811994, + "grad_norm": 2.073944568634033, + "learning_rate": 5e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7492167949676514, + "num_tokens": 654284870.0, + "step": 25283 + }, + { + "epoch": 2.7766307928838128, + "grad_norm": 2.0500407218933105, + "learning_rate": 5e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7637515068054199, + "num_tokens": 654308668.0, + "step": 25284 + }, + { + "epoch": 2.7767406105864265, + "grad_norm": 1.9178917407989502, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7516728639602661, + "num_tokens": 654337038.0, + "step": 25285 + }, + { + "epoch": 2.7768504282890403, + "grad_norm": 1.9085103273391724, + "learning_rate": 5e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7350839972496033, + "num_tokens": 654366317.0, + "step": 25286 + }, + { + "epoch": 2.7769602459916536, + "grad_norm": 2.3898770809173584, + "learning_rate": 5e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.7724112272262573, + "num_tokens": 654386354.0, + "step": 25287 + }, + { + "epoch": 2.777070063694268, + "grad_norm": 2.0628671646118164, + "learning_rate": 5e-06, + "loss": 0.7239, + "mean_token_accuracy": 0.7603228092193604, + "num_tokens": 654411876.0, + "step": 25288 + }, + { + "epoch": 2.777179881396881, + "grad_norm": 1.7143659591674805, + "learning_rate": 5e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7334879636764526, + "num_tokens": 654447923.0, + "step": 25289 + }, + { + "epoch": 2.777289699099495, + "grad_norm": 2.2313272953033447, + "learning_rate": 5e-06, + "loss": 0.6916, + "mean_token_accuracy": 0.7675354480743408, + "num_tokens": 654470499.0, + "step": 25290 + }, + { + "epoch": 2.7773995168021086, + "grad_norm": 2.0477373600006104, + "learning_rate": 5e-06, + "loss": 0.7202, + "mean_token_accuracy": 0.764327347278595, + "num_tokens": 654495798.0, + "step": 25291 + }, + { + "epoch": 2.777509334504722, + "grad_norm": 2.1859209537506104, + "learning_rate": 5e-06, + "loss": 0.5902, + "mean_token_accuracy": 0.7974393367767334, + "num_tokens": 654518589.0, + "step": 25292 + }, + { + "epoch": 2.7776191522073357, + "grad_norm": 2.1346957683563232, + "learning_rate": 5e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7348698377609253, + "num_tokens": 654545310.0, + "step": 25293 + }, + { + "epoch": 2.7777289699099494, + "grad_norm": 2.0135703086853027, + "learning_rate": 5e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7689697742462158, + "num_tokens": 654573446.0, + "step": 25294 + }, + { + "epoch": 2.777838787612563, + "grad_norm": 2.0520882606506348, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7646027207374573, + "num_tokens": 654602266.0, + "step": 25295 + }, + { + "epoch": 2.777948605315177, + "grad_norm": 2.1550543308258057, + "learning_rate": 5e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7363177537918091, + "num_tokens": 654628977.0, + "step": 25296 + }, + { + "epoch": 2.7780584230177903, + "grad_norm": 2.1202542781829834, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.76948082447052, + "num_tokens": 654652955.0, + "step": 25297 + }, + { + "epoch": 2.778168240720404, + "grad_norm": 1.9397979974746704, + "learning_rate": 5e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7464644312858582, + "num_tokens": 654680992.0, + "step": 25298 + }, + { + "epoch": 2.778278058423018, + "grad_norm": 1.9662359952926636, + "learning_rate": 5e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.776449978351593, + "num_tokens": 654706394.0, + "step": 25299 + }, + { + "epoch": 2.7783878761256315, + "grad_norm": 1.9230433702468872, + "learning_rate": 5e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7462456226348877, + "num_tokens": 654735788.0, + "step": 25300 + }, + { + "epoch": 2.7784976938282453, + "grad_norm": 2.046168088912964, + "learning_rate": 5e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7745616436004639, + "num_tokens": 654762890.0, + "step": 25301 + }, + { + "epoch": 2.7786075115308586, + "grad_norm": 2.352965831756592, + "learning_rate": 5e-06, + "loss": 0.5914, + "mean_token_accuracy": 0.7932695150375366, + "num_tokens": 654781707.0, + "step": 25302 + }, + { + "epoch": 2.7787173292334724, + "grad_norm": 2.3255910873413086, + "learning_rate": 5e-06, + "loss": 0.7237, + "mean_token_accuracy": 0.7675669193267822, + "num_tokens": 654801653.0, + "step": 25303 + }, + { + "epoch": 2.778827146936086, + "grad_norm": 2.1825430393218994, + "learning_rate": 5e-06, + "loss": 0.6883, + "mean_token_accuracy": 0.7704454660415649, + "num_tokens": 654825387.0, + "step": 25304 + }, + { + "epoch": 2.7789369646387, + "grad_norm": 2.0032942295074463, + "learning_rate": 5e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7565492391586304, + "num_tokens": 654853372.0, + "step": 25305 + }, + { + "epoch": 2.7790467823413136, + "grad_norm": 2.0776329040527344, + "learning_rate": 5e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7659662365913391, + "num_tokens": 654878236.0, + "step": 25306 + }, + { + "epoch": 2.779156600043927, + "grad_norm": 2.2909557819366455, + "learning_rate": 5e-06, + "loss": 0.6121, + "mean_token_accuracy": 0.7954748868942261, + "num_tokens": 654897391.0, + "step": 25307 + }, + { + "epoch": 2.7792664177465407, + "grad_norm": 2.0388925075531006, + "learning_rate": 5e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7705144286155701, + "num_tokens": 654922513.0, + "step": 25308 + }, + { + "epoch": 2.7793762354491545, + "grad_norm": 2.151387929916382, + "learning_rate": 5e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7724725604057312, + "num_tokens": 654946467.0, + "step": 25309 + }, + { + "epoch": 2.779486053151768, + "grad_norm": 2.3788366317749023, + "learning_rate": 5e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7547640204429626, + "num_tokens": 654968185.0, + "step": 25310 + }, + { + "epoch": 2.779595870854382, + "grad_norm": 2.030783176422119, + "learning_rate": 5e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7578327655792236, + "num_tokens": 654995444.0, + "step": 25311 + }, + { + "epoch": 2.7797056885569953, + "grad_norm": 2.4065287113189697, + "learning_rate": 5e-06, + "loss": 0.6344, + "mean_token_accuracy": 0.7870151996612549, + "num_tokens": 655014615.0, + "step": 25312 + }, + { + "epoch": 2.779815506259609, + "grad_norm": 2.0496294498443604, + "learning_rate": 5e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7295387983322144, + "num_tokens": 655044938.0, + "step": 25313 + }, + { + "epoch": 2.779925323962223, + "grad_norm": 1.9718021154403687, + "learning_rate": 5e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7432513236999512, + "num_tokens": 655073012.0, + "step": 25314 + }, + { + "epoch": 2.780035141664836, + "grad_norm": 2.2975876331329346, + "learning_rate": 5e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7695205807685852, + "num_tokens": 655092921.0, + "step": 25315 + }, + { + "epoch": 2.78014495936745, + "grad_norm": 1.9900641441345215, + "learning_rate": 5e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7487192153930664, + "num_tokens": 655120423.0, + "step": 25316 + }, + { + "epoch": 2.7802547770700636, + "grad_norm": 2.1688477993011475, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.757010281085968, + "num_tokens": 655147615.0, + "step": 25317 + }, + { + "epoch": 2.7803645947726774, + "grad_norm": 1.8292806148529053, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7409461140632629, + "num_tokens": 655179925.0, + "step": 25318 + }, + { + "epoch": 2.780474412475291, + "grad_norm": 2.103076696395874, + "learning_rate": 5e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.7559970617294312, + "num_tokens": 655205020.0, + "step": 25319 + }, + { + "epoch": 2.7805842301779045, + "grad_norm": 2.1039187908172607, + "learning_rate": 5e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7637351155281067, + "num_tokens": 655229287.0, + "step": 25320 + }, + { + "epoch": 2.780694047880518, + "grad_norm": 2.223808526992798, + "learning_rate": 5e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7632193565368652, + "num_tokens": 655251815.0, + "step": 25321 + }, + { + "epoch": 2.780803865583132, + "grad_norm": 2.3332600593566895, + "learning_rate": 5e-06, + "loss": 0.6509, + "mean_token_accuracy": 0.7890511751174927, + "num_tokens": 655269669.0, + "step": 25322 + }, + { + "epoch": 2.7809136832857457, + "grad_norm": 2.284308910369873, + "learning_rate": 5e-06, + "loss": 0.6132, + "mean_token_accuracy": 0.7948050498962402, + "num_tokens": 655290922.0, + "step": 25323 + }, + { + "epoch": 2.7810235009883595, + "grad_norm": 2.1049251556396484, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7435133457183838, + "num_tokens": 655315984.0, + "step": 25324 + }, + { + "epoch": 2.781133318690973, + "grad_norm": 2.0012528896331787, + "learning_rate": 5e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7639914155006409, + "num_tokens": 655345186.0, + "step": 25325 + }, + { + "epoch": 2.7812431363935866, + "grad_norm": 2.266533851623535, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7708731889724731, + "num_tokens": 655368181.0, + "step": 25326 + }, + { + "epoch": 2.7813529540962003, + "grad_norm": 1.8061256408691406, + "learning_rate": 5e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7605999708175659, + "num_tokens": 655399898.0, + "step": 25327 + }, + { + "epoch": 2.781462771798814, + "grad_norm": 2.1226465702056885, + "learning_rate": 5e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7421485185623169, + "num_tokens": 655428837.0, + "step": 25328 + }, + { + "epoch": 2.781572589501428, + "grad_norm": 2.0372049808502197, + "learning_rate": 5e-06, + "loss": 0.6466, + "mean_token_accuracy": 0.7833083868026733, + "num_tokens": 655451167.0, + "step": 25329 + }, + { + "epoch": 2.781682407204041, + "grad_norm": 2.0770492553710938, + "learning_rate": 5e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.7769644260406494, + "num_tokens": 655476682.0, + "step": 25330 + }, + { + "epoch": 2.781792224906655, + "grad_norm": 1.8532756567001343, + "learning_rate": 5e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7378765344619751, + "num_tokens": 655509481.0, + "step": 25331 + }, + { + "epoch": 2.7819020426092687, + "grad_norm": 2.2873659133911133, + "learning_rate": 5e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7663304805755615, + "num_tokens": 655532404.0, + "step": 25332 + }, + { + "epoch": 2.7820118603118824, + "grad_norm": 1.8301031589508057, + "learning_rate": 5e-06, + "loss": 0.6969, + "mean_token_accuracy": 0.7667926549911499, + "num_tokens": 655562628.0, + "step": 25333 + }, + { + "epoch": 2.782121678014496, + "grad_norm": 1.9143686294555664, + "learning_rate": 5e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.76638263463974, + "num_tokens": 655590904.0, + "step": 25334 + }, + { + "epoch": 2.7822314957171095, + "grad_norm": 2.1362802982330322, + "learning_rate": 5e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7628313302993774, + "num_tokens": 655616327.0, + "step": 25335 + }, + { + "epoch": 2.7823413134197232, + "grad_norm": 2.2786452770233154, + "learning_rate": 5e-06, + "loss": 0.7177, + "mean_token_accuracy": 0.7670038938522339, + "num_tokens": 655637265.0, + "step": 25336 + }, + { + "epoch": 2.782451131122337, + "grad_norm": 2.1445515155792236, + "learning_rate": 5e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7772613763809204, + "num_tokens": 655661283.0, + "step": 25337 + }, + { + "epoch": 2.7825609488249503, + "grad_norm": 2.026944637298584, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7355210185050964, + "num_tokens": 655687624.0, + "step": 25338 + }, + { + "epoch": 2.7826707665275645, + "grad_norm": 1.9222869873046875, + "learning_rate": 5e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7603031396865845, + "num_tokens": 655714571.0, + "step": 25339 + }, + { + "epoch": 2.782780584230178, + "grad_norm": 1.9759396314620972, + "learning_rate": 5e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7417356967926025, + "num_tokens": 655743761.0, + "step": 25340 + }, + { + "epoch": 2.7828904019327916, + "grad_norm": 2.0181005001068115, + "learning_rate": 5e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7459009289741516, + "num_tokens": 655771184.0, + "step": 25341 + }, + { + "epoch": 2.7830002196354053, + "grad_norm": 1.893911600112915, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7578073143959045, + "num_tokens": 655799710.0, + "step": 25342 + }, + { + "epoch": 2.7831100373380186, + "grad_norm": 1.995530366897583, + "learning_rate": 5e-06, + "loss": 0.6895, + "mean_token_accuracy": 0.7739765048027039, + "num_tokens": 655826669.0, + "step": 25343 + }, + { + "epoch": 2.7832198550406324, + "grad_norm": 2.1107072830200195, + "learning_rate": 5e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7550173997879028, + "num_tokens": 655851911.0, + "step": 25344 + }, + { + "epoch": 2.783329672743246, + "grad_norm": 2.4546451568603516, + "learning_rate": 5e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.7718074321746826, + "num_tokens": 655870629.0, + "step": 25345 + }, + { + "epoch": 2.78343949044586, + "grad_norm": 1.938957691192627, + "learning_rate": 5e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7549400329589844, + "num_tokens": 655900679.0, + "step": 25346 + }, + { + "epoch": 2.7835493081484737, + "grad_norm": 2.406905174255371, + "learning_rate": 5e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7570604681968689, + "num_tokens": 655921885.0, + "step": 25347 + }, + { + "epoch": 2.783659125851087, + "grad_norm": 2.179216146469116, + "learning_rate": 5e-06, + "loss": 0.624, + "mean_token_accuracy": 0.7901744842529297, + "num_tokens": 655943389.0, + "step": 25348 + }, + { + "epoch": 2.7837689435537007, + "grad_norm": 2.013607978820801, + "learning_rate": 5e-06, + "loss": 0.7503, + "mean_token_accuracy": 0.7612496614456177, + "num_tokens": 655970204.0, + "step": 25349 + }, + { + "epoch": 2.7838787612563145, + "grad_norm": 1.9728291034698486, + "learning_rate": 5e-06, + "loss": 0.6781, + "mean_token_accuracy": 0.774554967880249, + "num_tokens": 655996057.0, + "step": 25350 + }, + { + "epoch": 2.7839885789589283, + "grad_norm": 1.9390227794647217, + "learning_rate": 5e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7530705332756042, + "num_tokens": 656025306.0, + "step": 25351 + }, + { + "epoch": 2.784098396661542, + "grad_norm": 2.318876266479492, + "learning_rate": 5e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7537966370582581, + "num_tokens": 656047884.0, + "step": 25352 + }, + { + "epoch": 2.7842082143641553, + "grad_norm": 1.9374743700027466, + "learning_rate": 5e-06, + "loss": 0.6742, + "mean_token_accuracy": 0.7707485556602478, + "num_tokens": 656077003.0, + "step": 25353 + }, + { + "epoch": 2.784318032066769, + "grad_norm": 1.8694093227386475, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7665683031082153, + "num_tokens": 656106496.0, + "step": 25354 + }, + { + "epoch": 2.784427849769383, + "grad_norm": 2.0018653869628906, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7544792294502258, + "num_tokens": 656135273.0, + "step": 25355 + }, + { + "epoch": 2.7845376674719966, + "grad_norm": 2.230947494506836, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7644794583320618, + "num_tokens": 656158009.0, + "step": 25356 + }, + { + "epoch": 2.7846474851746104, + "grad_norm": 1.9876595735549927, + "learning_rate": 5e-06, + "loss": 0.719, + "mean_token_accuracy": 0.760846734046936, + "num_tokens": 656187295.0, + "step": 25357 + }, + { + "epoch": 2.7847573028772237, + "grad_norm": 1.839261531829834, + "learning_rate": 5e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.756496012210846, + "num_tokens": 656219296.0, + "step": 25358 + }, + { + "epoch": 2.7848671205798374, + "grad_norm": 2.00813627243042, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7750786542892456, + "num_tokens": 656247109.0, + "step": 25359 + }, + { + "epoch": 2.784976938282451, + "grad_norm": 1.9344594478607178, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7706605195999146, + "num_tokens": 656276467.0, + "step": 25360 + }, + { + "epoch": 2.785086755985065, + "grad_norm": 2.105088233947754, + "learning_rate": 5e-06, + "loss": 0.6267, + "mean_token_accuracy": 0.791130781173706, + "num_tokens": 656302583.0, + "step": 25361 + }, + { + "epoch": 2.7851965736876787, + "grad_norm": 2.057762861251831, + "learning_rate": 5e-06, + "loss": 0.6694, + "mean_token_accuracy": 0.7712134122848511, + "num_tokens": 656327951.0, + "step": 25362 + }, + { + "epoch": 2.785306391390292, + "grad_norm": 2.084233283996582, + "learning_rate": 5e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7687026858329773, + "num_tokens": 656354106.0, + "step": 25363 + }, + { + "epoch": 2.7854162090929058, + "grad_norm": 2.057281255722046, + "learning_rate": 5e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.7872112393379211, + "num_tokens": 656380978.0, + "step": 25364 + }, + { + "epoch": 2.7855260267955195, + "grad_norm": 2.130932092666626, + "learning_rate": 5e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7474823594093323, + "num_tokens": 656405733.0, + "step": 25365 + }, + { + "epoch": 2.785635844498133, + "grad_norm": 2.0971803665161133, + "learning_rate": 5e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7582619190216064, + "num_tokens": 656432486.0, + "step": 25366 + }, + { + "epoch": 2.7857456622007466, + "grad_norm": 2.1924917697906494, + "learning_rate": 5e-06, + "loss": 0.5968, + "mean_token_accuracy": 0.8010080456733704, + "num_tokens": 656452944.0, + "step": 25367 + }, + { + "epoch": 2.7858554799033604, + "grad_norm": 2.1487085819244385, + "learning_rate": 5e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7700234651565552, + "num_tokens": 656476876.0, + "step": 25368 + }, + { + "epoch": 2.785965297605974, + "grad_norm": 1.887610912322998, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7564089894294739, + "num_tokens": 656507905.0, + "step": 25369 + }, + { + "epoch": 2.786075115308588, + "grad_norm": 1.986363172531128, + "learning_rate": 5e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7618774771690369, + "num_tokens": 656535923.0, + "step": 25370 + }, + { + "epoch": 2.786184933011201, + "grad_norm": 2.0790417194366455, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7589282393455505, + "num_tokens": 656562271.0, + "step": 25371 + }, + { + "epoch": 2.786294750713815, + "grad_norm": 2.227710723876953, + "learning_rate": 5e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7471019625663757, + "num_tokens": 656588766.0, + "step": 25372 + }, + { + "epoch": 2.7864045684164287, + "grad_norm": 1.8951178789138794, + "learning_rate": 5e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7321587204933167, + "num_tokens": 656620821.0, + "step": 25373 + }, + { + "epoch": 2.7865143861190425, + "grad_norm": 2.0729470252990723, + "learning_rate": 5e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7634881734848022, + "num_tokens": 656645091.0, + "step": 25374 + }, + { + "epoch": 2.786624203821656, + "grad_norm": 2.16828989982605, + "learning_rate": 5e-06, + "loss": 0.7405, + "mean_token_accuracy": 0.757358193397522, + "num_tokens": 656668730.0, + "step": 25375 + }, + { + "epoch": 2.7867340215242695, + "grad_norm": 2.212632417678833, + "learning_rate": 5e-06, + "loss": 0.7037, + "mean_token_accuracy": 0.7622890472412109, + "num_tokens": 656692559.0, + "step": 25376 + }, + { + "epoch": 2.7868438392268833, + "grad_norm": 2.019514560699463, + "learning_rate": 5e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7715873718261719, + "num_tokens": 656719920.0, + "step": 25377 + }, + { + "epoch": 2.786953656929497, + "grad_norm": 1.9751038551330566, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7555509805679321, + "num_tokens": 656747281.0, + "step": 25378 + }, + { + "epoch": 2.787063474632111, + "grad_norm": 1.817291021347046, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7636767625808716, + "num_tokens": 656780834.0, + "step": 25379 + }, + { + "epoch": 2.7871732923347246, + "grad_norm": 1.8974063396453857, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7414703369140625, + "num_tokens": 656811084.0, + "step": 25380 + }, + { + "epoch": 2.787283110037338, + "grad_norm": 1.9475353956222534, + "learning_rate": 5e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7632008790969849, + "num_tokens": 656839402.0, + "step": 25381 + }, + { + "epoch": 2.7873929277399516, + "grad_norm": 2.2548885345458984, + "learning_rate": 5e-06, + "loss": 0.7251, + "mean_token_accuracy": 0.7656274437904358, + "num_tokens": 656862167.0, + "step": 25382 + }, + { + "epoch": 2.7875027454425654, + "grad_norm": 2.1427419185638428, + "learning_rate": 5e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7570512890815735, + "num_tokens": 656885439.0, + "step": 25383 + }, + { + "epoch": 2.787612563145179, + "grad_norm": 1.8243968486785889, + "learning_rate": 5e-06, + "loss": 0.7664, + "mean_token_accuracy": 0.7509878277778625, + "num_tokens": 656918955.0, + "step": 25384 + }, + { + "epoch": 2.787722380847793, + "grad_norm": 2.139875650405884, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7518107295036316, + "num_tokens": 656944066.0, + "step": 25385 + }, + { + "epoch": 2.787832198550406, + "grad_norm": 2.296778440475464, + "learning_rate": 5e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7687009572982788, + "num_tokens": 656967203.0, + "step": 25386 + }, + { + "epoch": 2.78794201625302, + "grad_norm": 2.2482993602752686, + "learning_rate": 5e-06, + "loss": 0.6728, + "mean_token_accuracy": 0.7758723497390747, + "num_tokens": 656988131.0, + "step": 25387 + }, + { + "epoch": 2.7880518339556337, + "grad_norm": 2.040701150894165, + "learning_rate": 5e-06, + "loss": 0.6744, + "mean_token_accuracy": 0.7764787673950195, + "num_tokens": 657012307.0, + "step": 25388 + }, + { + "epoch": 2.7881616516582475, + "grad_norm": 1.875334620475769, + "learning_rate": 5e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.749316394329071, + "num_tokens": 657042380.0, + "step": 25389 + }, + { + "epoch": 2.7882714693608612, + "grad_norm": 1.97993004322052, + "learning_rate": 5e-06, + "loss": 0.6715, + "mean_token_accuracy": 0.7770076990127563, + "num_tokens": 657067978.0, + "step": 25390 + }, + { + "epoch": 2.7883812870634745, + "grad_norm": 2.0659658908843994, + "learning_rate": 5e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7653311491012573, + "num_tokens": 657091734.0, + "step": 25391 + }, + { + "epoch": 2.7884911047660883, + "grad_norm": 2.0922725200653076, + "learning_rate": 5e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7502185702323914, + "num_tokens": 657116840.0, + "step": 25392 + }, + { + "epoch": 2.788600922468702, + "grad_norm": 2.6328961849212646, + "learning_rate": 5e-06, + "loss": 0.649, + "mean_token_accuracy": 0.7846447229385376, + "num_tokens": 657133952.0, + "step": 25393 + }, + { + "epoch": 2.7887107401713154, + "grad_norm": 2.43192458152771, + "learning_rate": 5e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7668780088424683, + "num_tokens": 657154127.0, + "step": 25394 + }, + { + "epoch": 2.788820557873929, + "grad_norm": 2.049058675765991, + "learning_rate": 5e-06, + "loss": 0.6661, + "mean_token_accuracy": 0.7779262065887451, + "num_tokens": 657178041.0, + "step": 25395 + }, + { + "epoch": 2.788930375576543, + "grad_norm": 1.8618415594100952, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7383622527122498, + "num_tokens": 657209615.0, + "step": 25396 + }, + { + "epoch": 2.7890401932791566, + "grad_norm": 1.9604365825653076, + "learning_rate": 5e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7462958097457886, + "num_tokens": 657240341.0, + "step": 25397 + }, + { + "epoch": 2.7891500109817704, + "grad_norm": 2.1594762802124023, + "learning_rate": 5e-06, + "loss": 0.7175, + "mean_token_accuracy": 0.759560227394104, + "num_tokens": 657265588.0, + "step": 25398 + }, + { + "epoch": 2.7892598286843837, + "grad_norm": 2.1230132579803467, + "learning_rate": 5e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7650666832923889, + "num_tokens": 657290193.0, + "step": 25399 + }, + { + "epoch": 2.7893696463869975, + "grad_norm": 2.144472360610962, + "learning_rate": 5e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7519894242286682, + "num_tokens": 657316127.0, + "step": 25400 + }, + { + "epoch": 2.7894794640896112, + "grad_norm": 1.8311316967010498, + "learning_rate": 5e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7368749380111694, + "num_tokens": 657351883.0, + "step": 25401 + }, + { + "epoch": 2.789589281792225, + "grad_norm": 2.1572327613830566, + "learning_rate": 5e-06, + "loss": 0.7086, + "mean_token_accuracy": 0.7662506103515625, + "num_tokens": 657374914.0, + "step": 25402 + }, + { + "epoch": 2.7896990994948387, + "grad_norm": 2.165064573287964, + "learning_rate": 5e-06, + "loss": 0.6425, + "mean_token_accuracy": 0.7877163887023926, + "num_tokens": 657396279.0, + "step": 25403 + }, + { + "epoch": 2.789808917197452, + "grad_norm": 2.2299742698669434, + "learning_rate": 5e-06, + "loss": 0.6227, + "mean_token_accuracy": 0.7913400530815125, + "num_tokens": 657418158.0, + "step": 25404 + }, + { + "epoch": 2.789918734900066, + "grad_norm": 2.044250965118408, + "learning_rate": 5e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7408071756362915, + "num_tokens": 657449062.0, + "step": 25405 + }, + { + "epoch": 2.7900285526026796, + "grad_norm": 2.1804637908935547, + "learning_rate": 5e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7750306129455566, + "num_tokens": 657473046.0, + "step": 25406 + }, + { + "epoch": 2.7901383703052933, + "grad_norm": 2.1965155601501465, + "learning_rate": 5e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7773751616477966, + "num_tokens": 657494768.0, + "step": 25407 + }, + { + "epoch": 2.790248188007907, + "grad_norm": 2.059854745864868, + "learning_rate": 5e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.74189293384552, + "num_tokens": 657520903.0, + "step": 25408 + }, + { + "epoch": 2.7903580057105204, + "grad_norm": 2.0252115726470947, + "learning_rate": 5e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.77721107006073, + "num_tokens": 657549060.0, + "step": 25409 + }, + { + "epoch": 2.790467823413134, + "grad_norm": 1.974378228187561, + "learning_rate": 5e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7579467296600342, + "num_tokens": 657577262.0, + "step": 25410 + }, + { + "epoch": 2.790577641115748, + "grad_norm": 1.863174557685852, + "learning_rate": 5e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7621061205863953, + "num_tokens": 657607327.0, + "step": 25411 + }, + { + "epoch": 2.7906874588183617, + "grad_norm": 2.090297222137451, + "learning_rate": 5e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7490241527557373, + "num_tokens": 657634175.0, + "step": 25412 + }, + { + "epoch": 2.7907972765209754, + "grad_norm": 2.0561718940734863, + "learning_rate": 5e-06, + "loss": 0.7571, + "mean_token_accuracy": 0.7464041113853455, + "num_tokens": 657659504.0, + "step": 25413 + }, + { + "epoch": 2.7909070942235887, + "grad_norm": 2.352769136428833, + "learning_rate": 5e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.7866979837417603, + "num_tokens": 657678429.0, + "step": 25414 + }, + { + "epoch": 2.7910169119262025, + "grad_norm": 2.139429807662964, + "learning_rate": 5e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7614847421646118, + "num_tokens": 657702866.0, + "step": 25415 + }, + { + "epoch": 2.7911267296288162, + "grad_norm": 2.076944589614868, + "learning_rate": 5e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.764812707901001, + "num_tokens": 657728928.0, + "step": 25416 + }, + { + "epoch": 2.7912365473314296, + "grad_norm": 2.1630923748016357, + "learning_rate": 5e-06, + "loss": 0.7397, + "mean_token_accuracy": 0.7657303214073181, + "num_tokens": 657754264.0, + "step": 25417 + }, + { + "epoch": 2.7913463650340438, + "grad_norm": 1.9775006771087646, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7647988200187683, + "num_tokens": 657780715.0, + "step": 25418 + }, + { + "epoch": 2.791456182736657, + "grad_norm": 2.1636862754821777, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7646704912185669, + "num_tokens": 657804139.0, + "step": 25419 + }, + { + "epoch": 2.791566000439271, + "grad_norm": 2.145775556564331, + "learning_rate": 5e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7604188919067383, + "num_tokens": 657828648.0, + "step": 25420 + }, + { + "epoch": 2.7916758181418846, + "grad_norm": 1.94718337059021, + "learning_rate": 5e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7614455223083496, + "num_tokens": 657856912.0, + "step": 25421 + }, + { + "epoch": 2.791785635844498, + "grad_norm": 1.8803006410598755, + "learning_rate": 5e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.751436710357666, + "num_tokens": 657887148.0, + "step": 25422 + }, + { + "epoch": 2.7918954535471117, + "grad_norm": 1.7483488321304321, + "learning_rate": 5e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7689806222915649, + "num_tokens": 657919221.0, + "step": 25423 + }, + { + "epoch": 2.7920052712497254, + "grad_norm": 2.1802492141723633, + "learning_rate": 5e-06, + "loss": 0.7061, + "mean_token_accuracy": 0.765669584274292, + "num_tokens": 657942150.0, + "step": 25424 + }, + { + "epoch": 2.792115088952339, + "grad_norm": 2.3295769691467285, + "learning_rate": 5e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7752286195755005, + "num_tokens": 657962499.0, + "step": 25425 + }, + { + "epoch": 2.792224906654953, + "grad_norm": 2.1959240436553955, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7551519870758057, + "num_tokens": 657985739.0, + "step": 25426 + }, + { + "epoch": 2.7923347243575662, + "grad_norm": 1.8862720727920532, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7539491057395935, + "num_tokens": 658015578.0, + "step": 25427 + }, + { + "epoch": 2.79244454206018, + "grad_norm": 2.1017816066741943, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.769730269908905, + "num_tokens": 658038644.0, + "step": 25428 + }, + { + "epoch": 2.7925543597627938, + "grad_norm": 2.2264468669891357, + "learning_rate": 5e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7559656500816345, + "num_tokens": 658060985.0, + "step": 25429 + }, + { + "epoch": 2.7926641774654075, + "grad_norm": 2.1162288188934326, + "learning_rate": 5e-06, + "loss": 0.6612, + "mean_token_accuracy": 0.7863525748252869, + "num_tokens": 658084127.0, + "step": 25430 + }, + { + "epoch": 2.7927739951680213, + "grad_norm": 1.9141595363616943, + "learning_rate": 5e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7470763921737671, + "num_tokens": 658114622.0, + "step": 25431 + }, + { + "epoch": 2.7928838128706346, + "grad_norm": 1.9703614711761475, + "learning_rate": 5e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.7601583003997803, + "num_tokens": 658141014.0, + "step": 25432 + }, + { + "epoch": 2.7929936305732483, + "grad_norm": 2.032897472381592, + "learning_rate": 5e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7703317403793335, + "num_tokens": 658165416.0, + "step": 25433 + }, + { + "epoch": 2.793103448275862, + "grad_norm": 1.9910733699798584, + "learning_rate": 5e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7534611821174622, + "num_tokens": 658193620.0, + "step": 25434 + }, + { + "epoch": 2.793213265978476, + "grad_norm": 1.993879795074463, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7737283706665039, + "num_tokens": 658220858.0, + "step": 25435 + }, + { + "epoch": 2.7933230836810896, + "grad_norm": 2.1477808952331543, + "learning_rate": 5e-06, + "loss": 0.7174, + "mean_token_accuracy": 0.7637850046157837, + "num_tokens": 658244969.0, + "step": 25436 + }, + { + "epoch": 2.793432901383703, + "grad_norm": 2.2139129638671875, + "learning_rate": 5e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.7476575374603271, + "num_tokens": 658270173.0, + "step": 25437 + }, + { + "epoch": 2.7935427190863167, + "grad_norm": 2.1392862796783447, + "learning_rate": 5e-06, + "loss": 0.6986, + "mean_token_accuracy": 0.7901824712753296, + "num_tokens": 658293769.0, + "step": 25438 + }, + { + "epoch": 2.7936525367889304, + "grad_norm": 1.9126007556915283, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7671321034431458, + "num_tokens": 658320295.0, + "step": 25439 + }, + { + "epoch": 2.793762354491544, + "grad_norm": 2.1784684658050537, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7442409992218018, + "num_tokens": 658345855.0, + "step": 25440 + }, + { + "epoch": 2.793872172194158, + "grad_norm": 2.174272060394287, + "learning_rate": 5e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7585351467132568, + "num_tokens": 658368950.0, + "step": 25441 + }, + { + "epoch": 2.7939819898967713, + "grad_norm": 2.358315944671631, + "learning_rate": 5e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.784134566783905, + "num_tokens": 658390177.0, + "step": 25442 + }, + { + "epoch": 2.794091807599385, + "grad_norm": 2.2729265689849854, + "learning_rate": 5e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7610344290733337, + "num_tokens": 658411973.0, + "step": 25443 + }, + { + "epoch": 2.794201625301999, + "grad_norm": 1.909227967262268, + "learning_rate": 5e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7453255653381348, + "num_tokens": 658441437.0, + "step": 25444 + }, + { + "epoch": 2.794311443004612, + "grad_norm": 2.2977771759033203, + "learning_rate": 5e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7697522044181824, + "num_tokens": 658461892.0, + "step": 25445 + }, + { + "epoch": 2.794421260707226, + "grad_norm": 1.9456391334533691, + "learning_rate": 5e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.758268415927887, + "num_tokens": 658490935.0, + "step": 25446 + }, + { + "epoch": 2.7945310784098396, + "grad_norm": 2.1233913898468018, + "learning_rate": 5e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.770263671875, + "num_tokens": 658516905.0, + "step": 25447 + }, + { + "epoch": 2.7946408961124534, + "grad_norm": 2.106454372406006, + "learning_rate": 5e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7424379587173462, + "num_tokens": 658543367.0, + "step": 25448 + }, + { + "epoch": 2.794750713815067, + "grad_norm": 2.088207960128784, + "learning_rate": 5e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7531830072402954, + "num_tokens": 658569185.0, + "step": 25449 + }, + { + "epoch": 2.7948605315176804, + "grad_norm": 2.23230242729187, + "learning_rate": 5e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7548156976699829, + "num_tokens": 658594081.0, + "step": 25450 + }, + { + "epoch": 2.794970349220294, + "grad_norm": 2.177229881286621, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7653111815452576, + "num_tokens": 658617739.0, + "step": 25451 + }, + { + "epoch": 2.795080166922908, + "grad_norm": 2.3527138233184814, + "learning_rate": 5e-06, + "loss": 0.6586, + "mean_token_accuracy": 0.7806417942047119, + "num_tokens": 658637253.0, + "step": 25452 + }, + { + "epoch": 2.7951899846255217, + "grad_norm": 2.169884443283081, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.7722408175468445, + "num_tokens": 658659712.0, + "step": 25453 + }, + { + "epoch": 2.7952998023281355, + "grad_norm": 1.939969539642334, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.759678840637207, + "num_tokens": 658687437.0, + "step": 25454 + }, + { + "epoch": 2.7954096200307488, + "grad_norm": 1.9450486898422241, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7591767907142639, + "num_tokens": 658716382.0, + "step": 25455 + }, + { + "epoch": 2.7955194377333625, + "grad_norm": 2.0164897441864014, + "learning_rate": 5e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7548276782035828, + "num_tokens": 658744124.0, + "step": 25456 + }, + { + "epoch": 2.7956292554359763, + "grad_norm": 2.002814769744873, + "learning_rate": 5e-06, + "loss": 0.791, + "mean_token_accuracy": 0.742979884147644, + "num_tokens": 658774148.0, + "step": 25457 + }, + { + "epoch": 2.79573907313859, + "grad_norm": 2.12239408493042, + "learning_rate": 5e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7601566314697266, + "num_tokens": 658796665.0, + "step": 25458 + }, + { + "epoch": 2.795848890841204, + "grad_norm": 2.057501792907715, + "learning_rate": 5e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7525001764297485, + "num_tokens": 658822313.0, + "step": 25459 + }, + { + "epoch": 2.795958708543817, + "grad_norm": 2.0139517784118652, + "learning_rate": 5e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7544938325881958, + "num_tokens": 658849908.0, + "step": 25460 + }, + { + "epoch": 2.796068526246431, + "grad_norm": 1.8930214643478394, + "learning_rate": 5e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7390978336334229, + "num_tokens": 658880482.0, + "step": 25461 + }, + { + "epoch": 2.7961783439490446, + "grad_norm": 2.0370781421661377, + "learning_rate": 5e-06, + "loss": 0.6695, + "mean_token_accuracy": 0.7826941013336182, + "num_tokens": 658904358.0, + "step": 25462 + }, + { + "epoch": 2.7962881616516584, + "grad_norm": 2.380007266998291, + "learning_rate": 5e-06, + "loss": 0.6345, + "mean_token_accuracy": 0.7831753492355347, + "num_tokens": 658924172.0, + "step": 25463 + }, + { + "epoch": 2.796397979354272, + "grad_norm": 2.0225934982299805, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7545409798622131, + "num_tokens": 658950864.0, + "step": 25464 + }, + { + "epoch": 2.7965077970568855, + "grad_norm": 1.9048306941986084, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7646945714950562, + "num_tokens": 658979817.0, + "step": 25465 + }, + { + "epoch": 2.796617614759499, + "grad_norm": 2.1058454513549805, + "learning_rate": 5e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7620218992233276, + "num_tokens": 659005004.0, + "step": 25466 + }, + { + "epoch": 2.796727432462113, + "grad_norm": 2.3067450523376465, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7791242599487305, + "num_tokens": 659026616.0, + "step": 25467 + }, + { + "epoch": 2.7968372501647263, + "grad_norm": 2.130851984024048, + "learning_rate": 5e-06, + "loss": 0.7467, + "mean_token_accuracy": 0.7491657733917236, + "num_tokens": 659051865.0, + "step": 25468 + }, + { + "epoch": 2.7969470678673405, + "grad_norm": 2.302501678466797, + "learning_rate": 5e-06, + "loss": 0.7002, + "mean_token_accuracy": 0.7678948640823364, + "num_tokens": 659074481.0, + "step": 25469 + }, + { + "epoch": 2.797056885569954, + "grad_norm": 1.9588958024978638, + "learning_rate": 5e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7586502432823181, + "num_tokens": 659104866.0, + "step": 25470 + }, + { + "epoch": 2.7971667032725676, + "grad_norm": 2.103415012359619, + "learning_rate": 5e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.761748194694519, + "num_tokens": 659131502.0, + "step": 25471 + }, + { + "epoch": 2.7972765209751813, + "grad_norm": 2.278709888458252, + "learning_rate": 5e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.7534959316253662, + "num_tokens": 659155776.0, + "step": 25472 + }, + { + "epoch": 2.7973863386777946, + "grad_norm": 2.166443347930908, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7728096842765808, + "num_tokens": 659179889.0, + "step": 25473 + }, + { + "epoch": 2.7974961563804084, + "grad_norm": 2.1927640438079834, + "learning_rate": 5e-06, + "loss": 0.7416, + "mean_token_accuracy": 0.7619304060935974, + "num_tokens": 659201848.0, + "step": 25474 + }, + { + "epoch": 2.797605974083022, + "grad_norm": 2.0435049533843994, + "learning_rate": 5e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7694622874259949, + "num_tokens": 659227513.0, + "step": 25475 + }, + { + "epoch": 2.797715791785636, + "grad_norm": 1.8834010362625122, + "learning_rate": 5e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.785132110118866, + "num_tokens": 659255987.0, + "step": 25476 + }, + { + "epoch": 2.7978256094882497, + "grad_norm": 2.0536680221557617, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7593162655830383, + "num_tokens": 659281307.0, + "step": 25477 + }, + { + "epoch": 2.797935427190863, + "grad_norm": 2.0239126682281494, + "learning_rate": 5e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7305567860603333, + "num_tokens": 659308719.0, + "step": 25478 + }, + { + "epoch": 2.7980452448934767, + "grad_norm": 2.0534937381744385, + "learning_rate": 5e-06, + "loss": 0.6309, + "mean_token_accuracy": 0.7992403507232666, + "num_tokens": 659332446.0, + "step": 25479 + }, + { + "epoch": 2.7981550625960905, + "grad_norm": 2.16580867767334, + "learning_rate": 5e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.7756058573722839, + "num_tokens": 659356851.0, + "step": 25480 + }, + { + "epoch": 2.7982648802987042, + "grad_norm": 2.05431866645813, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.7682691812515259, + "num_tokens": 659383392.0, + "step": 25481 + }, + { + "epoch": 2.798374698001318, + "grad_norm": 2.0873262882232666, + "learning_rate": 5e-06, + "loss": 0.6457, + "mean_token_accuracy": 0.779387354850769, + "num_tokens": 659408247.0, + "step": 25482 + }, + { + "epoch": 2.7984845157039313, + "grad_norm": 2.194636344909668, + "learning_rate": 5e-06, + "loss": 0.69, + "mean_token_accuracy": 0.7692599296569824, + "num_tokens": 659430929.0, + "step": 25483 + }, + { + "epoch": 2.798594333406545, + "grad_norm": 2.265702486038208, + "learning_rate": 5e-06, + "loss": 0.7821, + "mean_token_accuracy": 0.7537582516670227, + "num_tokens": 659457508.0, + "step": 25484 + }, + { + "epoch": 2.798704151109159, + "grad_norm": 2.770796537399292, + "learning_rate": 5e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7570747137069702, + "num_tokens": 659481723.0, + "step": 25485 + }, + { + "epoch": 2.7988139688117726, + "grad_norm": 1.9627916812896729, + "learning_rate": 5e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7518345713615417, + "num_tokens": 659510050.0, + "step": 25486 + }, + { + "epoch": 2.7989237865143863, + "grad_norm": 1.7942298650741577, + "learning_rate": 5e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7504814267158508, + "num_tokens": 659542176.0, + "step": 25487 + }, + { + "epoch": 2.7990336042169996, + "grad_norm": 2.112560987472534, + "learning_rate": 5e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.7684841156005859, + "num_tokens": 659567187.0, + "step": 25488 + }, + { + "epoch": 2.7991434219196134, + "grad_norm": 2.0964415073394775, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7702575325965881, + "num_tokens": 659591411.0, + "step": 25489 + }, + { + "epoch": 2.799253239622227, + "grad_norm": 2.1860551834106445, + "learning_rate": 5e-06, + "loss": 0.6247, + "mean_token_accuracy": 0.788025438785553, + "num_tokens": 659612494.0, + "step": 25490 + }, + { + "epoch": 2.799363057324841, + "grad_norm": 2.0822176933288574, + "learning_rate": 5e-06, + "loss": 0.6145, + "mean_token_accuracy": 0.7914128303527832, + "num_tokens": 659635513.0, + "step": 25491 + }, + { + "epoch": 2.7994728750274547, + "grad_norm": 2.0093722343444824, + "learning_rate": 5e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7512267827987671, + "num_tokens": 659662269.0, + "step": 25492 + }, + { + "epoch": 2.799582692730068, + "grad_norm": 1.879220962524414, + "learning_rate": 5e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7771637439727783, + "num_tokens": 659692835.0, + "step": 25493 + }, + { + "epoch": 2.7996925104326817, + "grad_norm": 2.2777156829833984, + "learning_rate": 5e-06, + "loss": 0.692, + "mean_token_accuracy": 0.7688159346580505, + "num_tokens": 659715026.0, + "step": 25494 + }, + { + "epoch": 2.7998023281352955, + "grad_norm": 1.985533356666565, + "learning_rate": 5e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7381219863891602, + "num_tokens": 659744994.0, + "step": 25495 + }, + { + "epoch": 2.799912145837909, + "grad_norm": 2.0029172897338867, + "learning_rate": 5e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7582432627677917, + "num_tokens": 659772500.0, + "step": 25496 + }, + { + "epoch": 2.8000219635405226, + "grad_norm": 2.0111007690429688, + "learning_rate": 5e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7570710182189941, + "num_tokens": 659798723.0, + "step": 25497 + }, + { + "epoch": 2.8001317812431363, + "grad_norm": 1.9309855699539185, + "learning_rate": 5e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.7715294361114502, + "num_tokens": 659827336.0, + "step": 25498 + }, + { + "epoch": 2.80024159894575, + "grad_norm": 1.9794983863830566, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7552987933158875, + "num_tokens": 659855040.0, + "step": 25499 + }, + { + "epoch": 2.800351416648364, + "grad_norm": 2.042949914932251, + "learning_rate": 5e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7656748294830322, + "num_tokens": 659882418.0, + "step": 25500 + }, + { + "epoch": 2.800461234350977, + "grad_norm": 2.0157108306884766, + "learning_rate": 5e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7410905361175537, + "num_tokens": 659912719.0, + "step": 25501 + }, + { + "epoch": 2.800571052053591, + "grad_norm": 2.0427093505859375, + "learning_rate": 5e-06, + "loss": 0.6532, + "mean_token_accuracy": 0.7819517254829407, + "num_tokens": 659937050.0, + "step": 25502 + }, + { + "epoch": 2.8006808697562047, + "grad_norm": 1.9141154289245605, + "learning_rate": 5e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7458541989326477, + "num_tokens": 659967115.0, + "step": 25503 + }, + { + "epoch": 2.8007906874588184, + "grad_norm": 2.088780403137207, + "learning_rate": 5e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7499967813491821, + "num_tokens": 659994573.0, + "step": 25504 + }, + { + "epoch": 2.800900505161432, + "grad_norm": 1.97043776512146, + "learning_rate": 5e-06, + "loss": 0.6743, + "mean_token_accuracy": 0.7832138538360596, + "num_tokens": 660019424.0, + "step": 25505 + }, + { + "epoch": 2.8010103228640455, + "grad_norm": 1.980161428451538, + "learning_rate": 5e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7724860906600952, + "num_tokens": 660046091.0, + "step": 25506 + }, + { + "epoch": 2.8011201405666593, + "grad_norm": 2.1930394172668457, + "learning_rate": 5e-06, + "loss": 0.7024, + "mean_token_accuracy": 0.7692065834999084, + "num_tokens": 660068566.0, + "step": 25507 + }, + { + "epoch": 2.801229958269273, + "grad_norm": 1.9749354124069214, + "learning_rate": 5e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7690034508705139, + "num_tokens": 660096307.0, + "step": 25508 + }, + { + "epoch": 2.8013397759718868, + "grad_norm": 2.082970142364502, + "learning_rate": 5e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7613742351531982, + "num_tokens": 660125231.0, + "step": 25509 + }, + { + "epoch": 2.8014495936745005, + "grad_norm": 2.282315969467163, + "learning_rate": 5e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7491783499717712, + "num_tokens": 660149103.0, + "step": 25510 + }, + { + "epoch": 2.801559411377114, + "grad_norm": 2.165038824081421, + "learning_rate": 5e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7370473742485046, + "num_tokens": 660176068.0, + "step": 25511 + }, + { + "epoch": 2.8016692290797276, + "grad_norm": 2.086944103240967, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7766112685203552, + "num_tokens": 660200238.0, + "step": 25512 + }, + { + "epoch": 2.8017790467823414, + "grad_norm": 2.0881216526031494, + "learning_rate": 5e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.7550598382949829, + "num_tokens": 660225825.0, + "step": 25513 + }, + { + "epoch": 2.801888864484955, + "grad_norm": 2.0453038215637207, + "learning_rate": 5e-06, + "loss": 0.6435, + "mean_token_accuracy": 0.7870837450027466, + "num_tokens": 660249765.0, + "step": 25514 + }, + { + "epoch": 2.801998682187569, + "grad_norm": 2.1273488998413086, + "learning_rate": 5e-06, + "loss": 0.7238, + "mean_token_accuracy": 0.7639369964599609, + "num_tokens": 660274247.0, + "step": 25515 + }, + { + "epoch": 2.802108499890182, + "grad_norm": 2.0156986713409424, + "learning_rate": 5e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7558643221855164, + "num_tokens": 660300548.0, + "step": 25516 + }, + { + "epoch": 2.802218317592796, + "grad_norm": 1.8455952405929565, + "learning_rate": 5e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7463122606277466, + "num_tokens": 660332967.0, + "step": 25517 + }, + { + "epoch": 2.8023281352954097, + "grad_norm": 2.2841734886169434, + "learning_rate": 5e-06, + "loss": 0.6827, + "mean_token_accuracy": 0.7699152827262878, + "num_tokens": 660353667.0, + "step": 25518 + }, + { + "epoch": 2.802437952998023, + "grad_norm": 2.1015405654907227, + "learning_rate": 5e-06, + "loss": 0.697, + "mean_token_accuracy": 0.7663183808326721, + "num_tokens": 660379513.0, + "step": 25519 + }, + { + "epoch": 2.802547770700637, + "grad_norm": 2.090773582458496, + "learning_rate": 5e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7790200710296631, + "num_tokens": 660404989.0, + "step": 25520 + }, + { + "epoch": 2.8026575884032505, + "grad_norm": 2.1259655952453613, + "learning_rate": 5e-06, + "loss": 0.6618, + "mean_token_accuracy": 0.7785003185272217, + "num_tokens": 660426280.0, + "step": 25521 + }, + { + "epoch": 2.8027674061058643, + "grad_norm": 2.122478723526001, + "learning_rate": 5e-06, + "loss": 0.7199, + "mean_token_accuracy": 0.7565921545028687, + "num_tokens": 660449976.0, + "step": 25522 + }, + { + "epoch": 2.802877223808478, + "grad_norm": 2.227541923522949, + "learning_rate": 5e-06, + "loss": 0.6208, + "mean_token_accuracy": 0.7918953895568848, + "num_tokens": 660470410.0, + "step": 25523 + }, + { + "epoch": 2.8029870415110913, + "grad_norm": 1.8505334854125977, + "learning_rate": 5e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7534741759300232, + "num_tokens": 660504378.0, + "step": 25524 + }, + { + "epoch": 2.803096859213705, + "grad_norm": 2.303537130355835, + "learning_rate": 5e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.7800449132919312, + "num_tokens": 660524898.0, + "step": 25525 + }, + { + "epoch": 2.803206676916319, + "grad_norm": 2.027170181274414, + "learning_rate": 5e-06, + "loss": 0.6562, + "mean_token_accuracy": 0.7798597812652588, + "num_tokens": 660552000.0, + "step": 25526 + }, + { + "epoch": 2.8033164946189326, + "grad_norm": 2.006387233734131, + "learning_rate": 5e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7543087005615234, + "num_tokens": 660579274.0, + "step": 25527 + }, + { + "epoch": 2.8034263123215464, + "grad_norm": 2.0815255641937256, + "learning_rate": 5e-06, + "loss": 0.7162, + "mean_token_accuracy": 0.7660606503486633, + "num_tokens": 660608768.0, + "step": 25528 + }, + { + "epoch": 2.8035361300241597, + "grad_norm": 2.100114345550537, + "learning_rate": 5e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7736560106277466, + "num_tokens": 660634158.0, + "step": 25529 + }, + { + "epoch": 2.8036459477267734, + "grad_norm": 2.0987274646759033, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7537364959716797, + "num_tokens": 660660727.0, + "step": 25530 + }, + { + "epoch": 2.803755765429387, + "grad_norm": 2.2658777236938477, + "learning_rate": 5e-06, + "loss": 0.6843, + "mean_token_accuracy": 0.7724992036819458, + "num_tokens": 660682726.0, + "step": 25531 + }, + { + "epoch": 2.803865583132001, + "grad_norm": 2.1794962882995605, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7676534652709961, + "num_tokens": 660708283.0, + "step": 25532 + }, + { + "epoch": 2.8039754008346147, + "grad_norm": 2.061716079711914, + "learning_rate": 5e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7409271597862244, + "num_tokens": 660734383.0, + "step": 25533 + }, + { + "epoch": 2.804085218537228, + "grad_norm": 1.8469743728637695, + "learning_rate": 5e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7589597702026367, + "num_tokens": 660767412.0, + "step": 25534 + }, + { + "epoch": 2.804195036239842, + "grad_norm": 2.1609690189361572, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7729804515838623, + "num_tokens": 660789286.0, + "step": 25535 + }, + { + "epoch": 2.8043048539424555, + "grad_norm": 1.8765238523483276, + "learning_rate": 5e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7681055665016174, + "num_tokens": 660818208.0, + "step": 25536 + }, + { + "epoch": 2.8044146716450693, + "grad_norm": 1.888043761253357, + "learning_rate": 5e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7602513432502747, + "num_tokens": 660845674.0, + "step": 25537 + }, + { + "epoch": 2.804524489347683, + "grad_norm": 1.9790891408920288, + "learning_rate": 5e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7483950257301331, + "num_tokens": 660874810.0, + "step": 25538 + }, + { + "epoch": 2.8046343070502964, + "grad_norm": 2.0290005207061768, + "learning_rate": 5e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7745699286460876, + "num_tokens": 660902302.0, + "step": 25539 + }, + { + "epoch": 2.80474412475291, + "grad_norm": 2.059657335281372, + "learning_rate": 5e-06, + "loss": 0.6317, + "mean_token_accuracy": 0.78815096616745, + "num_tokens": 660927542.0, + "step": 25540 + }, + { + "epoch": 2.804853942455524, + "grad_norm": 2.0344841480255127, + "learning_rate": 5e-06, + "loss": 0.666, + "mean_token_accuracy": 0.7834028601646423, + "num_tokens": 660952245.0, + "step": 25541 + }, + { + "epoch": 2.8049637601581376, + "grad_norm": 1.9318779706954956, + "learning_rate": 5e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7582453489303589, + "num_tokens": 660982583.0, + "step": 25542 + }, + { + "epoch": 2.8050735778607514, + "grad_norm": 2.5470826625823975, + "learning_rate": 5e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7694852948188782, + "num_tokens": 661001923.0, + "step": 25543 + }, + { + "epoch": 2.8051833955633647, + "grad_norm": 2.186223268508911, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7534447908401489, + "num_tokens": 661025592.0, + "step": 25544 + }, + { + "epoch": 2.8052932132659785, + "grad_norm": 2.3262624740600586, + "learning_rate": 5e-06, + "loss": 0.6779, + "mean_token_accuracy": 0.7755665183067322, + "num_tokens": 661046838.0, + "step": 25545 + }, + { + "epoch": 2.8054030309685922, + "grad_norm": 1.7511630058288574, + "learning_rate": 5e-06, + "loss": 0.7126, + "mean_token_accuracy": 0.7588066458702087, + "num_tokens": 661079888.0, + "step": 25546 + }, + { + "epoch": 2.8055128486712055, + "grad_norm": 2.0107505321502686, + "learning_rate": 5e-06, + "loss": 0.768, + "mean_token_accuracy": 0.7521642446517944, + "num_tokens": 661109883.0, + "step": 25547 + }, + { + "epoch": 2.8056226663738193, + "grad_norm": 2.231440544128418, + "learning_rate": 5e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7707858085632324, + "num_tokens": 661131921.0, + "step": 25548 + }, + { + "epoch": 2.805732484076433, + "grad_norm": 2.3085453510284424, + "learning_rate": 5e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.764548659324646, + "num_tokens": 661155122.0, + "step": 25549 + }, + { + "epoch": 2.805842301779047, + "grad_norm": 2.4806458950042725, + "learning_rate": 5e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.778361439704895, + "num_tokens": 661174664.0, + "step": 25550 + }, + { + "epoch": 2.8059521194816606, + "grad_norm": 1.9719575643539429, + "learning_rate": 5e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7642996311187744, + "num_tokens": 661200170.0, + "step": 25551 + }, + { + "epoch": 2.806061937184274, + "grad_norm": 2.1627845764160156, + "learning_rate": 5e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7569364309310913, + "num_tokens": 661223550.0, + "step": 25552 + }, + { + "epoch": 2.8061717548868876, + "grad_norm": 2.2635438442230225, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.784432590007782, + "num_tokens": 661244768.0, + "step": 25553 + }, + { + "epoch": 2.8062815725895014, + "grad_norm": 2.55928635597229, + "learning_rate": 5e-06, + "loss": 0.6677, + "mean_token_accuracy": 0.7810978889465332, + "num_tokens": 661265150.0, + "step": 25554 + }, + { + "epoch": 2.806391390292115, + "grad_norm": 1.9390963315963745, + "learning_rate": 5e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7536771297454834, + "num_tokens": 661293883.0, + "step": 25555 + }, + { + "epoch": 2.806501207994729, + "grad_norm": 1.8596274852752686, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.7522919178009033, + "num_tokens": 661325904.0, + "step": 25556 + }, + { + "epoch": 2.806611025697342, + "grad_norm": 1.8463789224624634, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7699596285820007, + "num_tokens": 661356432.0, + "step": 25557 + }, + { + "epoch": 2.806720843399956, + "grad_norm": 2.4884228706359863, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7831382155418396, + "num_tokens": 661374530.0, + "step": 25558 + }, + { + "epoch": 2.8068306611025697, + "grad_norm": 2.3134305477142334, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7724090218544006, + "num_tokens": 661395650.0, + "step": 25559 + }, + { + "epoch": 2.8069404788051835, + "grad_norm": 2.1764423847198486, + "learning_rate": 5e-06, + "loss": 0.6571, + "mean_token_accuracy": 0.7778264284133911, + "num_tokens": 661417835.0, + "step": 25560 + }, + { + "epoch": 2.8070502965077972, + "grad_norm": 2.065540313720703, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7562764883041382, + "num_tokens": 661442852.0, + "step": 25561 + }, + { + "epoch": 2.8071601142104106, + "grad_norm": 2.1379597187042236, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7533935308456421, + "num_tokens": 661467784.0, + "step": 25562 + }, + { + "epoch": 2.8072699319130243, + "grad_norm": 2.359856367111206, + "learning_rate": 5e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.7867525815963745, + "num_tokens": 661489339.0, + "step": 25563 + }, + { + "epoch": 2.807379749615638, + "grad_norm": 1.9670679569244385, + "learning_rate": 5e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7233985066413879, + "num_tokens": 661522024.0, + "step": 25564 + }, + { + "epoch": 2.807489567318252, + "grad_norm": 2.2859623432159424, + "learning_rate": 5e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7775826454162598, + "num_tokens": 661541935.0, + "step": 25565 + }, + { + "epoch": 2.8075993850208656, + "grad_norm": 1.9538369178771973, + "learning_rate": 5e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7500102519989014, + "num_tokens": 661571972.0, + "step": 25566 + }, + { + "epoch": 2.807709202723479, + "grad_norm": 1.9682059288024902, + "learning_rate": 5e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.775348424911499, + "num_tokens": 661599031.0, + "step": 25567 + }, + { + "epoch": 2.8078190204260927, + "grad_norm": 2.109372854232788, + "learning_rate": 5e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.7757581472396851, + "num_tokens": 661620930.0, + "step": 25568 + }, + { + "epoch": 2.8079288381287064, + "grad_norm": 2.037199020385742, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7606372237205505, + "num_tokens": 661647004.0, + "step": 25569 + }, + { + "epoch": 2.80803865583132, + "grad_norm": 2.065716028213501, + "learning_rate": 5e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.772733211517334, + "num_tokens": 661672057.0, + "step": 25570 + }, + { + "epoch": 2.808148473533934, + "grad_norm": 2.0396640300750732, + "learning_rate": 5e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7538626194000244, + "num_tokens": 661700340.0, + "step": 25571 + }, + { + "epoch": 2.8082582912365472, + "grad_norm": 2.1676814556121826, + "learning_rate": 5e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.756505012512207, + "num_tokens": 661727929.0, + "step": 25572 + }, + { + "epoch": 2.808368108939161, + "grad_norm": 2.2883141040802, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7752183675765991, + "num_tokens": 661747655.0, + "step": 25573 + }, + { + "epoch": 2.8084779266417748, + "grad_norm": 1.8541679382324219, + "learning_rate": 5e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.7644867897033691, + "num_tokens": 661776813.0, + "step": 25574 + }, + { + "epoch": 2.808587744344388, + "grad_norm": 2.2108511924743652, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7700912356376648, + "num_tokens": 661798625.0, + "step": 25575 + }, + { + "epoch": 2.808697562047002, + "grad_norm": 2.1911981105804443, + "learning_rate": 5e-06, + "loss": 0.7105, + "mean_token_accuracy": 0.7791959047317505, + "num_tokens": 661823428.0, + "step": 25576 + }, + { + "epoch": 2.8088073797496156, + "grad_norm": 1.972912073135376, + "learning_rate": 5e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7420511245727539, + "num_tokens": 661852316.0, + "step": 25577 + }, + { + "epoch": 2.8089171974522293, + "grad_norm": 2.1570966243743896, + "learning_rate": 5e-06, + "loss": 0.7487, + "mean_token_accuracy": 0.7535823583602905, + "num_tokens": 661875995.0, + "step": 25578 + }, + { + "epoch": 2.809027015154843, + "grad_norm": 2.1338951587677, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7674230337142944, + "num_tokens": 661900950.0, + "step": 25579 + }, + { + "epoch": 2.8091368328574564, + "grad_norm": 1.9842451810836792, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7339372634887695, + "num_tokens": 661928717.0, + "step": 25580 + }, + { + "epoch": 2.80924665056007, + "grad_norm": 2.121140480041504, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7556506991386414, + "num_tokens": 661952813.0, + "step": 25581 + }, + { + "epoch": 2.809356468262684, + "grad_norm": 2.0982680320739746, + "learning_rate": 5e-06, + "loss": 0.6829, + "mean_token_accuracy": 0.7732575535774231, + "num_tokens": 661976222.0, + "step": 25582 + }, + { + "epoch": 2.8094662859652977, + "grad_norm": 1.9750738143920898, + "learning_rate": 5e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7581468224525452, + "num_tokens": 662007739.0, + "step": 25583 + }, + { + "epoch": 2.8095761036679114, + "grad_norm": 1.9210083484649658, + "learning_rate": 5e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7395263910293579, + "num_tokens": 662036578.0, + "step": 25584 + }, + { + "epoch": 2.8096859213705248, + "grad_norm": 1.9637304544448853, + "learning_rate": 5e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7454230189323425, + "num_tokens": 662065511.0, + "step": 25585 + }, + { + "epoch": 2.8097957390731385, + "grad_norm": 2.5519702434539795, + "learning_rate": 5e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7631199359893799, + "num_tokens": 662084192.0, + "step": 25586 + }, + { + "epoch": 2.8099055567757523, + "grad_norm": 2.098792314529419, + "learning_rate": 5e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.7925124168395996, + "num_tokens": 662106062.0, + "step": 25587 + }, + { + "epoch": 2.810015374478366, + "grad_norm": 1.8416554927825928, + "learning_rate": 5e-06, + "loss": 0.7641, + "mean_token_accuracy": 0.7475537061691284, + "num_tokens": 662139300.0, + "step": 25588 + }, + { + "epoch": 2.81012519218098, + "grad_norm": 2.2220234870910645, + "learning_rate": 5e-06, + "loss": 0.8067, + "mean_token_accuracy": 0.7399448752403259, + "num_tokens": 662162166.0, + "step": 25589 + }, + { + "epoch": 2.810235009883593, + "grad_norm": 2.151961326599121, + "learning_rate": 5e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7527238130569458, + "num_tokens": 662187351.0, + "step": 25590 + }, + { + "epoch": 2.810344827586207, + "grad_norm": 2.3477485179901123, + "learning_rate": 5e-06, + "loss": 0.6114, + "mean_token_accuracy": 0.7928631901741028, + "num_tokens": 662206004.0, + "step": 25591 + }, + { + "epoch": 2.8104546452888206, + "grad_norm": 1.9925904273986816, + "learning_rate": 5e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7562398910522461, + "num_tokens": 662233929.0, + "step": 25592 + }, + { + "epoch": 2.8105644629914344, + "grad_norm": 2.0355143547058105, + "learning_rate": 5e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7657656669616699, + "num_tokens": 662258555.0, + "step": 25593 + }, + { + "epoch": 2.810674280694048, + "grad_norm": 1.9873522520065308, + "learning_rate": 5e-06, + "loss": 0.7598, + "mean_token_accuracy": 0.7534642219543457, + "num_tokens": 662285229.0, + "step": 25594 + }, + { + "epoch": 2.8107840983966614, + "grad_norm": 2.4187567234039307, + "learning_rate": 5e-06, + "loss": 0.6612, + "mean_token_accuracy": 0.7822141051292419, + "num_tokens": 662304715.0, + "step": 25595 + }, + { + "epoch": 2.810893916099275, + "grad_norm": 2.2340962886810303, + "learning_rate": 5e-06, + "loss": 0.657, + "mean_token_accuracy": 0.7773615717887878, + "num_tokens": 662325389.0, + "step": 25596 + }, + { + "epoch": 2.811003733801889, + "grad_norm": 2.025702476501465, + "learning_rate": 5e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7688953876495361, + "num_tokens": 662350054.0, + "step": 25597 + }, + { + "epoch": 2.8111135515045023, + "grad_norm": 2.091792583465576, + "learning_rate": 5e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7753212451934814, + "num_tokens": 662374422.0, + "step": 25598 + }, + { + "epoch": 2.8112233692071165, + "grad_norm": 2.4686925411224365, + "learning_rate": 5e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.7598915100097656, + "num_tokens": 662395791.0, + "step": 25599 + }, + { + "epoch": 2.8113331869097298, + "grad_norm": 2.030944585800171, + "learning_rate": 5e-06, + "loss": 0.6791, + "mean_token_accuracy": 0.7832356691360474, + "num_tokens": 662422502.0, + "step": 25600 + }, + { + "epoch": 2.8114430046123435, + "grad_norm": 2.0404903888702393, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7568265199661255, + "num_tokens": 662449367.0, + "step": 25601 + }, + { + "epoch": 2.8115528223149573, + "grad_norm": 2.094510078430176, + "learning_rate": 5e-06, + "loss": 0.743, + "mean_token_accuracy": 0.7653201818466187, + "num_tokens": 662474476.0, + "step": 25602 + }, + { + "epoch": 2.8116626400175706, + "grad_norm": 2.02226185798645, + "learning_rate": 5e-06, + "loss": 0.6957, + "mean_token_accuracy": 0.7682698965072632, + "num_tokens": 662500659.0, + "step": 25603 + }, + { + "epoch": 2.8117724577201844, + "grad_norm": 2.0908286571502686, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.769026517868042, + "num_tokens": 662526057.0, + "step": 25604 + }, + { + "epoch": 2.811882275422798, + "grad_norm": 2.2860066890716553, + "learning_rate": 5e-06, + "loss": 0.5725, + "mean_token_accuracy": 0.8027433753013611, + "num_tokens": 662543507.0, + "step": 25605 + }, + { + "epoch": 2.811992093125412, + "grad_norm": 1.9695889949798584, + "learning_rate": 5e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7626299262046814, + "num_tokens": 662571110.0, + "step": 25606 + }, + { + "epoch": 2.8121019108280256, + "grad_norm": 2.5896379947662354, + "learning_rate": 5e-06, + "loss": 0.6139, + "mean_token_accuracy": 0.7940415143966675, + "num_tokens": 662586845.0, + "step": 25607 + }, + { + "epoch": 2.812211728530639, + "grad_norm": 2.259073257446289, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7681064605712891, + "num_tokens": 662608523.0, + "step": 25608 + }, + { + "epoch": 2.8123215462332527, + "grad_norm": 2.4237380027770996, + "learning_rate": 5e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7493411302566528, + "num_tokens": 662629158.0, + "step": 25609 + }, + { + "epoch": 2.8124313639358665, + "grad_norm": 1.9516352415084839, + "learning_rate": 5e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.759694516658783, + "num_tokens": 662656276.0, + "step": 25610 + }, + { + "epoch": 2.81254118163848, + "grad_norm": 2.3094589710235596, + "learning_rate": 5e-06, + "loss": 0.6613, + "mean_token_accuracy": 0.7863045930862427, + "num_tokens": 662676474.0, + "step": 25611 + }, + { + "epoch": 2.812650999341094, + "grad_norm": 2.0808048248291016, + "learning_rate": 5e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7427595257759094, + "num_tokens": 662701609.0, + "step": 25612 + }, + { + "epoch": 2.8127608170437073, + "grad_norm": 2.4818248748779297, + "learning_rate": 5e-06, + "loss": 0.6865, + "mean_token_accuracy": 0.7745022773742676, + "num_tokens": 662722330.0, + "step": 25613 + }, + { + "epoch": 2.812870634746321, + "grad_norm": 2.3298511505126953, + "learning_rate": 5e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7614405155181885, + "num_tokens": 662744008.0, + "step": 25614 + }, + { + "epoch": 2.812980452448935, + "grad_norm": 2.109529972076416, + "learning_rate": 5e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7536543607711792, + "num_tokens": 662769986.0, + "step": 25615 + }, + { + "epoch": 2.8130902701515486, + "grad_norm": 2.2711129188537598, + "learning_rate": 5e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7516072988510132, + "num_tokens": 662794823.0, + "step": 25616 + }, + { + "epoch": 2.8132000878541623, + "grad_norm": 2.169482469558716, + "learning_rate": 5e-06, + "loss": 0.7085, + "mean_token_accuracy": 0.7625784277915955, + "num_tokens": 662818213.0, + "step": 25617 + }, + { + "epoch": 2.8133099055567756, + "grad_norm": 2.174414873123169, + "learning_rate": 5e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7406173348426819, + "num_tokens": 662842275.0, + "step": 25618 + }, + { + "epoch": 2.8134197232593894, + "grad_norm": 2.106163501739502, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7635660171508789, + "num_tokens": 662865548.0, + "step": 25619 + }, + { + "epoch": 2.813529540962003, + "grad_norm": 2.0958757400512695, + "learning_rate": 5e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.7963463068008423, + "num_tokens": 662887565.0, + "step": 25620 + }, + { + "epoch": 2.813639358664617, + "grad_norm": 2.371269464492798, + "learning_rate": 5e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7511017918586731, + "num_tokens": 662907207.0, + "step": 25621 + }, + { + "epoch": 2.8137491763672307, + "grad_norm": 1.9557774066925049, + "learning_rate": 5e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7608217597007751, + "num_tokens": 662935427.0, + "step": 25622 + }, + { + "epoch": 2.813858994069844, + "grad_norm": 2.194737434387207, + "learning_rate": 5e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.7618353366851807, + "num_tokens": 662959225.0, + "step": 25623 + }, + { + "epoch": 2.8139688117724577, + "grad_norm": 2.0292556285858154, + "learning_rate": 5e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7507466077804565, + "num_tokens": 662985781.0, + "step": 25624 + }, + { + "epoch": 2.8140786294750715, + "grad_norm": 2.096043586730957, + "learning_rate": 5e-06, + "loss": 0.6826, + "mean_token_accuracy": 0.7811152935028076, + "num_tokens": 663007852.0, + "step": 25625 + }, + { + "epoch": 2.814188447177685, + "grad_norm": 2.268413782119751, + "learning_rate": 5e-06, + "loss": 0.6219, + "mean_token_accuracy": 0.790641188621521, + "num_tokens": 663027573.0, + "step": 25626 + }, + { + "epoch": 2.8142982648802986, + "grad_norm": 1.8316015005111694, + "learning_rate": 5e-06, + "loss": 0.7886, + "mean_token_accuracy": 0.7516837120056152, + "num_tokens": 663060142.0, + "step": 25627 + }, + { + "epoch": 2.8144080825829123, + "grad_norm": 2.249647855758667, + "learning_rate": 5e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.766002893447876, + "num_tokens": 663083743.0, + "step": 25628 + }, + { + "epoch": 2.814517900285526, + "grad_norm": 1.9479936361312866, + "learning_rate": 5e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7569915056228638, + "num_tokens": 663111234.0, + "step": 25629 + }, + { + "epoch": 2.81462771798814, + "grad_norm": 2.181023120880127, + "learning_rate": 5e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7624579071998596, + "num_tokens": 663134169.0, + "step": 25630 + }, + { + "epoch": 2.814737535690753, + "grad_norm": 1.941640853881836, + "learning_rate": 5e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.7410946488380432, + "num_tokens": 663163263.0, + "step": 25631 + }, + { + "epoch": 2.814847353393367, + "grad_norm": 1.9325625896453857, + "learning_rate": 5e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7418141961097717, + "num_tokens": 663193706.0, + "step": 25632 + }, + { + "epoch": 2.8149571710959806, + "grad_norm": 2.2257821559906006, + "learning_rate": 5e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.7799800634384155, + "num_tokens": 663216516.0, + "step": 25633 + }, + { + "epoch": 2.8150669887985944, + "grad_norm": 2.070919990539551, + "learning_rate": 5e-06, + "loss": 0.6754, + "mean_token_accuracy": 0.7776429057121277, + "num_tokens": 663240707.0, + "step": 25634 + }, + { + "epoch": 2.815176806501208, + "grad_norm": 1.9452284574508667, + "learning_rate": 5e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7556215524673462, + "num_tokens": 663270795.0, + "step": 25635 + }, + { + "epoch": 2.8152866242038215, + "grad_norm": 2.2866082191467285, + "learning_rate": 5e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.7796605229377747, + "num_tokens": 663290769.0, + "step": 25636 + }, + { + "epoch": 2.8153964419064352, + "grad_norm": 2.164970874786377, + "learning_rate": 5e-06, + "loss": 0.7431, + "mean_token_accuracy": 0.7621526122093201, + "num_tokens": 663313708.0, + "step": 25637 + }, + { + "epoch": 2.815506259609049, + "grad_norm": 1.8787020444869995, + "learning_rate": 5e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7355191707611084, + "num_tokens": 663342226.0, + "step": 25638 + }, + { + "epoch": 2.8156160773116627, + "grad_norm": 2.3090274333953857, + "learning_rate": 5e-06, + "loss": 0.6284, + "mean_token_accuracy": 0.7838320136070251, + "num_tokens": 663360855.0, + "step": 25639 + }, + { + "epoch": 2.8157258950142765, + "grad_norm": 2.0603272914886475, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7386482954025269, + "num_tokens": 663388316.0, + "step": 25640 + }, + { + "epoch": 2.81583571271689, + "grad_norm": 1.9177180528640747, + "learning_rate": 5e-06, + "loss": 0.695, + "mean_token_accuracy": 0.7737648487091064, + "num_tokens": 663419493.0, + "step": 25641 + }, + { + "epoch": 2.8159455304195036, + "grad_norm": 1.9528179168701172, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7467198967933655, + "num_tokens": 663450222.0, + "step": 25642 + }, + { + "epoch": 2.8160553481221173, + "grad_norm": 2.12709903717041, + "learning_rate": 5e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7505294680595398, + "num_tokens": 663475391.0, + "step": 25643 + }, + { + "epoch": 2.816165165824731, + "grad_norm": 2.1894118785858154, + "learning_rate": 5e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7630385756492615, + "num_tokens": 663498723.0, + "step": 25644 + }, + { + "epoch": 2.816274983527345, + "grad_norm": 2.112154483795166, + "learning_rate": 5e-06, + "loss": 0.6922, + "mean_token_accuracy": 0.7775607109069824, + "num_tokens": 663520851.0, + "step": 25645 + }, + { + "epoch": 2.816384801229958, + "grad_norm": 2.402949810028076, + "learning_rate": 5e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7696794271469116, + "num_tokens": 663540829.0, + "step": 25646 + }, + { + "epoch": 2.816494618932572, + "grad_norm": 1.9414135217666626, + "learning_rate": 5e-06, + "loss": 0.6406, + "mean_token_accuracy": 0.7841848134994507, + "num_tokens": 663568412.0, + "step": 25647 + }, + { + "epoch": 2.8166044366351857, + "grad_norm": 2.143278121948242, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7532578706741333, + "num_tokens": 663592627.0, + "step": 25648 + }, + { + "epoch": 2.816714254337799, + "grad_norm": 1.9249396324157715, + "learning_rate": 5e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7486749291419983, + "num_tokens": 663620714.0, + "step": 25649 + }, + { + "epoch": 2.816824072040413, + "grad_norm": 2.0382277965545654, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7431102395057678, + "num_tokens": 663651343.0, + "step": 25650 + }, + { + "epoch": 2.8169338897430265, + "grad_norm": 1.7427561283111572, + "learning_rate": 5e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7484517693519592, + "num_tokens": 663684407.0, + "step": 25651 + }, + { + "epoch": 2.8170437074456403, + "grad_norm": 1.8801190853118896, + "learning_rate": 5e-06, + "loss": 0.6309, + "mean_token_accuracy": 0.788632869720459, + "num_tokens": 663711510.0, + "step": 25652 + }, + { + "epoch": 2.817153525148254, + "grad_norm": 2.03055739402771, + "learning_rate": 5e-06, + "loss": 0.6456, + "mean_token_accuracy": 0.7798156142234802, + "num_tokens": 663735933.0, + "step": 25653 + }, + { + "epoch": 2.8172633428508673, + "grad_norm": 1.9415825605392456, + "learning_rate": 5e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7539910078048706, + "num_tokens": 663765861.0, + "step": 25654 + }, + { + "epoch": 2.817373160553481, + "grad_norm": 2.2236979007720947, + "learning_rate": 5e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.7780604958534241, + "num_tokens": 663788703.0, + "step": 25655 + }, + { + "epoch": 2.817482978256095, + "grad_norm": 2.216054677963257, + "learning_rate": 5e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.7807782292366028, + "num_tokens": 663812421.0, + "step": 25656 + }, + { + "epoch": 2.8175927959587086, + "grad_norm": 1.978274941444397, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7688801884651184, + "num_tokens": 663840264.0, + "step": 25657 + }, + { + "epoch": 2.8177026136613224, + "grad_norm": 1.8608741760253906, + "learning_rate": 5e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7354235649108887, + "num_tokens": 663871677.0, + "step": 25658 + }, + { + "epoch": 2.8178124313639357, + "grad_norm": 1.993251085281372, + "learning_rate": 5e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7482407093048096, + "num_tokens": 663900066.0, + "step": 25659 + }, + { + "epoch": 2.8179222490665494, + "grad_norm": 1.8892202377319336, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7566731572151184, + "num_tokens": 663932305.0, + "step": 25660 + }, + { + "epoch": 2.818032066769163, + "grad_norm": 1.9972413778305054, + "learning_rate": 5e-06, + "loss": 0.7121, + "mean_token_accuracy": 0.7676472067832947, + "num_tokens": 663958319.0, + "step": 25661 + }, + { + "epoch": 2.818141884471777, + "grad_norm": 1.931288480758667, + "learning_rate": 5e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.741113007068634, + "num_tokens": 663988630.0, + "step": 25662 + }, + { + "epoch": 2.8182517021743907, + "grad_norm": 2.070089817047119, + "learning_rate": 5e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7601155042648315, + "num_tokens": 664014771.0, + "step": 25663 + }, + { + "epoch": 2.818361519877004, + "grad_norm": 2.1218209266662598, + "learning_rate": 5e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7437267303466797, + "num_tokens": 664040338.0, + "step": 25664 + }, + { + "epoch": 2.8184713375796178, + "grad_norm": 1.9807478189468384, + "learning_rate": 5e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7671186327934265, + "num_tokens": 664068397.0, + "step": 25665 + }, + { + "epoch": 2.8185811552822315, + "grad_norm": 2.0905942916870117, + "learning_rate": 5e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7608444690704346, + "num_tokens": 664091872.0, + "step": 25666 + }, + { + "epoch": 2.8186909729848453, + "grad_norm": 1.9517534971237183, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7715418338775635, + "num_tokens": 664116306.0, + "step": 25667 + }, + { + "epoch": 2.818800790687459, + "grad_norm": 1.9702953100204468, + "learning_rate": 5e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7465251684188843, + "num_tokens": 664143395.0, + "step": 25668 + }, + { + "epoch": 2.8189106083900723, + "grad_norm": 2.149449348449707, + "learning_rate": 5e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7442896366119385, + "num_tokens": 664169747.0, + "step": 25669 + }, + { + "epoch": 2.819020426092686, + "grad_norm": 2.141956329345703, + "learning_rate": 5e-06, + "loss": 0.7374, + "mean_token_accuracy": 0.7573087215423584, + "num_tokens": 664196631.0, + "step": 25670 + }, + { + "epoch": 2.8191302437953, + "grad_norm": 1.9936914443969727, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7619403600692749, + "num_tokens": 664224832.0, + "step": 25671 + }, + { + "epoch": 2.8192400614979136, + "grad_norm": 1.9673444032669067, + "learning_rate": 5e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7338330745697021, + "num_tokens": 664256566.0, + "step": 25672 + }, + { + "epoch": 2.8193498792005274, + "grad_norm": 2.0232627391815186, + "learning_rate": 5e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7638306617736816, + "num_tokens": 664282235.0, + "step": 25673 + }, + { + "epoch": 2.8194596969031407, + "grad_norm": 1.8912415504455566, + "learning_rate": 5e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7453451752662659, + "num_tokens": 664313588.0, + "step": 25674 + }, + { + "epoch": 2.8195695146057544, + "grad_norm": 1.9897398948669434, + "learning_rate": 5e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7708431482315063, + "num_tokens": 664339649.0, + "step": 25675 + }, + { + "epoch": 2.819679332308368, + "grad_norm": 2.0404140949249268, + "learning_rate": 5e-06, + "loss": 0.7576, + "mean_token_accuracy": 0.746837317943573, + "num_tokens": 664365348.0, + "step": 25676 + }, + { + "epoch": 2.8197891500109815, + "grad_norm": 2.0273549556732178, + "learning_rate": 5e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7506121397018433, + "num_tokens": 664390544.0, + "step": 25677 + }, + { + "epoch": 2.8198989677135953, + "grad_norm": 1.9469887018203735, + "learning_rate": 5e-06, + "loss": 0.762, + "mean_token_accuracy": 0.7554591298103333, + "num_tokens": 664422073.0, + "step": 25678 + }, + { + "epoch": 2.820008785416209, + "grad_norm": 1.9144200086593628, + "learning_rate": 5e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7441025376319885, + "num_tokens": 664452994.0, + "step": 25679 + }, + { + "epoch": 2.820118603118823, + "grad_norm": 1.9727643728256226, + "learning_rate": 5e-06, + "loss": 0.708, + "mean_token_accuracy": 0.763969898223877, + "num_tokens": 664478777.0, + "step": 25680 + }, + { + "epoch": 2.8202284208214365, + "grad_norm": 2.065474271774292, + "learning_rate": 5e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7473055720329285, + "num_tokens": 664502181.0, + "step": 25681 + }, + { + "epoch": 2.82033823852405, + "grad_norm": 2.002406358718872, + "learning_rate": 5e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.770510733127594, + "num_tokens": 664529977.0, + "step": 25682 + }, + { + "epoch": 2.8204480562266636, + "grad_norm": 2.098687171936035, + "learning_rate": 5e-06, + "loss": 0.6472, + "mean_token_accuracy": 0.7811542749404907, + "num_tokens": 664553072.0, + "step": 25683 + }, + { + "epoch": 2.8205578739292774, + "grad_norm": 2.1026389598846436, + "learning_rate": 5e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.763626217842102, + "num_tokens": 664576729.0, + "step": 25684 + }, + { + "epoch": 2.820667691631891, + "grad_norm": 1.782619595527649, + "learning_rate": 5e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7345536351203918, + "num_tokens": 664611415.0, + "step": 25685 + }, + { + "epoch": 2.820777509334505, + "grad_norm": 2.308028221130371, + "learning_rate": 5e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.7937870621681213, + "num_tokens": 664632329.0, + "step": 25686 + }, + { + "epoch": 2.820887327037118, + "grad_norm": 2.0382332801818848, + "learning_rate": 5e-06, + "loss": 0.6495, + "mean_token_accuracy": 0.782145619392395, + "num_tokens": 664656517.0, + "step": 25687 + }, + { + "epoch": 2.820997144739732, + "grad_norm": 2.0274789333343506, + "learning_rate": 5e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7447632551193237, + "num_tokens": 664682487.0, + "step": 25688 + }, + { + "epoch": 2.8211069624423457, + "grad_norm": 2.2322750091552734, + "learning_rate": 5e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7397651076316833, + "num_tokens": 664707022.0, + "step": 25689 + }, + { + "epoch": 2.8212167801449595, + "grad_norm": 2.226494550704956, + "learning_rate": 5e-06, + "loss": 0.6071, + "mean_token_accuracy": 0.7939578294754028, + "num_tokens": 664727594.0, + "step": 25690 + }, + { + "epoch": 2.8213265978475732, + "grad_norm": 2.143995761871338, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7742838859558105, + "num_tokens": 664749679.0, + "step": 25691 + }, + { + "epoch": 2.8214364155501865, + "grad_norm": 1.8166557550430298, + "learning_rate": 5e-06, + "loss": 0.7033, + "mean_token_accuracy": 0.7709648609161377, + "num_tokens": 664781783.0, + "step": 25692 + }, + { + "epoch": 2.8215462332528003, + "grad_norm": 1.9302302598953247, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.750504195690155, + "num_tokens": 664812708.0, + "step": 25693 + }, + { + "epoch": 2.821656050955414, + "grad_norm": 2.1824238300323486, + "learning_rate": 5e-06, + "loss": 0.6606, + "mean_token_accuracy": 0.7758619785308838, + "num_tokens": 664835895.0, + "step": 25694 + }, + { + "epoch": 2.821765868658028, + "grad_norm": 2.246612548828125, + "learning_rate": 5e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7603899240493774, + "num_tokens": 664858120.0, + "step": 25695 + }, + { + "epoch": 2.8218756863606416, + "grad_norm": 1.9740103483200073, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.763011634349823, + "num_tokens": 664885542.0, + "step": 25696 + }, + { + "epoch": 2.821985504063255, + "grad_norm": 2.275831699371338, + "learning_rate": 5e-06, + "loss": 0.6941, + "mean_token_accuracy": 0.7803512811660767, + "num_tokens": 664906481.0, + "step": 25697 + }, + { + "epoch": 2.8220953217658686, + "grad_norm": 1.8251965045928955, + "learning_rate": 5e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7403508424758911, + "num_tokens": 664940275.0, + "step": 25698 + }, + { + "epoch": 2.8222051394684824, + "grad_norm": 2.057866096496582, + "learning_rate": 5e-06, + "loss": 0.6902, + "mean_token_accuracy": 0.7713852524757385, + "num_tokens": 664965699.0, + "step": 25699 + }, + { + "epoch": 2.8223149571710957, + "grad_norm": 2.03645920753479, + "learning_rate": 5e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7523688077926636, + "num_tokens": 664991135.0, + "step": 25700 + }, + { + "epoch": 2.82242477487371, + "grad_norm": 2.2389960289001465, + "learning_rate": 5e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.7718967795372009, + "num_tokens": 665012761.0, + "step": 25701 + }, + { + "epoch": 2.822534592576323, + "grad_norm": 2.0411179065704346, + "learning_rate": 5e-06, + "loss": 0.652, + "mean_token_accuracy": 0.7821911573410034, + "num_tokens": 665036079.0, + "step": 25702 + }, + { + "epoch": 2.822644410278937, + "grad_norm": 2.3908071517944336, + "learning_rate": 5e-06, + "loss": 0.607, + "mean_token_accuracy": 0.8010414838790894, + "num_tokens": 665056209.0, + "step": 25703 + }, + { + "epoch": 2.8227542279815507, + "grad_norm": 2.0425491333007812, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7523265480995178, + "num_tokens": 665086669.0, + "step": 25704 + }, + { + "epoch": 2.822864045684164, + "grad_norm": 2.3938703536987305, + "learning_rate": 5e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7734742164611816, + "num_tokens": 665106739.0, + "step": 25705 + }, + { + "epoch": 2.822973863386778, + "grad_norm": 2.324073553085327, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7745232582092285, + "num_tokens": 665128012.0, + "step": 25706 + }, + { + "epoch": 2.8230836810893916, + "grad_norm": 2.2636871337890625, + "learning_rate": 5e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7459350228309631, + "num_tokens": 665153150.0, + "step": 25707 + }, + { + "epoch": 2.8231934987920053, + "grad_norm": 2.0742406845092773, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7623155117034912, + "num_tokens": 665179337.0, + "step": 25708 + }, + { + "epoch": 2.823303316494619, + "grad_norm": 2.332041025161743, + "learning_rate": 5e-06, + "loss": 0.6874, + "mean_token_accuracy": 0.7718942761421204, + "num_tokens": 665202128.0, + "step": 25709 + }, + { + "epoch": 2.8234131341972324, + "grad_norm": 2.2943215370178223, + "learning_rate": 5e-06, + "loss": 0.699, + "mean_token_accuracy": 0.7728083729743958, + "num_tokens": 665224208.0, + "step": 25710 + }, + { + "epoch": 2.823522951899846, + "grad_norm": 2.0191640853881836, + "learning_rate": 5e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7447776794433594, + "num_tokens": 665253975.0, + "step": 25711 + }, + { + "epoch": 2.82363276960246, + "grad_norm": 2.042832851409912, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7506721019744873, + "num_tokens": 665281388.0, + "step": 25712 + }, + { + "epoch": 2.8237425873050737, + "grad_norm": 2.073359251022339, + "learning_rate": 5e-06, + "loss": 0.701, + "mean_token_accuracy": 0.7668342590332031, + "num_tokens": 665307674.0, + "step": 25713 + }, + { + "epoch": 2.8238524050076874, + "grad_norm": 2.008894443511963, + "learning_rate": 5e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7401769161224365, + "num_tokens": 665335356.0, + "step": 25714 + }, + { + "epoch": 2.8239622227103007, + "grad_norm": 1.9952572584152222, + "learning_rate": 5e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7524369955062866, + "num_tokens": 665363342.0, + "step": 25715 + }, + { + "epoch": 2.8240720404129145, + "grad_norm": 2.0182175636291504, + "learning_rate": 5e-06, + "loss": 0.7062, + "mean_token_accuracy": 0.7714696526527405, + "num_tokens": 665393694.0, + "step": 25716 + }, + { + "epoch": 2.8241818581155282, + "grad_norm": 2.0482518672943115, + "learning_rate": 5e-06, + "loss": 0.6905, + "mean_token_accuracy": 0.7853071689605713, + "num_tokens": 665418168.0, + "step": 25717 + }, + { + "epoch": 2.824291675818142, + "grad_norm": 2.1424810886383057, + "learning_rate": 5e-06, + "loss": 0.6237, + "mean_token_accuracy": 0.794945240020752, + "num_tokens": 665440953.0, + "step": 25718 + }, + { + "epoch": 2.8244014935207558, + "grad_norm": 2.1228880882263184, + "learning_rate": 5e-06, + "loss": 0.8081, + "mean_token_accuracy": 0.750688910484314, + "num_tokens": 665469216.0, + "step": 25719 + }, + { + "epoch": 2.824511311223369, + "grad_norm": 1.8960996866226196, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7576395869255066, + "num_tokens": 665500492.0, + "step": 25720 + }, + { + "epoch": 2.824621128925983, + "grad_norm": 2.099008083343506, + "learning_rate": 5e-06, + "loss": 0.6865, + "mean_token_accuracy": 0.7749208807945251, + "num_tokens": 665527972.0, + "step": 25721 + }, + { + "epoch": 2.8247309466285966, + "grad_norm": 2.109382390975952, + "learning_rate": 5e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7524008750915527, + "num_tokens": 665554631.0, + "step": 25722 + }, + { + "epoch": 2.8248407643312103, + "grad_norm": 2.1263697147369385, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7417353391647339, + "num_tokens": 665581994.0, + "step": 25723 + }, + { + "epoch": 2.824950582033824, + "grad_norm": 2.271422863006592, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7578245997428894, + "num_tokens": 665606172.0, + "step": 25724 + }, + { + "epoch": 2.8250603997364374, + "grad_norm": 2.3731818199157715, + "learning_rate": 5e-06, + "loss": 0.6674, + "mean_token_accuracy": 0.7783421874046326, + "num_tokens": 665626826.0, + "step": 25725 + }, + { + "epoch": 2.825170217439051, + "grad_norm": 2.0424938201904297, + "learning_rate": 5e-06, + "loss": 0.6453, + "mean_token_accuracy": 0.785167932510376, + "num_tokens": 665650074.0, + "step": 25726 + }, + { + "epoch": 2.825280035141665, + "grad_norm": 2.5215890407562256, + "learning_rate": 5e-06, + "loss": 0.6752, + "mean_token_accuracy": 0.7729504704475403, + "num_tokens": 665669065.0, + "step": 25727 + }, + { + "epoch": 2.8253898528442782, + "grad_norm": 2.2531070709228516, + "learning_rate": 5e-06, + "loss": 0.7827, + "mean_token_accuracy": 0.742275059223175, + "num_tokens": 665693540.0, + "step": 25728 + }, + { + "epoch": 2.825499670546892, + "grad_norm": 2.339503049850464, + "learning_rate": 5e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7512602210044861, + "num_tokens": 665716264.0, + "step": 25729 + }, + { + "epoch": 2.8256094882495058, + "grad_norm": 2.0092670917510986, + "learning_rate": 5e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7562114000320435, + "num_tokens": 665743963.0, + "step": 25730 + }, + { + "epoch": 2.8257193059521195, + "grad_norm": 2.082854986190796, + "learning_rate": 5e-06, + "loss": 0.6768, + "mean_token_accuracy": 0.7707788944244385, + "num_tokens": 665770467.0, + "step": 25731 + }, + { + "epoch": 2.8258291236547333, + "grad_norm": 2.0728538036346436, + "learning_rate": 5e-06, + "loss": 0.6414, + "mean_token_accuracy": 0.7850677967071533, + "num_tokens": 665794765.0, + "step": 25732 + }, + { + "epoch": 2.8259389413573466, + "grad_norm": 2.1339354515075684, + "learning_rate": 5e-06, + "loss": 0.7197, + "mean_token_accuracy": 0.7668932676315308, + "num_tokens": 665819209.0, + "step": 25733 + }, + { + "epoch": 2.8260487590599603, + "grad_norm": 2.7095465660095215, + "learning_rate": 5e-06, + "loss": 0.6208, + "mean_token_accuracy": 0.7883567810058594, + "num_tokens": 665835934.0, + "step": 25734 + }, + { + "epoch": 2.826158576762574, + "grad_norm": 1.9576821327209473, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.753690779209137, + "num_tokens": 665864211.0, + "step": 25735 + }, + { + "epoch": 2.826268394465188, + "grad_norm": 2.1304333209991455, + "learning_rate": 5e-06, + "loss": 0.6453, + "mean_token_accuracy": 0.7861278057098389, + "num_tokens": 665888048.0, + "step": 25736 + }, + { + "epoch": 2.8263782121678016, + "grad_norm": 2.274226188659668, + "learning_rate": 5e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7868069410324097, + "num_tokens": 665909838.0, + "step": 25737 + }, + { + "epoch": 2.826488029870415, + "grad_norm": 1.9270859956741333, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7668209075927734, + "num_tokens": 665937783.0, + "step": 25738 + }, + { + "epoch": 2.8265978475730287, + "grad_norm": 1.870185136795044, + "learning_rate": 5e-06, + "loss": 0.6774, + "mean_token_accuracy": 0.7782933115959167, + "num_tokens": 665967450.0, + "step": 25739 + }, + { + "epoch": 2.8267076652756424, + "grad_norm": 2.1173839569091797, + "learning_rate": 5e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7581575512886047, + "num_tokens": 665996511.0, + "step": 25740 + }, + { + "epoch": 2.826817482978256, + "grad_norm": 2.053267002105713, + "learning_rate": 5e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7624698877334595, + "num_tokens": 666021986.0, + "step": 25741 + }, + { + "epoch": 2.82692730068087, + "grad_norm": 2.0713348388671875, + "learning_rate": 5e-06, + "loss": 0.7534, + "mean_token_accuracy": 0.755293607711792, + "num_tokens": 666046784.0, + "step": 25742 + }, + { + "epoch": 2.8270371183834833, + "grad_norm": 2.0905683040618896, + "learning_rate": 5e-06, + "loss": 0.6753, + "mean_token_accuracy": 0.7805287837982178, + "num_tokens": 666070603.0, + "step": 25743 + }, + { + "epoch": 2.827146936086097, + "grad_norm": 2.154618501663208, + "learning_rate": 5e-06, + "loss": 0.714, + "mean_token_accuracy": 0.7622072696685791, + "num_tokens": 666094307.0, + "step": 25744 + }, + { + "epoch": 2.8272567537887108, + "grad_norm": 2.2097909450531006, + "learning_rate": 5e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7462940216064453, + "num_tokens": 666119047.0, + "step": 25745 + }, + { + "epoch": 2.8273665714913245, + "grad_norm": 2.108687162399292, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7611321210861206, + "num_tokens": 666142757.0, + "step": 25746 + }, + { + "epoch": 2.8274763891939383, + "grad_norm": 2.291588544845581, + "learning_rate": 5e-06, + "loss": 0.6893, + "mean_token_accuracy": 0.7703062295913696, + "num_tokens": 666163993.0, + "step": 25747 + }, + { + "epoch": 2.8275862068965516, + "grad_norm": 1.9086687564849854, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7641202211380005, + "num_tokens": 666193255.0, + "step": 25748 + }, + { + "epoch": 2.8276960245991654, + "grad_norm": 2.0344293117523193, + "learning_rate": 5e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7470951676368713, + "num_tokens": 666218626.0, + "step": 25749 + }, + { + "epoch": 2.827805842301779, + "grad_norm": 1.9233403205871582, + "learning_rate": 5e-06, + "loss": 0.6711, + "mean_token_accuracy": 0.7753899097442627, + "num_tokens": 666248140.0, + "step": 25750 + }, + { + "epoch": 2.827915660004393, + "grad_norm": 2.052129030227661, + "learning_rate": 5e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7522897720336914, + "num_tokens": 666275393.0, + "step": 25751 + }, + { + "epoch": 2.8280254777070066, + "grad_norm": 2.2356925010681152, + "learning_rate": 5e-06, + "loss": 0.7303, + "mean_token_accuracy": 0.7611517906188965, + "num_tokens": 666299880.0, + "step": 25752 + }, + { + "epoch": 2.82813529540962, + "grad_norm": 2.194053888320923, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7493817806243896, + "num_tokens": 666326540.0, + "step": 25753 + }, + { + "epoch": 2.8282451131122337, + "grad_norm": 1.9402191638946533, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7534364461898804, + "num_tokens": 666356695.0, + "step": 25754 + }, + { + "epoch": 2.8283549308148475, + "grad_norm": 2.170729875564575, + "learning_rate": 5e-06, + "loss": 0.698, + "mean_token_accuracy": 0.7697856426239014, + "num_tokens": 666382142.0, + "step": 25755 + }, + { + "epoch": 2.8284647485174608, + "grad_norm": 1.9041000604629517, + "learning_rate": 5e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.733214259147644, + "num_tokens": 666412935.0, + "step": 25756 + }, + { + "epoch": 2.8285745662200745, + "grad_norm": 1.8887065649032593, + "learning_rate": 5e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.771598756313324, + "num_tokens": 666442406.0, + "step": 25757 + }, + { + "epoch": 2.8286843839226883, + "grad_norm": 2.3756282329559326, + "learning_rate": 5e-06, + "loss": 0.653, + "mean_token_accuracy": 0.7832821607589722, + "num_tokens": 666461323.0, + "step": 25758 + }, + { + "epoch": 2.828794201625302, + "grad_norm": 1.9083433151245117, + "learning_rate": 5e-06, + "loss": 0.6778, + "mean_token_accuracy": 0.7697558403015137, + "num_tokens": 666489156.0, + "step": 25759 + }, + { + "epoch": 2.828904019327916, + "grad_norm": 2.107245445251465, + "learning_rate": 5e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7447423934936523, + "num_tokens": 666513452.0, + "step": 25760 + }, + { + "epoch": 2.829013837030529, + "grad_norm": 1.843997836112976, + "learning_rate": 5e-06, + "loss": 0.7375, + "mean_token_accuracy": 0.7554100155830383, + "num_tokens": 666543330.0, + "step": 25761 + }, + { + "epoch": 2.829123654733143, + "grad_norm": 2.340099811553955, + "learning_rate": 5e-06, + "loss": 0.6207, + "mean_token_accuracy": 0.7889471054077148, + "num_tokens": 666563626.0, + "step": 25762 + }, + { + "epoch": 2.8292334724357566, + "grad_norm": 2.014362335205078, + "learning_rate": 5e-06, + "loss": 0.7141, + "mean_token_accuracy": 0.7609695196151733, + "num_tokens": 666589702.0, + "step": 25763 + }, + { + "epoch": 2.8293432901383704, + "grad_norm": 2.010802984237671, + "learning_rate": 5e-06, + "loss": 0.7566, + "mean_token_accuracy": 0.7540962100028992, + "num_tokens": 666615206.0, + "step": 25764 + }, + { + "epoch": 2.829453107840984, + "grad_norm": 1.8985017538070679, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7520750164985657, + "num_tokens": 666645479.0, + "step": 25765 + }, + { + "epoch": 2.8295629255435975, + "grad_norm": 2.0549046993255615, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7726948261260986, + "num_tokens": 666671518.0, + "step": 25766 + }, + { + "epoch": 2.829672743246211, + "grad_norm": 2.1310601234436035, + "learning_rate": 5e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.7612056732177734, + "num_tokens": 666694470.0, + "step": 25767 + }, + { + "epoch": 2.829782560948825, + "grad_norm": 2.116025924682617, + "learning_rate": 5e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.7726802825927734, + "num_tokens": 666717680.0, + "step": 25768 + }, + { + "epoch": 2.8298923786514387, + "grad_norm": 1.9184564352035522, + "learning_rate": 5e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.7448645830154419, + "num_tokens": 666749356.0, + "step": 25769 + }, + { + "epoch": 2.8300021963540525, + "grad_norm": 2.110645055770874, + "learning_rate": 5e-06, + "loss": 0.746, + "mean_token_accuracy": 0.7591649889945984, + "num_tokens": 666772297.0, + "step": 25770 + }, + { + "epoch": 2.830112014056666, + "grad_norm": 2.2023117542266846, + "learning_rate": 5e-06, + "loss": 0.7265, + "mean_token_accuracy": 0.7604219913482666, + "num_tokens": 666796207.0, + "step": 25771 + }, + { + "epoch": 2.8302218317592795, + "grad_norm": 1.9504876136779785, + "learning_rate": 5e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7554841041564941, + "num_tokens": 666825638.0, + "step": 25772 + }, + { + "epoch": 2.8303316494618933, + "grad_norm": 2.2386341094970703, + "learning_rate": 5e-06, + "loss": 0.6063, + "mean_token_accuracy": 0.7944167852401733, + "num_tokens": 666846024.0, + "step": 25773 + }, + { + "epoch": 2.830441467164507, + "grad_norm": 2.504767894744873, + "learning_rate": 5e-06, + "loss": 0.6332, + "mean_token_accuracy": 0.7909775972366333, + "num_tokens": 666864981.0, + "step": 25774 + }, + { + "epoch": 2.830551284867121, + "grad_norm": 2.0221073627471924, + "learning_rate": 5e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7738385200500488, + "num_tokens": 666892496.0, + "step": 25775 + }, + { + "epoch": 2.830661102569734, + "grad_norm": 2.0654282569885254, + "learning_rate": 5e-06, + "loss": 0.6387, + "mean_token_accuracy": 0.7820464372634888, + "num_tokens": 666916446.0, + "step": 25776 + }, + { + "epoch": 2.830770920272348, + "grad_norm": 2.001786947250366, + "learning_rate": 5e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7468584775924683, + "num_tokens": 666945096.0, + "step": 25777 + }, + { + "epoch": 2.8308807379749616, + "grad_norm": 2.3180577754974365, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7677323222160339, + "num_tokens": 666965709.0, + "step": 25778 + }, + { + "epoch": 2.830990555677575, + "grad_norm": 2.187113046646118, + "learning_rate": 5e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7669498920440674, + "num_tokens": 666986321.0, + "step": 25779 + }, + { + "epoch": 2.831100373380189, + "grad_norm": 1.8822154998779297, + "learning_rate": 5e-06, + "loss": 0.7649, + "mean_token_accuracy": 0.7501254081726074, + "num_tokens": 667017714.0, + "step": 25780 + }, + { + "epoch": 2.8312101910828025, + "grad_norm": 2.356671094894409, + "learning_rate": 5e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.7849465608596802, + "num_tokens": 667038606.0, + "step": 25781 + }, + { + "epoch": 2.8313200087854162, + "grad_norm": 1.8506484031677246, + "learning_rate": 5e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7719932794570923, + "num_tokens": 667067888.0, + "step": 25782 + }, + { + "epoch": 2.83142982648803, + "grad_norm": 2.0566136837005615, + "learning_rate": 5e-06, + "loss": 0.7626, + "mean_token_accuracy": 0.7553343772888184, + "num_tokens": 667093679.0, + "step": 25783 + }, + { + "epoch": 2.8315396441906433, + "grad_norm": 2.041353940963745, + "learning_rate": 5e-06, + "loss": 0.6604, + "mean_token_accuracy": 0.7785825729370117, + "num_tokens": 667119296.0, + "step": 25784 + }, + { + "epoch": 2.831649461893257, + "grad_norm": 2.0640621185302734, + "learning_rate": 5e-06, + "loss": 0.6059, + "mean_token_accuracy": 0.7910977005958557, + "num_tokens": 667143312.0, + "step": 25785 + }, + { + "epoch": 2.831759279595871, + "grad_norm": 1.8990919589996338, + "learning_rate": 5e-06, + "loss": 0.6993, + "mean_token_accuracy": 0.7665410041809082, + "num_tokens": 667174665.0, + "step": 25786 + }, + { + "epoch": 2.8318690972984846, + "grad_norm": 2.3155345916748047, + "learning_rate": 5e-06, + "loss": 0.6709, + "mean_token_accuracy": 0.7839847207069397, + "num_tokens": 667195479.0, + "step": 25787 + }, + { + "epoch": 2.8319789150010983, + "grad_norm": 1.914743185043335, + "learning_rate": 5e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7481057047843933, + "num_tokens": 667227527.0, + "step": 25788 + }, + { + "epoch": 2.8320887327037116, + "grad_norm": 1.989954948425293, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7392538785934448, + "num_tokens": 667257274.0, + "step": 25789 + }, + { + "epoch": 2.8321985504063254, + "grad_norm": 2.463192939758301, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7721192836761475, + "num_tokens": 667277189.0, + "step": 25790 + }, + { + "epoch": 2.832308368108939, + "grad_norm": 2.1196627616882324, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7587188482284546, + "num_tokens": 667305161.0, + "step": 25791 + }, + { + "epoch": 2.832418185811553, + "grad_norm": 2.29085111618042, + "learning_rate": 5e-06, + "loss": 0.6715, + "mean_token_accuracy": 0.7779233455657959, + "num_tokens": 667327702.0, + "step": 25792 + }, + { + "epoch": 2.8325280035141667, + "grad_norm": 2.020857095718384, + "learning_rate": 5e-06, + "loss": 0.6177, + "mean_token_accuracy": 0.79087233543396, + "num_tokens": 667352581.0, + "step": 25793 + }, + { + "epoch": 2.83263782121678, + "grad_norm": 2.075504779815674, + "learning_rate": 5e-06, + "loss": 0.693, + "mean_token_accuracy": 0.7778326272964478, + "num_tokens": 667377135.0, + "step": 25794 + }, + { + "epoch": 2.8327476389193937, + "grad_norm": 2.307168483734131, + "learning_rate": 5e-06, + "loss": 0.6061, + "mean_token_accuracy": 0.7969039678573608, + "num_tokens": 667396249.0, + "step": 25795 + }, + { + "epoch": 2.8328574566220075, + "grad_norm": 2.249199151992798, + "learning_rate": 5e-06, + "loss": 0.7509, + "mean_token_accuracy": 0.7505494356155396, + "num_tokens": 667418699.0, + "step": 25796 + }, + { + "epoch": 2.8329672743246213, + "grad_norm": 2.028892993927002, + "learning_rate": 5e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7465632557868958, + "num_tokens": 667447211.0, + "step": 25797 + }, + { + "epoch": 2.833077092027235, + "grad_norm": 1.9719347953796387, + "learning_rate": 5e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7631070613861084, + "num_tokens": 667473155.0, + "step": 25798 + }, + { + "epoch": 2.8331869097298483, + "grad_norm": 2.0700430870056152, + "learning_rate": 5e-06, + "loss": 0.7962, + "mean_token_accuracy": 0.7443913221359253, + "num_tokens": 667500951.0, + "step": 25799 + }, + { + "epoch": 2.833296727432462, + "grad_norm": 2.1558871269226074, + "learning_rate": 5e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7638788223266602, + "num_tokens": 667526141.0, + "step": 25800 + }, + { + "epoch": 2.833406545135076, + "grad_norm": 2.1141600608825684, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7595500349998474, + "num_tokens": 667552836.0, + "step": 25801 + }, + { + "epoch": 2.8335163628376896, + "grad_norm": 1.8823987245559692, + "learning_rate": 5e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7626032829284668, + "num_tokens": 667583391.0, + "step": 25802 + }, + { + "epoch": 2.8336261805403034, + "grad_norm": 2.0174295902252197, + "learning_rate": 5e-06, + "loss": 0.7635, + "mean_token_accuracy": 0.7515662908554077, + "num_tokens": 667608631.0, + "step": 25803 + }, + { + "epoch": 2.8337359982429167, + "grad_norm": 2.0023860931396484, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.76402747631073, + "num_tokens": 667635243.0, + "step": 25804 + }, + { + "epoch": 2.8338458159455304, + "grad_norm": 2.0979230403900146, + "learning_rate": 5e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7610586881637573, + "num_tokens": 667662644.0, + "step": 25805 + }, + { + "epoch": 2.833955633648144, + "grad_norm": 2.2578296661376953, + "learning_rate": 5e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7651802897453308, + "num_tokens": 667687605.0, + "step": 25806 + }, + { + "epoch": 2.8340654513507575, + "grad_norm": 2.3022220134735107, + "learning_rate": 5e-06, + "loss": 0.6838, + "mean_token_accuracy": 0.7684226036071777, + "num_tokens": 667709054.0, + "step": 25807 + }, + { + "epoch": 2.8341752690533712, + "grad_norm": 1.947224497795105, + "learning_rate": 5e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7673954963684082, + "num_tokens": 667737527.0, + "step": 25808 + }, + { + "epoch": 2.834285086755985, + "grad_norm": 1.9784014225006104, + "learning_rate": 5e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7499746084213257, + "num_tokens": 667765683.0, + "step": 25809 + }, + { + "epoch": 2.8343949044585988, + "grad_norm": 2.3026838302612305, + "learning_rate": 5e-06, + "loss": 0.623, + "mean_token_accuracy": 0.7909443378448486, + "num_tokens": 667788150.0, + "step": 25810 + }, + { + "epoch": 2.8345047221612125, + "grad_norm": 2.105825662612915, + "learning_rate": 5e-06, + "loss": 0.6995, + "mean_token_accuracy": 0.7761154174804688, + "num_tokens": 667812041.0, + "step": 25811 + }, + { + "epoch": 2.834614539863826, + "grad_norm": 2.2006993293762207, + "learning_rate": 5e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.7794350981712341, + "num_tokens": 667835143.0, + "step": 25812 + }, + { + "epoch": 2.8347243575664396, + "grad_norm": 1.9999687671661377, + "learning_rate": 5e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7568424940109253, + "num_tokens": 667862387.0, + "step": 25813 + }, + { + "epoch": 2.8348341752690533, + "grad_norm": 2.3474924564361572, + "learning_rate": 5e-06, + "loss": 0.7398, + "mean_token_accuracy": 0.7628461122512817, + "num_tokens": 667882955.0, + "step": 25814 + }, + { + "epoch": 2.834943992971667, + "grad_norm": 1.7243263721466064, + "learning_rate": 5e-06, + "loss": 0.7378, + "mean_token_accuracy": 0.7561092376708984, + "num_tokens": 667917646.0, + "step": 25815 + }, + { + "epoch": 2.835053810674281, + "grad_norm": 1.808349609375, + "learning_rate": 5e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7480639815330505, + "num_tokens": 667950080.0, + "step": 25816 + }, + { + "epoch": 2.835163628376894, + "grad_norm": 2.005483627319336, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7589006423950195, + "num_tokens": 667978380.0, + "step": 25817 + }, + { + "epoch": 2.835273446079508, + "grad_norm": 2.005383014678955, + "learning_rate": 5e-06, + "loss": 0.7006, + "mean_token_accuracy": 0.7699138522148132, + "num_tokens": 668004881.0, + "step": 25818 + }, + { + "epoch": 2.8353832637821217, + "grad_norm": 2.1687722206115723, + "learning_rate": 5e-06, + "loss": 0.6009, + "mean_token_accuracy": 0.7950934171676636, + "num_tokens": 668026237.0, + "step": 25819 + }, + { + "epoch": 2.8354930814847354, + "grad_norm": 1.8820565938949585, + "learning_rate": 5e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7686299085617065, + "num_tokens": 668053996.0, + "step": 25820 + }, + { + "epoch": 2.835602899187349, + "grad_norm": 2.1409192085266113, + "learning_rate": 5e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7617447376251221, + "num_tokens": 668079861.0, + "step": 25821 + }, + { + "epoch": 2.8357127168899625, + "grad_norm": 2.3255958557128906, + "learning_rate": 5e-06, + "loss": 0.6346, + "mean_token_accuracy": 0.7839617729187012, + "num_tokens": 668100889.0, + "step": 25822 + }, + { + "epoch": 2.8358225345925763, + "grad_norm": 1.960480809211731, + "learning_rate": 5e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7425404787063599, + "num_tokens": 668130121.0, + "step": 25823 + }, + { + "epoch": 2.83593235229519, + "grad_norm": 1.9431813955307007, + "learning_rate": 5e-06, + "loss": 0.6072, + "mean_token_accuracy": 0.7907804250717163, + "num_tokens": 668155650.0, + "step": 25824 + }, + { + "epoch": 2.836042169997804, + "grad_norm": 2.245821952819824, + "learning_rate": 5e-06, + "loss": 0.6921, + "mean_token_accuracy": 0.781222939491272, + "num_tokens": 668177840.0, + "step": 25825 + }, + { + "epoch": 2.8361519877004175, + "grad_norm": 1.881220817565918, + "learning_rate": 5e-06, + "loss": 0.6658, + "mean_token_accuracy": 0.7823305130004883, + "num_tokens": 668203644.0, + "step": 25826 + }, + { + "epoch": 2.836261805403031, + "grad_norm": 1.9203726053237915, + "learning_rate": 5e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7571990489959717, + "num_tokens": 668232184.0, + "step": 25827 + }, + { + "epoch": 2.8363716231056446, + "grad_norm": 1.9139361381530762, + "learning_rate": 5e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7468206882476807, + "num_tokens": 668260560.0, + "step": 25828 + }, + { + "epoch": 2.8364814408082584, + "grad_norm": 2.210379123687744, + "learning_rate": 5e-06, + "loss": 0.7294, + "mean_token_accuracy": 0.7647850513458252, + "num_tokens": 668283318.0, + "step": 25829 + }, + { + "epoch": 2.8365912585108717, + "grad_norm": 2.0055832862854004, + "learning_rate": 5e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7664682269096375, + "num_tokens": 668309296.0, + "step": 25830 + }, + { + "epoch": 2.836701076213486, + "grad_norm": 2.0373568534851074, + "learning_rate": 5e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7653876543045044, + "num_tokens": 668335793.0, + "step": 25831 + }, + { + "epoch": 2.836810893916099, + "grad_norm": 2.082016944885254, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7653194665908813, + "num_tokens": 668362095.0, + "step": 25832 + }, + { + "epoch": 2.836920711618713, + "grad_norm": 2.0342092514038086, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7519500255584717, + "num_tokens": 668389059.0, + "step": 25833 + }, + { + "epoch": 2.8370305293213267, + "grad_norm": 2.242988348007202, + "learning_rate": 5e-06, + "loss": 0.6332, + "mean_token_accuracy": 0.7847914695739746, + "num_tokens": 668411056.0, + "step": 25834 + }, + { + "epoch": 2.83714034702394, + "grad_norm": 2.100768804550171, + "learning_rate": 5e-06, + "loss": 0.5887, + "mean_token_accuracy": 0.7983412742614746, + "num_tokens": 668435041.0, + "step": 25835 + }, + { + "epoch": 2.837250164726554, + "grad_norm": 2.3476150035858154, + "learning_rate": 5e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.7735786437988281, + "num_tokens": 668457195.0, + "step": 25836 + }, + { + "epoch": 2.8373599824291675, + "grad_norm": 2.3805456161499023, + "learning_rate": 5e-06, + "loss": 0.6679, + "mean_token_accuracy": 0.7750523090362549, + "num_tokens": 668477337.0, + "step": 25837 + }, + { + "epoch": 2.8374698001317813, + "grad_norm": 2.200356960296631, + "learning_rate": 5e-06, + "loss": 0.6955, + "mean_token_accuracy": 0.7684584856033325, + "num_tokens": 668500760.0, + "step": 25838 + }, + { + "epoch": 2.837579617834395, + "grad_norm": 2.294278144836426, + "learning_rate": 5e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7563577890396118, + "num_tokens": 668521773.0, + "step": 25839 + }, + { + "epoch": 2.8376894355370084, + "grad_norm": 1.9738179445266724, + "learning_rate": 5e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7411452531814575, + "num_tokens": 668551833.0, + "step": 25840 + }, + { + "epoch": 2.837799253239622, + "grad_norm": 1.9735026359558105, + "learning_rate": 5e-06, + "loss": 0.7225, + "mean_token_accuracy": 0.7631134986877441, + "num_tokens": 668582063.0, + "step": 25841 + }, + { + "epoch": 2.837909070942236, + "grad_norm": 2.4320197105407715, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.775732159614563, + "num_tokens": 668600761.0, + "step": 25842 + }, + { + "epoch": 2.8380188886448496, + "grad_norm": 2.1881816387176514, + "learning_rate": 5e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7519159317016602, + "num_tokens": 668625191.0, + "step": 25843 + }, + { + "epoch": 2.8381287063474634, + "grad_norm": 2.2090373039245605, + "learning_rate": 5e-06, + "loss": 0.7514, + "mean_token_accuracy": 0.7548730373382568, + "num_tokens": 668648360.0, + "step": 25844 + }, + { + "epoch": 2.8382385240500767, + "grad_norm": 2.0772387981414795, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.745050311088562, + "num_tokens": 668675274.0, + "step": 25845 + }, + { + "epoch": 2.8383483417526905, + "grad_norm": 2.241137742996216, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7663777470588684, + "num_tokens": 668696536.0, + "step": 25846 + }, + { + "epoch": 2.838458159455304, + "grad_norm": 1.9285372495651245, + "learning_rate": 5e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7614837884902954, + "num_tokens": 668726340.0, + "step": 25847 + }, + { + "epoch": 2.838567977157918, + "grad_norm": 2.0269861221313477, + "learning_rate": 5e-06, + "loss": 0.6554, + "mean_token_accuracy": 0.7814282178878784, + "num_tokens": 668751576.0, + "step": 25848 + }, + { + "epoch": 2.8386777948605317, + "grad_norm": 2.316927909851074, + "learning_rate": 5e-06, + "loss": 0.6496, + "mean_token_accuracy": 0.786490797996521, + "num_tokens": 668773430.0, + "step": 25849 + }, + { + "epoch": 2.838787612563145, + "grad_norm": 1.991982340812683, + "learning_rate": 5e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7713861465454102, + "num_tokens": 668801140.0, + "step": 25850 + }, + { + "epoch": 2.838897430265759, + "grad_norm": 2.001638650894165, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7608790993690491, + "num_tokens": 668830216.0, + "step": 25851 + }, + { + "epoch": 2.8390072479683726, + "grad_norm": 1.9492087364196777, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7454875111579895, + "num_tokens": 668860349.0, + "step": 25852 + }, + { + "epoch": 2.8391170656709863, + "grad_norm": 2.569307565689087, + "learning_rate": 5e-06, + "loss": 0.6257, + "mean_token_accuracy": 0.7879036664962769, + "num_tokens": 668877445.0, + "step": 25853 + }, + { + "epoch": 2.8392268833736, + "grad_norm": 2.1334950923919678, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7476762533187866, + "num_tokens": 668905621.0, + "step": 25854 + }, + { + "epoch": 2.8393367010762134, + "grad_norm": 1.8185666799545288, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7565041780471802, + "num_tokens": 668934122.0, + "step": 25855 + }, + { + "epoch": 2.839446518778827, + "grad_norm": 1.966271162033081, + "learning_rate": 5e-06, + "loss": 0.7151, + "mean_token_accuracy": 0.7603290677070618, + "num_tokens": 668960896.0, + "step": 25856 + }, + { + "epoch": 2.839556336481441, + "grad_norm": 2.0378732681274414, + "learning_rate": 5e-06, + "loss": 0.7817, + "mean_token_accuracy": 0.7445536255836487, + "num_tokens": 668988633.0, + "step": 25857 + }, + { + "epoch": 2.839666154184054, + "grad_norm": 2.3979880809783936, + "learning_rate": 5e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.7722615003585815, + "num_tokens": 669008701.0, + "step": 25858 + }, + { + "epoch": 2.839775971886668, + "grad_norm": 2.098572015762329, + "learning_rate": 5e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.769235372543335, + "num_tokens": 669032891.0, + "step": 25859 + }, + { + "epoch": 2.8398857895892817, + "grad_norm": 2.3262195587158203, + "learning_rate": 5e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7661759257316589, + "num_tokens": 669055152.0, + "step": 25860 + }, + { + "epoch": 2.8399956072918955, + "grad_norm": 2.1174020767211914, + "learning_rate": 5e-06, + "loss": 0.6409, + "mean_token_accuracy": 0.7844741344451904, + "num_tokens": 669078754.0, + "step": 25861 + }, + { + "epoch": 2.8401054249945092, + "grad_norm": 2.1863064765930176, + "learning_rate": 5e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7398815155029297, + "num_tokens": 669104316.0, + "step": 25862 + }, + { + "epoch": 2.8402152426971226, + "grad_norm": 2.2298202514648438, + "learning_rate": 5e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7657108902931213, + "num_tokens": 669125631.0, + "step": 25863 + }, + { + "epoch": 2.8403250603997363, + "grad_norm": 1.9625942707061768, + "learning_rate": 5e-06, + "loss": 0.787, + "mean_token_accuracy": 0.742721438407898, + "num_tokens": 669154642.0, + "step": 25864 + }, + { + "epoch": 2.84043487810235, + "grad_norm": 1.9265570640563965, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.760439395904541, + "num_tokens": 669184777.0, + "step": 25865 + }, + { + "epoch": 2.840544695804964, + "grad_norm": 2.1984357833862305, + "learning_rate": 5e-06, + "loss": 0.6471, + "mean_token_accuracy": 0.7809462547302246, + "num_tokens": 669206183.0, + "step": 25866 + }, + { + "epoch": 2.8406545135075776, + "grad_norm": 1.9310410022735596, + "learning_rate": 5e-06, + "loss": 0.7157, + "mean_token_accuracy": 0.7756108045578003, + "num_tokens": 669232728.0, + "step": 25867 + }, + { + "epoch": 2.840764331210191, + "grad_norm": 1.962297797203064, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.761725902557373, + "num_tokens": 669260849.0, + "step": 25868 + }, + { + "epoch": 2.8408741489128047, + "grad_norm": 2.123556137084961, + "learning_rate": 5e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7596083283424377, + "num_tokens": 669287247.0, + "step": 25869 + }, + { + "epoch": 2.8409839666154184, + "grad_norm": 1.8524119853973389, + "learning_rate": 5e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7432814836502075, + "num_tokens": 669318421.0, + "step": 25870 + }, + { + "epoch": 2.841093784318032, + "grad_norm": 1.9370005130767822, + "learning_rate": 5e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7383044958114624, + "num_tokens": 669349361.0, + "step": 25871 + }, + { + "epoch": 2.841203602020646, + "grad_norm": 1.9247632026672363, + "learning_rate": 5e-06, + "loss": 0.7652, + "mean_token_accuracy": 0.7480354309082031, + "num_tokens": 669377209.0, + "step": 25872 + }, + { + "epoch": 2.8413134197232592, + "grad_norm": 2.0118703842163086, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7653615474700928, + "num_tokens": 669404066.0, + "step": 25873 + }, + { + "epoch": 2.841423237425873, + "grad_norm": 2.051074266433716, + "learning_rate": 5e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7341710925102234, + "num_tokens": 669430547.0, + "step": 25874 + }, + { + "epoch": 2.8415330551284868, + "grad_norm": 1.7327237129211426, + "learning_rate": 5e-06, + "loss": 0.7625, + "mean_token_accuracy": 0.7574024200439453, + "num_tokens": 669465359.0, + "step": 25875 + }, + { + "epoch": 2.8416428728311005, + "grad_norm": 1.9678304195404053, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7507214546203613, + "num_tokens": 669493948.0, + "step": 25876 + }, + { + "epoch": 2.8417526905337143, + "grad_norm": 2.053493022918701, + "learning_rate": 5e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.751200795173645, + "num_tokens": 669519657.0, + "step": 25877 + }, + { + "epoch": 2.8418625082363276, + "grad_norm": 2.132904052734375, + "learning_rate": 5e-06, + "loss": 0.7557, + "mean_token_accuracy": 0.7536448240280151, + "num_tokens": 669544924.0, + "step": 25878 + }, + { + "epoch": 2.8419723259389413, + "grad_norm": 1.8877347707748413, + "learning_rate": 5e-06, + "loss": 0.7646, + "mean_token_accuracy": 0.7548506259918213, + "num_tokens": 669575899.0, + "step": 25879 + }, + { + "epoch": 2.842082143641555, + "grad_norm": 1.7781798839569092, + "learning_rate": 5e-06, + "loss": 0.678, + "mean_token_accuracy": 0.7776750922203064, + "num_tokens": 669607358.0, + "step": 25880 + }, + { + "epoch": 2.8421919613441684, + "grad_norm": 2.2710442543029785, + "learning_rate": 5e-06, + "loss": 0.749, + "mean_token_accuracy": 0.7669738531112671, + "num_tokens": 669629901.0, + "step": 25881 + }, + { + "epoch": 2.8423017790467826, + "grad_norm": 2.0436315536499023, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.784286618232727, + "num_tokens": 669657115.0, + "step": 25882 + }, + { + "epoch": 2.842411596749396, + "grad_norm": 2.2660775184631348, + "learning_rate": 5e-06, + "loss": 0.6838, + "mean_token_accuracy": 0.7732955813407898, + "num_tokens": 669678992.0, + "step": 25883 + }, + { + "epoch": 2.8425214144520097, + "grad_norm": 2.1377480030059814, + "learning_rate": 5e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.7851049900054932, + "num_tokens": 669700295.0, + "step": 25884 + }, + { + "epoch": 2.8426312321546234, + "grad_norm": 2.313030242919922, + "learning_rate": 5e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7665122151374817, + "num_tokens": 669721578.0, + "step": 25885 + }, + { + "epoch": 2.8427410498572367, + "grad_norm": 2.062988042831421, + "learning_rate": 5e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.7767630815505981, + "num_tokens": 669748353.0, + "step": 25886 + }, + { + "epoch": 2.8428508675598505, + "grad_norm": 2.0697684288024902, + "learning_rate": 5e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7589970231056213, + "num_tokens": 669772720.0, + "step": 25887 + }, + { + "epoch": 2.8429606852624643, + "grad_norm": 2.0492799282073975, + "learning_rate": 5e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7505453824996948, + "num_tokens": 669799058.0, + "step": 25888 + }, + { + "epoch": 2.843070502965078, + "grad_norm": 2.3277111053466797, + "learning_rate": 5e-06, + "loss": 0.6513, + "mean_token_accuracy": 0.7860288619995117, + "num_tokens": 669820169.0, + "step": 25889 + }, + { + "epoch": 2.8431803206676918, + "grad_norm": 2.1070661544799805, + "learning_rate": 5e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.7792562246322632, + "num_tokens": 669843923.0, + "step": 25890 + }, + { + "epoch": 2.843290138370305, + "grad_norm": 2.0413966178894043, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7609183192253113, + "num_tokens": 669868789.0, + "step": 25891 + }, + { + "epoch": 2.843399956072919, + "grad_norm": 2.0591180324554443, + "learning_rate": 5e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7490541338920593, + "num_tokens": 669893000.0, + "step": 25892 + }, + { + "epoch": 2.8435097737755326, + "grad_norm": 2.118001937866211, + "learning_rate": 5e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7573550939559937, + "num_tokens": 669917409.0, + "step": 25893 + }, + { + "epoch": 2.8436195914781464, + "grad_norm": 2.308711290359497, + "learning_rate": 5e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.7473961114883423, + "num_tokens": 669942695.0, + "step": 25894 + }, + { + "epoch": 2.84372940918076, + "grad_norm": 2.115816831588745, + "learning_rate": 5e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.754867434501648, + "num_tokens": 669966696.0, + "step": 25895 + }, + { + "epoch": 2.8438392268833734, + "grad_norm": 2.22692608833313, + "learning_rate": 5e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.7509917616844177, + "num_tokens": 669991434.0, + "step": 25896 + }, + { + "epoch": 2.843949044585987, + "grad_norm": 1.840380072593689, + "learning_rate": 5e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7498032450675964, + "num_tokens": 670026568.0, + "step": 25897 + }, + { + "epoch": 2.844058862288601, + "grad_norm": 2.1763877868652344, + "learning_rate": 5e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7555707097053528, + "num_tokens": 670050675.0, + "step": 25898 + }, + { + "epoch": 2.8441686799912147, + "grad_norm": 2.2470109462738037, + "learning_rate": 5e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7686456441879272, + "num_tokens": 670071978.0, + "step": 25899 + }, + { + "epoch": 2.8442784976938285, + "grad_norm": 2.306534767150879, + "learning_rate": 5e-06, + "loss": 0.7288, + "mean_token_accuracy": 0.7777462005615234, + "num_tokens": 670092278.0, + "step": 25900 + }, + { + "epoch": 2.8443883153964418, + "grad_norm": 2.0918493270874023, + "learning_rate": 5e-06, + "loss": 0.6788, + "mean_token_accuracy": 0.7707824110984802, + "num_tokens": 670115657.0, + "step": 25901 + }, + { + "epoch": 2.8444981330990555, + "grad_norm": 2.0173511505126953, + "learning_rate": 5e-06, + "loss": 0.7586, + "mean_token_accuracy": 0.7513371706008911, + "num_tokens": 670145691.0, + "step": 25902 + }, + { + "epoch": 2.8446079508016693, + "grad_norm": 1.9134163856506348, + "learning_rate": 5e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.7413567304611206, + "num_tokens": 670176219.0, + "step": 25903 + }, + { + "epoch": 2.844717768504283, + "grad_norm": 1.9113426208496094, + "learning_rate": 5e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7574966549873352, + "num_tokens": 670203947.0, + "step": 25904 + }, + { + "epoch": 2.844827586206897, + "grad_norm": 2.262479305267334, + "learning_rate": 5e-06, + "loss": 0.7124, + "mean_token_accuracy": 0.7642919421195984, + "num_tokens": 670226712.0, + "step": 25905 + }, + { + "epoch": 2.84493740390951, + "grad_norm": 2.2100040912628174, + "learning_rate": 5e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7501277923583984, + "num_tokens": 670254098.0, + "step": 25906 + }, + { + "epoch": 2.845047221612124, + "grad_norm": 2.044853925704956, + "learning_rate": 5e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7551850080490112, + "num_tokens": 670280851.0, + "step": 25907 + }, + { + "epoch": 2.8451570393147376, + "grad_norm": 2.004769802093506, + "learning_rate": 5e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7596051096916199, + "num_tokens": 670307955.0, + "step": 25908 + }, + { + "epoch": 2.845266857017351, + "grad_norm": 1.8517229557037354, + "learning_rate": 5e-06, + "loss": 0.6159, + "mean_token_accuracy": 0.7862317562103271, + "num_tokens": 670333481.0, + "step": 25909 + }, + { + "epoch": 2.8453766747199647, + "grad_norm": 2.297992467880249, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.7676308751106262, + "num_tokens": 670356436.0, + "step": 25910 + }, + { + "epoch": 2.8454864924225785, + "grad_norm": 1.9279788732528687, + "learning_rate": 5e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.764959990978241, + "num_tokens": 670382988.0, + "step": 25911 + }, + { + "epoch": 2.845596310125192, + "grad_norm": 2.160026788711548, + "learning_rate": 5e-06, + "loss": 0.7064, + "mean_token_accuracy": 0.7634103298187256, + "num_tokens": 670405507.0, + "step": 25912 + }, + { + "epoch": 2.845706127827806, + "grad_norm": 1.9805628061294556, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7572364807128906, + "num_tokens": 670433279.0, + "step": 25913 + }, + { + "epoch": 2.8458159455304193, + "grad_norm": 1.9954276084899902, + "learning_rate": 5e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7397817373275757, + "num_tokens": 670462109.0, + "step": 25914 + }, + { + "epoch": 2.845925763233033, + "grad_norm": 1.8936296701431274, + "learning_rate": 5e-06, + "loss": 0.7336, + "mean_token_accuracy": 0.7587403059005737, + "num_tokens": 670492538.0, + "step": 25915 + }, + { + "epoch": 2.846035580935647, + "grad_norm": 2.3503293991088867, + "learning_rate": 5e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7430276870727539, + "num_tokens": 670516265.0, + "step": 25916 + }, + { + "epoch": 2.8461453986382605, + "grad_norm": 1.9915063381195068, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7543323636054993, + "num_tokens": 670544265.0, + "step": 25917 + }, + { + "epoch": 2.8462552163408743, + "grad_norm": 2.1510777473449707, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7745164632797241, + "num_tokens": 670566928.0, + "step": 25918 + }, + { + "epoch": 2.8463650340434876, + "grad_norm": 1.9674005508422852, + "learning_rate": 5e-06, + "loss": 0.6859, + "mean_token_accuracy": 0.7749927639961243, + "num_tokens": 670594152.0, + "step": 25919 + }, + { + "epoch": 2.8464748517461014, + "grad_norm": 1.9488791227340698, + "learning_rate": 5e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.7587809562683105, + "num_tokens": 670623492.0, + "step": 25920 + }, + { + "epoch": 2.846584669448715, + "grad_norm": 1.8703609704971313, + "learning_rate": 5e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.748660683631897, + "num_tokens": 670653977.0, + "step": 25921 + }, + { + "epoch": 2.846694487151329, + "grad_norm": 2.1516358852386475, + "learning_rate": 5e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7661855220794678, + "num_tokens": 670678297.0, + "step": 25922 + }, + { + "epoch": 2.8468043048539426, + "grad_norm": 1.9955796003341675, + "learning_rate": 5e-06, + "loss": 0.735, + "mean_token_accuracy": 0.7587110996246338, + "num_tokens": 670703768.0, + "step": 25923 + }, + { + "epoch": 2.846914122556556, + "grad_norm": 1.9508627653121948, + "learning_rate": 5e-06, + "loss": 0.7114, + "mean_token_accuracy": 0.7677192687988281, + "num_tokens": 670731342.0, + "step": 25924 + }, + { + "epoch": 2.8470239402591697, + "grad_norm": 2.218881607055664, + "learning_rate": 5e-06, + "loss": 0.6977, + "mean_token_accuracy": 0.7719159722328186, + "num_tokens": 670754973.0, + "step": 25925 + }, + { + "epoch": 2.8471337579617835, + "grad_norm": 2.093053102493286, + "learning_rate": 5e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7617294788360596, + "num_tokens": 670780200.0, + "step": 25926 + }, + { + "epoch": 2.8472435756643972, + "grad_norm": 2.126293182373047, + "learning_rate": 5e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7694671750068665, + "num_tokens": 670804000.0, + "step": 25927 + }, + { + "epoch": 2.847353393367011, + "grad_norm": 1.928609848022461, + "learning_rate": 5e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.759843647480011, + "num_tokens": 670832015.0, + "step": 25928 + }, + { + "epoch": 2.8474632110696243, + "grad_norm": 1.9975336790084839, + "learning_rate": 5e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7459391355514526, + "num_tokens": 670859763.0, + "step": 25929 + }, + { + "epoch": 2.847573028772238, + "grad_norm": 2.145016670227051, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7667903900146484, + "num_tokens": 670883875.0, + "step": 25930 + }, + { + "epoch": 2.847682846474852, + "grad_norm": 2.0571770668029785, + "learning_rate": 5e-06, + "loss": 0.6962, + "mean_token_accuracy": 0.7692767381668091, + "num_tokens": 670910534.0, + "step": 25931 + }, + { + "epoch": 2.8477926641774656, + "grad_norm": 1.888569712638855, + "learning_rate": 5e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7608709335327148, + "num_tokens": 670942740.0, + "step": 25932 + }, + { + "epoch": 2.8479024818800793, + "grad_norm": 2.176217794418335, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.752707302570343, + "num_tokens": 670967268.0, + "step": 25933 + }, + { + "epoch": 2.8480122995826926, + "grad_norm": 2.331265449523926, + "learning_rate": 5e-06, + "loss": 0.679, + "mean_token_accuracy": 0.776434063911438, + "num_tokens": 670987821.0, + "step": 25934 + }, + { + "epoch": 2.8481221172853064, + "grad_norm": 2.0802242755889893, + "learning_rate": 5e-06, + "loss": 0.6886, + "mean_token_accuracy": 0.774405837059021, + "num_tokens": 671010541.0, + "step": 25935 + }, + { + "epoch": 2.84823193498792, + "grad_norm": 2.402304172515869, + "learning_rate": 5e-06, + "loss": 0.6363, + "mean_token_accuracy": 0.7863123416900635, + "num_tokens": 671030364.0, + "step": 25936 + }, + { + "epoch": 2.8483417526905335, + "grad_norm": 1.9325964450836182, + "learning_rate": 5e-06, + "loss": 0.7682, + "mean_token_accuracy": 0.7506070733070374, + "num_tokens": 671060158.0, + "step": 25937 + }, + { + "epoch": 2.8484515703931472, + "grad_norm": 1.961242914199829, + "learning_rate": 5e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.755300760269165, + "num_tokens": 671089415.0, + "step": 25938 + }, + { + "epoch": 2.848561388095761, + "grad_norm": 2.1294004917144775, + "learning_rate": 5e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7526552677154541, + "num_tokens": 671115296.0, + "step": 25939 + }, + { + "epoch": 2.8486712057983747, + "grad_norm": 2.1703250408172607, + "learning_rate": 5e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.768122673034668, + "num_tokens": 671139297.0, + "step": 25940 + }, + { + "epoch": 2.8487810235009885, + "grad_norm": 1.922322154045105, + "learning_rate": 5e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7526378631591797, + "num_tokens": 671167630.0, + "step": 25941 + }, + { + "epoch": 2.848890841203602, + "grad_norm": 2.126204252243042, + "learning_rate": 5e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.753146767616272, + "num_tokens": 671193279.0, + "step": 25942 + }, + { + "epoch": 2.8490006589062156, + "grad_norm": 2.128337860107422, + "learning_rate": 5e-06, + "loss": 0.7423, + "mean_token_accuracy": 0.7556062936782837, + "num_tokens": 671219005.0, + "step": 25943 + }, + { + "epoch": 2.8491104766088293, + "grad_norm": 1.9088956117630005, + "learning_rate": 5e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7339552640914917, + "num_tokens": 671250744.0, + "step": 25944 + }, + { + "epoch": 2.849220294311443, + "grad_norm": 2.2162253856658936, + "learning_rate": 5e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7324205040931702, + "num_tokens": 671278048.0, + "step": 25945 + }, + { + "epoch": 2.849330112014057, + "grad_norm": 2.0689635276794434, + "learning_rate": 5e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7458755970001221, + "num_tokens": 671309117.0, + "step": 25946 + }, + { + "epoch": 2.84943992971667, + "grad_norm": 2.0187795162200928, + "learning_rate": 5e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.7494295835494995, + "num_tokens": 671335719.0, + "step": 25947 + }, + { + "epoch": 2.849549747419284, + "grad_norm": 2.120943069458008, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.7650518417358398, + "num_tokens": 671358676.0, + "step": 25948 + }, + { + "epoch": 2.8496595651218977, + "grad_norm": 2.3767406940460205, + "learning_rate": 5e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7640844583511353, + "num_tokens": 671381680.0, + "step": 25949 + }, + { + "epoch": 2.8497693828245114, + "grad_norm": 1.9656089544296265, + "learning_rate": 5e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.769898533821106, + "num_tokens": 671408940.0, + "step": 25950 + }, + { + "epoch": 2.849879200527125, + "grad_norm": 2.2976906299591064, + "learning_rate": 5e-06, + "loss": 0.6707, + "mean_token_accuracy": 0.7782939672470093, + "num_tokens": 671431381.0, + "step": 25951 + }, + { + "epoch": 2.8499890182297385, + "grad_norm": 2.0899457931518555, + "learning_rate": 5e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7496119737625122, + "num_tokens": 671459593.0, + "step": 25952 + }, + { + "epoch": 2.8500988359323522, + "grad_norm": 2.0423126220703125, + "learning_rate": 5e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7450116276741028, + "num_tokens": 671490113.0, + "step": 25953 + }, + { + "epoch": 2.850208653634966, + "grad_norm": 2.2029924392700195, + "learning_rate": 5e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.778793215751648, + "num_tokens": 671511583.0, + "step": 25954 + }, + { + "epoch": 2.8503184713375798, + "grad_norm": 2.146073579788208, + "learning_rate": 5e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7556197643280029, + "num_tokens": 671537608.0, + "step": 25955 + }, + { + "epoch": 2.8504282890401935, + "grad_norm": 2.116380214691162, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7662805318832397, + "num_tokens": 671564431.0, + "step": 25956 + }, + { + "epoch": 2.850538106742807, + "grad_norm": 1.9507442712783813, + "learning_rate": 5e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7443027496337891, + "num_tokens": 671593264.0, + "step": 25957 + }, + { + "epoch": 2.8506479244454206, + "grad_norm": 1.9668596982955933, + "learning_rate": 5e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7698863744735718, + "num_tokens": 671618111.0, + "step": 25958 + }, + { + "epoch": 2.8507577421480343, + "grad_norm": 2.075730800628662, + "learning_rate": 5e-06, + "loss": 0.7647, + "mean_token_accuracy": 0.7569507360458374, + "num_tokens": 671645828.0, + "step": 25959 + }, + { + "epoch": 2.8508675598506477, + "grad_norm": 2.374093532562256, + "learning_rate": 5e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7653840780258179, + "num_tokens": 671667053.0, + "step": 25960 + }, + { + "epoch": 2.850977377553262, + "grad_norm": 1.8878458738327026, + "learning_rate": 5e-06, + "loss": 0.6445, + "mean_token_accuracy": 0.786708652973175, + "num_tokens": 671694726.0, + "step": 25961 + }, + { + "epoch": 2.851087195255875, + "grad_norm": 1.748170018196106, + "learning_rate": 5e-06, + "loss": 0.7936, + "mean_token_accuracy": 0.7471585273742676, + "num_tokens": 671727454.0, + "step": 25962 + }, + { + "epoch": 2.851197012958489, + "grad_norm": 2.169623374938965, + "learning_rate": 5e-06, + "loss": 0.6537, + "mean_token_accuracy": 0.778497576713562, + "num_tokens": 671750808.0, + "step": 25963 + }, + { + "epoch": 2.8513068306611027, + "grad_norm": 1.923883080482483, + "learning_rate": 5e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7339180707931519, + "num_tokens": 671782000.0, + "step": 25964 + }, + { + "epoch": 2.851416648363716, + "grad_norm": 2.2185285091400146, + "learning_rate": 5e-06, + "loss": 0.6936, + "mean_token_accuracy": 0.7753758430480957, + "num_tokens": 671805549.0, + "step": 25965 + }, + { + "epoch": 2.8515264660663298, + "grad_norm": 2.1869964599609375, + "learning_rate": 5e-06, + "loss": 0.7081, + "mean_token_accuracy": 0.7701966762542725, + "num_tokens": 671829926.0, + "step": 25966 + }, + { + "epoch": 2.8516362837689435, + "grad_norm": 2.0918567180633545, + "learning_rate": 5e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7740189433097839, + "num_tokens": 671857445.0, + "step": 25967 + }, + { + "epoch": 2.8517461014715573, + "grad_norm": 2.009070634841919, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7535313963890076, + "num_tokens": 671890181.0, + "step": 25968 + }, + { + "epoch": 2.851855919174171, + "grad_norm": 2.188413619995117, + "learning_rate": 5e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7739177346229553, + "num_tokens": 671913115.0, + "step": 25969 + }, + { + "epoch": 2.8519657368767843, + "grad_norm": 1.896823763847351, + "learning_rate": 5e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7445389032363892, + "num_tokens": 671942952.0, + "step": 25970 + }, + { + "epoch": 2.852075554579398, + "grad_norm": 2.0941696166992188, + "learning_rate": 5e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7711998224258423, + "num_tokens": 671968435.0, + "step": 25971 + }, + { + "epoch": 2.852185372282012, + "grad_norm": 2.2179508209228516, + "learning_rate": 5e-06, + "loss": 0.7858, + "mean_token_accuracy": 0.7465096116065979, + "num_tokens": 671994161.0, + "step": 25972 + }, + { + "epoch": 2.8522951899846256, + "grad_norm": 2.085956335067749, + "learning_rate": 5e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7676262855529785, + "num_tokens": 672019848.0, + "step": 25973 + }, + { + "epoch": 2.8524050076872394, + "grad_norm": 2.3019418716430664, + "learning_rate": 5e-06, + "loss": 0.6242, + "mean_token_accuracy": 0.7939913868904114, + "num_tokens": 672040920.0, + "step": 25974 + }, + { + "epoch": 2.8525148253898527, + "grad_norm": 2.056525230407715, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7549524903297424, + "num_tokens": 672066020.0, + "step": 25975 + }, + { + "epoch": 2.8526246430924664, + "grad_norm": 1.8838380575180054, + "learning_rate": 5e-06, + "loss": 0.6938, + "mean_token_accuracy": 0.7750133872032166, + "num_tokens": 672094727.0, + "step": 25976 + }, + { + "epoch": 2.85273446079508, + "grad_norm": 2.389214515686035, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7729931473731995, + "num_tokens": 672115706.0, + "step": 25977 + }, + { + "epoch": 2.852844278497694, + "grad_norm": 1.9364471435546875, + "learning_rate": 5e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7617923021316528, + "num_tokens": 672144344.0, + "step": 25978 + }, + { + "epoch": 2.8529540962003077, + "grad_norm": 2.0634877681732178, + "learning_rate": 5e-06, + "loss": 0.6853, + "mean_token_accuracy": 0.7703315019607544, + "num_tokens": 672168025.0, + "step": 25979 + }, + { + "epoch": 2.853063913902921, + "grad_norm": 2.0825650691986084, + "learning_rate": 5e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7414652109146118, + "num_tokens": 672195181.0, + "step": 25980 + }, + { + "epoch": 2.853173731605535, + "grad_norm": 2.0551743507385254, + "learning_rate": 5e-06, + "loss": 0.6797, + "mean_token_accuracy": 0.7761335372924805, + "num_tokens": 672219107.0, + "step": 25981 + }, + { + "epoch": 2.8532835493081485, + "grad_norm": 2.3023953437805176, + "learning_rate": 5e-06, + "loss": 0.7434, + "mean_token_accuracy": 0.7623344659805298, + "num_tokens": 672241646.0, + "step": 25982 + }, + { + "epoch": 2.8533933670107623, + "grad_norm": 2.2322490215301514, + "learning_rate": 5e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7615573406219482, + "num_tokens": 672265484.0, + "step": 25983 + }, + { + "epoch": 2.853503184713376, + "grad_norm": 2.1079115867614746, + "learning_rate": 5e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7384131550788879, + "num_tokens": 672292417.0, + "step": 25984 + }, + { + "epoch": 2.8536130024159894, + "grad_norm": 2.0456466674804688, + "learning_rate": 5e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7328958511352539, + "num_tokens": 672320392.0, + "step": 25985 + }, + { + "epoch": 2.853722820118603, + "grad_norm": 2.1606104373931885, + "learning_rate": 5e-06, + "loss": 0.6363, + "mean_token_accuracy": 0.7849021553993225, + "num_tokens": 672343481.0, + "step": 25986 + }, + { + "epoch": 2.853832637821217, + "grad_norm": 1.7772881984710693, + "learning_rate": 5e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7669077515602112, + "num_tokens": 672376140.0, + "step": 25987 + }, + { + "epoch": 2.85394245552383, + "grad_norm": 2.0310521125793457, + "learning_rate": 5e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7638598084449768, + "num_tokens": 672401723.0, + "step": 25988 + }, + { + "epoch": 2.854052273226444, + "grad_norm": 1.9892829656600952, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7536038160324097, + "num_tokens": 672430486.0, + "step": 25989 + }, + { + "epoch": 2.8541620909290577, + "grad_norm": 2.132795572280884, + "learning_rate": 5e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7513340711593628, + "num_tokens": 672456596.0, + "step": 25990 + }, + { + "epoch": 2.8542719086316715, + "grad_norm": 2.021967649459839, + "learning_rate": 5e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7661920785903931, + "num_tokens": 672481210.0, + "step": 25991 + }, + { + "epoch": 2.854381726334285, + "grad_norm": 2.078005313873291, + "learning_rate": 5e-06, + "loss": 0.6901, + "mean_token_accuracy": 0.7728066444396973, + "num_tokens": 672504760.0, + "step": 25992 + }, + { + "epoch": 2.8544915440368985, + "grad_norm": 2.244215965270996, + "learning_rate": 5e-06, + "loss": 0.6044, + "mean_token_accuracy": 0.7950007915496826, + "num_tokens": 672525509.0, + "step": 25993 + }, + { + "epoch": 2.8546013617395123, + "grad_norm": 2.008643627166748, + "learning_rate": 5e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7698980569839478, + "num_tokens": 672552715.0, + "step": 25994 + }, + { + "epoch": 2.854711179442126, + "grad_norm": 2.1549060344696045, + "learning_rate": 5e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7579469680786133, + "num_tokens": 672578841.0, + "step": 25995 + }, + { + "epoch": 2.85482099714474, + "grad_norm": 2.0611202716827393, + "learning_rate": 5e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7641666531562805, + "num_tokens": 672606038.0, + "step": 25996 + }, + { + "epoch": 2.8549308148473536, + "grad_norm": 2.179265260696411, + "learning_rate": 5e-06, + "loss": 0.702, + "mean_token_accuracy": 0.7625106573104858, + "num_tokens": 672628389.0, + "step": 25997 + }, + { + "epoch": 2.855040632549967, + "grad_norm": 2.0599491596221924, + "learning_rate": 5e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7672918438911438, + "num_tokens": 672653501.0, + "step": 25998 + }, + { + "epoch": 2.8551504502525806, + "grad_norm": 2.2522990703582764, + "learning_rate": 5e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.756599485874176, + "num_tokens": 672675910.0, + "step": 25999 + }, + { + "epoch": 2.8552602679551944, + "grad_norm": 2.3446075916290283, + "learning_rate": 5e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7532709836959839, + "num_tokens": 672698753.0, + "step": 26000 + }, + { + "epoch": 2.855370085657808, + "grad_norm": 1.9127931594848633, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7563674449920654, + "num_tokens": 672729106.0, + "step": 26001 + }, + { + "epoch": 2.855479903360422, + "grad_norm": 2.049487590789795, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7746114730834961, + "num_tokens": 672755587.0, + "step": 26002 + }, + { + "epoch": 2.855589721063035, + "grad_norm": 2.055996894836426, + "learning_rate": 5e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7548443675041199, + "num_tokens": 672782954.0, + "step": 26003 + }, + { + "epoch": 2.855699538765649, + "grad_norm": 1.9494459629058838, + "learning_rate": 5e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7438246607780457, + "num_tokens": 672812474.0, + "step": 26004 + }, + { + "epoch": 2.8558093564682627, + "grad_norm": 2.1357767581939697, + "learning_rate": 5e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7707229852676392, + "num_tokens": 672837683.0, + "step": 26005 + }, + { + "epoch": 2.8559191741708765, + "grad_norm": 2.0889179706573486, + "learning_rate": 5e-06, + "loss": 0.7155, + "mean_token_accuracy": 0.7682188749313354, + "num_tokens": 672861769.0, + "step": 26006 + }, + { + "epoch": 2.8560289918734902, + "grad_norm": 2.174592971801758, + "learning_rate": 5e-06, + "loss": 0.6736, + "mean_token_accuracy": 0.7728268504142761, + "num_tokens": 672883339.0, + "step": 26007 + }, + { + "epoch": 2.8561388095761036, + "grad_norm": 2.068939447402954, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7530040740966797, + "num_tokens": 672909308.0, + "step": 26008 + }, + { + "epoch": 2.8562486272787173, + "grad_norm": 2.151724338531494, + "learning_rate": 5e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.7928013801574707, + "num_tokens": 672933635.0, + "step": 26009 + }, + { + "epoch": 2.856358444981331, + "grad_norm": 1.9670907258987427, + "learning_rate": 5e-06, + "loss": 0.7204, + "mean_token_accuracy": 0.7612044811248779, + "num_tokens": 672961183.0, + "step": 26010 + }, + { + "epoch": 2.8564682626839444, + "grad_norm": 2.1392781734466553, + "learning_rate": 5e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7437476515769958, + "num_tokens": 672985472.0, + "step": 26011 + }, + { + "epoch": 2.8565780803865586, + "grad_norm": 2.1901168823242188, + "learning_rate": 5e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7615871429443359, + "num_tokens": 673008639.0, + "step": 26012 + }, + { + "epoch": 2.856687898089172, + "grad_norm": 2.1253626346588135, + "learning_rate": 5e-06, + "loss": 0.618, + "mean_token_accuracy": 0.7927320599555969, + "num_tokens": 673031317.0, + "step": 26013 + }, + { + "epoch": 2.8567977157917857, + "grad_norm": 2.057586669921875, + "learning_rate": 5e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7502961158752441, + "num_tokens": 673057218.0, + "step": 26014 + }, + { + "epoch": 2.8569075334943994, + "grad_norm": 1.88589346408844, + "learning_rate": 5e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.7730199098587036, + "num_tokens": 673085535.0, + "step": 26015 + }, + { + "epoch": 2.8570173511970127, + "grad_norm": 2.027893543243408, + "learning_rate": 5e-06, + "loss": 0.6813, + "mean_token_accuracy": 0.7732429504394531, + "num_tokens": 673109610.0, + "step": 26016 + }, + { + "epoch": 2.8571271688996265, + "grad_norm": 1.9906660318374634, + "learning_rate": 5e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.753567636013031, + "num_tokens": 673139530.0, + "step": 26017 + }, + { + "epoch": 2.8572369866022402, + "grad_norm": 2.2119178771972656, + "learning_rate": 5e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7890973687171936, + "num_tokens": 673162888.0, + "step": 26018 + }, + { + "epoch": 2.857346804304854, + "grad_norm": 1.956533670425415, + "learning_rate": 5e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7435052394866943, + "num_tokens": 673189575.0, + "step": 26019 + }, + { + "epoch": 2.8574566220074678, + "grad_norm": 2.095174551010132, + "learning_rate": 5e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7664339542388916, + "num_tokens": 673214973.0, + "step": 26020 + }, + { + "epoch": 2.857566439710081, + "grad_norm": 2.0403220653533936, + "learning_rate": 5e-06, + "loss": 0.6692, + "mean_token_accuracy": 0.7830541133880615, + "num_tokens": 673239468.0, + "step": 26021 + }, + { + "epoch": 2.857676257412695, + "grad_norm": 2.3447906970977783, + "learning_rate": 5e-06, + "loss": 0.5981, + "mean_token_accuracy": 0.7914015650749207, + "num_tokens": 673258776.0, + "step": 26022 + }, + { + "epoch": 2.8577860751153086, + "grad_norm": 1.9695873260498047, + "learning_rate": 5e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7672194838523865, + "num_tokens": 673285006.0, + "step": 26023 + }, + { + "epoch": 2.8578958928179223, + "grad_norm": 2.1307411193847656, + "learning_rate": 5e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.74460768699646, + "num_tokens": 673309935.0, + "step": 26024 + }, + { + "epoch": 2.858005710520536, + "grad_norm": 1.962979793548584, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7607656717300415, + "num_tokens": 673341050.0, + "step": 26025 + }, + { + "epoch": 2.8581155282231494, + "grad_norm": 1.9845367670059204, + "learning_rate": 5e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7634063363075256, + "num_tokens": 673366340.0, + "step": 26026 + }, + { + "epoch": 2.858225345925763, + "grad_norm": 1.9736993312835693, + "learning_rate": 5e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7737208008766174, + "num_tokens": 673392079.0, + "step": 26027 + }, + { + "epoch": 2.858335163628377, + "grad_norm": 1.946277379989624, + "learning_rate": 5e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.7612711787223816, + "num_tokens": 673418577.0, + "step": 26028 + }, + { + "epoch": 2.8584449813309907, + "grad_norm": 2.0148632526397705, + "learning_rate": 5e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7493252754211426, + "num_tokens": 673447388.0, + "step": 26029 + }, + { + "epoch": 2.8585547990336044, + "grad_norm": 2.013643503189087, + "learning_rate": 5e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7555433511734009, + "num_tokens": 673475857.0, + "step": 26030 + }, + { + "epoch": 2.8586646167362177, + "grad_norm": 2.249135971069336, + "learning_rate": 5e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7603647112846375, + "num_tokens": 673499833.0, + "step": 26031 + }, + { + "epoch": 2.8587744344388315, + "grad_norm": 1.9985601902008057, + "learning_rate": 5e-06, + "loss": 0.762, + "mean_token_accuracy": 0.7530914545059204, + "num_tokens": 673524937.0, + "step": 26032 + }, + { + "epoch": 2.8588842521414453, + "grad_norm": 2.115807294845581, + "learning_rate": 5e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7491806745529175, + "num_tokens": 673551128.0, + "step": 26033 + }, + { + "epoch": 2.858994069844059, + "grad_norm": 2.143789529800415, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7663125991821289, + "num_tokens": 673575822.0, + "step": 26034 + }, + { + "epoch": 2.8591038875466728, + "grad_norm": 2.3196821212768555, + "learning_rate": 5e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.7741860151290894, + "num_tokens": 673597306.0, + "step": 26035 + }, + { + "epoch": 2.859213705249286, + "grad_norm": 1.9552466869354248, + "learning_rate": 5e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7716662287712097, + "num_tokens": 673623729.0, + "step": 26036 + }, + { + "epoch": 2.8593235229519, + "grad_norm": 2.115250825881958, + "learning_rate": 5e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7717652320861816, + "num_tokens": 673645694.0, + "step": 26037 + }, + { + "epoch": 2.8594333406545136, + "grad_norm": 1.9586492776870728, + "learning_rate": 5e-06, + "loss": 0.7632, + "mean_token_accuracy": 0.7580065727233887, + "num_tokens": 673673566.0, + "step": 26038 + }, + { + "epoch": 2.859543158357127, + "grad_norm": 2.3208518028259277, + "learning_rate": 5e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.7711785435676575, + "num_tokens": 673694677.0, + "step": 26039 + }, + { + "epoch": 2.8596529760597407, + "grad_norm": 1.9406847953796387, + "learning_rate": 5e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7628591656684875, + "num_tokens": 673722190.0, + "step": 26040 + }, + { + "epoch": 2.8597627937623544, + "grad_norm": 2.029149293899536, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7521662712097168, + "num_tokens": 673749343.0, + "step": 26041 + }, + { + "epoch": 2.859872611464968, + "grad_norm": 2.1280694007873535, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7696340680122375, + "num_tokens": 673772519.0, + "step": 26042 + }, + { + "epoch": 2.859982429167582, + "grad_norm": 2.0076024532318115, + "learning_rate": 5e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7635292410850525, + "num_tokens": 673799672.0, + "step": 26043 + }, + { + "epoch": 2.8600922468701953, + "grad_norm": 2.088709831237793, + "learning_rate": 5e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7724728584289551, + "num_tokens": 673825171.0, + "step": 26044 + }, + { + "epoch": 2.860202064572809, + "grad_norm": 2.195784091949463, + "learning_rate": 5e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7576519250869751, + "num_tokens": 673847651.0, + "step": 26045 + }, + { + "epoch": 2.8603118822754228, + "grad_norm": 2.044557809829712, + "learning_rate": 5e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7675668001174927, + "num_tokens": 673875204.0, + "step": 26046 + }, + { + "epoch": 2.8604216999780365, + "grad_norm": 2.1586227416992188, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.769419252872467, + "num_tokens": 673898193.0, + "step": 26047 + }, + { + "epoch": 2.8605315176806503, + "grad_norm": 2.0669853687286377, + "learning_rate": 5e-06, + "loss": 0.7185, + "mean_token_accuracy": 0.7745537161827087, + "num_tokens": 673924622.0, + "step": 26048 + }, + { + "epoch": 2.8606413353832636, + "grad_norm": 1.9522734880447388, + "learning_rate": 5e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7571578025817871, + "num_tokens": 673953421.0, + "step": 26049 + }, + { + "epoch": 2.8607511530858774, + "grad_norm": 1.9218146800994873, + "learning_rate": 5e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7465270757675171, + "num_tokens": 673983355.0, + "step": 26050 + }, + { + "epoch": 2.860860970788491, + "grad_norm": 2.0550971031188965, + "learning_rate": 5e-06, + "loss": 0.6555, + "mean_token_accuracy": 0.7849857211112976, + "num_tokens": 674007555.0, + "step": 26051 + }, + { + "epoch": 2.860970788491105, + "grad_norm": 2.0427072048187256, + "learning_rate": 5e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7383808493614197, + "num_tokens": 674035248.0, + "step": 26052 + }, + { + "epoch": 2.8610806061937186, + "grad_norm": 1.9896546602249146, + "learning_rate": 5e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7552361488342285, + "num_tokens": 674061875.0, + "step": 26053 + }, + { + "epoch": 2.861190423896332, + "grad_norm": 1.9906461238861084, + "learning_rate": 5e-06, + "loss": 0.7159, + "mean_token_accuracy": 0.7672553658485413, + "num_tokens": 674086488.0, + "step": 26054 + }, + { + "epoch": 2.8613002415989457, + "grad_norm": 2.1238982677459717, + "learning_rate": 5e-06, + "loss": 0.6323, + "mean_token_accuracy": 0.7852970361709595, + "num_tokens": 674108218.0, + "step": 26055 + }, + { + "epoch": 2.8614100593015594, + "grad_norm": 2.100435495376587, + "learning_rate": 5e-06, + "loss": 0.7724, + "mean_token_accuracy": 0.7480809688568115, + "num_tokens": 674134704.0, + "step": 26056 + }, + { + "epoch": 2.861519877004173, + "grad_norm": 2.0242435932159424, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7584819793701172, + "num_tokens": 674159514.0, + "step": 26057 + }, + { + "epoch": 2.861629694706787, + "grad_norm": 2.275418281555176, + "learning_rate": 5e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7634270191192627, + "num_tokens": 674182182.0, + "step": 26058 + }, + { + "epoch": 2.8617395124094003, + "grad_norm": 2.379814863204956, + "learning_rate": 5e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7768955230712891, + "num_tokens": 674202844.0, + "step": 26059 + }, + { + "epoch": 2.861849330112014, + "grad_norm": 1.9449362754821777, + "learning_rate": 5e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7464609742164612, + "num_tokens": 674234340.0, + "step": 26060 + }, + { + "epoch": 2.861959147814628, + "grad_norm": 2.02396559715271, + "learning_rate": 5e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7467074990272522, + "num_tokens": 674261387.0, + "step": 26061 + }, + { + "epoch": 2.862068965517241, + "grad_norm": 2.110577344894409, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7541422247886658, + "num_tokens": 674286204.0, + "step": 26062 + }, + { + "epoch": 2.8621787832198553, + "grad_norm": 1.900962471961975, + "learning_rate": 5e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7465255260467529, + "num_tokens": 674315879.0, + "step": 26063 + }, + { + "epoch": 2.8622886009224686, + "grad_norm": 2.0095913410186768, + "learning_rate": 5e-06, + "loss": 0.7275, + "mean_token_accuracy": 0.7632588148117065, + "num_tokens": 674343511.0, + "step": 26064 + }, + { + "epoch": 2.8623984186250824, + "grad_norm": 2.12526273727417, + "learning_rate": 5e-06, + "loss": 0.7536, + "mean_token_accuracy": 0.7507004737854004, + "num_tokens": 674368715.0, + "step": 26065 + }, + { + "epoch": 2.862508236327696, + "grad_norm": 2.063344717025757, + "learning_rate": 5e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.7364096641540527, + "num_tokens": 674395372.0, + "step": 26066 + }, + { + "epoch": 2.8626180540303094, + "grad_norm": 2.098377227783203, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.746444582939148, + "num_tokens": 674420478.0, + "step": 26067 + }, + { + "epoch": 2.862727871732923, + "grad_norm": 2.0420114994049072, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7654759883880615, + "num_tokens": 674445713.0, + "step": 26068 + }, + { + "epoch": 2.862837689435537, + "grad_norm": 1.9963328838348389, + "learning_rate": 5e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.751421332359314, + "num_tokens": 674472586.0, + "step": 26069 + }, + { + "epoch": 2.8629475071381507, + "grad_norm": 2.04052734375, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7676219940185547, + "num_tokens": 674498896.0, + "step": 26070 + }, + { + "epoch": 2.8630573248407645, + "grad_norm": 1.9776825904846191, + "learning_rate": 5e-06, + "loss": 0.7464, + "mean_token_accuracy": 0.7562151551246643, + "num_tokens": 674528524.0, + "step": 26071 + }, + { + "epoch": 2.863167142543378, + "grad_norm": 2.1137161254882812, + "learning_rate": 5e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7593263387680054, + "num_tokens": 674553374.0, + "step": 26072 + }, + { + "epoch": 2.8632769602459915, + "grad_norm": 1.9264931678771973, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7620886564254761, + "num_tokens": 674585362.0, + "step": 26073 + }, + { + "epoch": 2.8633867779486053, + "grad_norm": 1.9244903326034546, + "learning_rate": 5e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7529528141021729, + "num_tokens": 674613469.0, + "step": 26074 + }, + { + "epoch": 2.863496595651219, + "grad_norm": 2.1012065410614014, + "learning_rate": 5e-06, + "loss": 0.7067, + "mean_token_accuracy": 0.7641680240631104, + "num_tokens": 674639137.0, + "step": 26075 + }, + { + "epoch": 2.863606413353833, + "grad_norm": 1.9421758651733398, + "learning_rate": 5e-06, + "loss": 0.7091, + "mean_token_accuracy": 0.7639684081077576, + "num_tokens": 674667324.0, + "step": 26076 + }, + { + "epoch": 2.863716231056446, + "grad_norm": 2.111584424972534, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.765241801738739, + "num_tokens": 674693201.0, + "step": 26077 + }, + { + "epoch": 2.86382604875906, + "grad_norm": 1.9553089141845703, + "learning_rate": 5e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7729344367980957, + "num_tokens": 674720226.0, + "step": 26078 + }, + { + "epoch": 2.8639358664616736, + "grad_norm": 1.9842290878295898, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7530627250671387, + "num_tokens": 674747877.0, + "step": 26079 + }, + { + "epoch": 2.8640456841642874, + "grad_norm": 2.244253635406494, + "learning_rate": 5e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7547988891601562, + "num_tokens": 674773681.0, + "step": 26080 + }, + { + "epoch": 2.864155501866901, + "grad_norm": 1.8343936204910278, + "learning_rate": 5e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7388343214988708, + "num_tokens": 674806752.0, + "step": 26081 + }, + { + "epoch": 2.8642653195695145, + "grad_norm": 1.8803492784500122, + "learning_rate": 5e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7667078375816345, + "num_tokens": 674836474.0, + "step": 26082 + }, + { + "epoch": 2.8643751372721282, + "grad_norm": 2.351041316986084, + "learning_rate": 5e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7652852535247803, + "num_tokens": 674859676.0, + "step": 26083 + }, + { + "epoch": 2.864484954974742, + "grad_norm": 1.8706605434417725, + "learning_rate": 5e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.7703038454055786, + "num_tokens": 674889638.0, + "step": 26084 + }, + { + "epoch": 2.8645947726773557, + "grad_norm": 2.30061411857605, + "learning_rate": 5e-06, + "loss": 0.6479, + "mean_token_accuracy": 0.786338210105896, + "num_tokens": 674909978.0, + "step": 26085 + }, + { + "epoch": 2.8647045903799695, + "grad_norm": 2.145359754562378, + "learning_rate": 5e-06, + "loss": 0.6619, + "mean_token_accuracy": 0.7782189846038818, + "num_tokens": 674932909.0, + "step": 26086 + }, + { + "epoch": 2.864814408082583, + "grad_norm": 1.9257680177688599, + "learning_rate": 5e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7616461515426636, + "num_tokens": 674959814.0, + "step": 26087 + }, + { + "epoch": 2.8649242257851966, + "grad_norm": 1.7848834991455078, + "learning_rate": 5e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7472327351570129, + "num_tokens": 674993393.0, + "step": 26088 + }, + { + "epoch": 2.8650340434878103, + "grad_norm": 2.0424163341522217, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7651072144508362, + "num_tokens": 675018084.0, + "step": 26089 + }, + { + "epoch": 2.8651438611904236, + "grad_norm": 2.0195107460021973, + "learning_rate": 5e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7576913833618164, + "num_tokens": 675045157.0, + "step": 26090 + }, + { + "epoch": 2.8652536788930374, + "grad_norm": 2.04455828666687, + "learning_rate": 5e-06, + "loss": 0.7702, + "mean_token_accuracy": 0.7499607801437378, + "num_tokens": 675073815.0, + "step": 26091 + }, + { + "epoch": 2.865363496595651, + "grad_norm": 1.9700454473495483, + "learning_rate": 5e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7725654244422913, + "num_tokens": 675102480.0, + "step": 26092 + }, + { + "epoch": 2.865473314298265, + "grad_norm": 2.0643458366394043, + "learning_rate": 5e-06, + "loss": 0.7427, + "mean_token_accuracy": 0.7615799903869629, + "num_tokens": 675128188.0, + "step": 26093 + }, + { + "epoch": 2.8655831320008787, + "grad_norm": 1.9351779222488403, + "learning_rate": 5e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.743748664855957, + "num_tokens": 675159608.0, + "step": 26094 + }, + { + "epoch": 2.865692949703492, + "grad_norm": 2.138655424118042, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7488897442817688, + "num_tokens": 675184349.0, + "step": 26095 + }, + { + "epoch": 2.8658027674061057, + "grad_norm": 2.397592306137085, + "learning_rate": 5e-06, + "loss": 0.6213, + "mean_token_accuracy": 0.7853847742080688, + "num_tokens": 675203434.0, + "step": 26096 + }, + { + "epoch": 2.8659125851087195, + "grad_norm": 1.904464840888977, + "learning_rate": 5e-06, + "loss": 0.7845, + "mean_token_accuracy": 0.7436189651489258, + "num_tokens": 675231505.0, + "step": 26097 + }, + { + "epoch": 2.8660224028113332, + "grad_norm": 1.8221627473831177, + "learning_rate": 5e-06, + "loss": 0.7713, + "mean_token_accuracy": 0.7472493648529053, + "num_tokens": 675262318.0, + "step": 26098 + }, + { + "epoch": 2.866132220513947, + "grad_norm": 2.2922425270080566, + "learning_rate": 5e-06, + "loss": 0.6357, + "mean_token_accuracy": 0.7866674065589905, + "num_tokens": 675282857.0, + "step": 26099 + }, + { + "epoch": 2.8662420382165603, + "grad_norm": 2.0434117317199707, + "learning_rate": 5e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7283879518508911, + "num_tokens": 675311392.0, + "step": 26100 + }, + { + "epoch": 2.866351855919174, + "grad_norm": 2.2489633560180664, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7494058012962341, + "num_tokens": 675334443.0, + "step": 26101 + }, + { + "epoch": 2.866461673621788, + "grad_norm": 2.302499294281006, + "learning_rate": 5e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7466189861297607, + "num_tokens": 675357747.0, + "step": 26102 + }, + { + "epoch": 2.8665714913244016, + "grad_norm": 2.168999433517456, + "learning_rate": 5e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7581762075424194, + "num_tokens": 675380859.0, + "step": 26103 + }, + { + "epoch": 2.8666813090270153, + "grad_norm": 2.1154394149780273, + "learning_rate": 5e-06, + "loss": 0.6644, + "mean_token_accuracy": 0.7754483222961426, + "num_tokens": 675404382.0, + "step": 26104 + }, + { + "epoch": 2.8667911267296287, + "grad_norm": 1.9834991693496704, + "learning_rate": 5e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7498438358306885, + "num_tokens": 675435166.0, + "step": 26105 + }, + { + "epoch": 2.8669009444322424, + "grad_norm": 1.973851203918457, + "learning_rate": 5e-06, + "loss": 0.7351, + "mean_token_accuracy": 0.7620401382446289, + "num_tokens": 675461436.0, + "step": 26106 + }, + { + "epoch": 2.867010762134856, + "grad_norm": 2.1025729179382324, + "learning_rate": 5e-06, + "loss": 0.6837, + "mean_token_accuracy": 0.7670159339904785, + "num_tokens": 675486014.0, + "step": 26107 + }, + { + "epoch": 2.86712057983747, + "grad_norm": 2.116548538208008, + "learning_rate": 5e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7510982751846313, + "num_tokens": 675510825.0, + "step": 26108 + }, + { + "epoch": 2.8672303975400837, + "grad_norm": 2.0035641193389893, + "learning_rate": 5e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7588911652565002, + "num_tokens": 675538391.0, + "step": 26109 + }, + { + "epoch": 2.867340215242697, + "grad_norm": 2.161823034286499, + "learning_rate": 5e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7764240503311157, + "num_tokens": 675562169.0, + "step": 26110 + }, + { + "epoch": 2.8674500329453108, + "grad_norm": 2.069075584411621, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7533159852027893, + "num_tokens": 675586815.0, + "step": 26111 + }, + { + "epoch": 2.8675598506479245, + "grad_norm": 2.0207412242889404, + "learning_rate": 5e-06, + "loss": 0.7559, + "mean_token_accuracy": 0.7542959451675415, + "num_tokens": 675613593.0, + "step": 26112 + }, + { + "epoch": 2.8676696683505383, + "grad_norm": 1.784067988395691, + "learning_rate": 5e-06, + "loss": 0.7765, + "mean_token_accuracy": 0.7544842958450317, + "num_tokens": 675649212.0, + "step": 26113 + }, + { + "epoch": 2.867779486053152, + "grad_norm": 1.8558409214019775, + "learning_rate": 5e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7451812624931335, + "num_tokens": 675682082.0, + "step": 26114 + }, + { + "epoch": 2.8678893037557653, + "grad_norm": 2.076343297958374, + "learning_rate": 5e-06, + "loss": 0.7481, + "mean_token_accuracy": 0.7564265727996826, + "num_tokens": 675709342.0, + "step": 26115 + }, + { + "epoch": 2.867999121458379, + "grad_norm": 1.9624934196472168, + "learning_rate": 5e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7504557967185974, + "num_tokens": 675740201.0, + "step": 26116 + }, + { + "epoch": 2.868108939160993, + "grad_norm": 2.129321575164795, + "learning_rate": 5e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.754060685634613, + "num_tokens": 675766353.0, + "step": 26117 + }, + { + "epoch": 2.868218756863606, + "grad_norm": 2.1643927097320557, + "learning_rate": 5e-06, + "loss": 0.676, + "mean_token_accuracy": 0.776055097579956, + "num_tokens": 675789819.0, + "step": 26118 + }, + { + "epoch": 2.86832857456622, + "grad_norm": 2.0221128463745117, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7559084892272949, + "num_tokens": 675817294.0, + "step": 26119 + }, + { + "epoch": 2.8684383922688337, + "grad_norm": 2.0402824878692627, + "learning_rate": 5e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7417649030685425, + "num_tokens": 675842327.0, + "step": 26120 + }, + { + "epoch": 2.8685482099714474, + "grad_norm": 2.163830518722534, + "learning_rate": 5e-06, + "loss": 0.7075, + "mean_token_accuracy": 0.7657707929611206, + "num_tokens": 675868029.0, + "step": 26121 + }, + { + "epoch": 2.868658027674061, + "grad_norm": 2.2925305366516113, + "learning_rate": 5e-06, + "loss": 0.6329, + "mean_token_accuracy": 0.7881940603256226, + "num_tokens": 675888224.0, + "step": 26122 + }, + { + "epoch": 2.8687678453766745, + "grad_norm": 2.0030558109283447, + "learning_rate": 5e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7404866218566895, + "num_tokens": 675914937.0, + "step": 26123 + }, + { + "epoch": 2.8688776630792883, + "grad_norm": 2.099388599395752, + "learning_rate": 5e-06, + "loss": 0.6751, + "mean_token_accuracy": 0.7726166248321533, + "num_tokens": 675937509.0, + "step": 26124 + }, + { + "epoch": 2.868987480781902, + "grad_norm": 2.1172752380371094, + "learning_rate": 5e-06, + "loss": 0.6211, + "mean_token_accuracy": 0.7874879837036133, + "num_tokens": 675960677.0, + "step": 26125 + }, + { + "epoch": 2.869097298484516, + "grad_norm": 1.7776819467544556, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7557730674743652, + "num_tokens": 675995512.0, + "step": 26126 + }, + { + "epoch": 2.8692071161871295, + "grad_norm": 1.9118269681930542, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7672197818756104, + "num_tokens": 676023181.0, + "step": 26127 + }, + { + "epoch": 2.869316933889743, + "grad_norm": 1.9383325576782227, + "learning_rate": 5e-06, + "loss": 0.6887, + "mean_token_accuracy": 0.7731286287307739, + "num_tokens": 676050964.0, + "step": 26128 + }, + { + "epoch": 2.8694267515923566, + "grad_norm": 2.28102707862854, + "learning_rate": 5e-06, + "loss": 0.6255, + "mean_token_accuracy": 0.7850797176361084, + "num_tokens": 676071389.0, + "step": 26129 + }, + { + "epoch": 2.8695365692949704, + "grad_norm": 2.384385108947754, + "learning_rate": 5e-06, + "loss": 0.6741, + "mean_token_accuracy": 0.7790285348892212, + "num_tokens": 676093037.0, + "step": 26130 + }, + { + "epoch": 2.869646386997584, + "grad_norm": 2.145998001098633, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7500951290130615, + "num_tokens": 676123187.0, + "step": 26131 + }, + { + "epoch": 2.869756204700198, + "grad_norm": 2.1799867153167725, + "learning_rate": 5e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.7872398495674133, + "num_tokens": 676145205.0, + "step": 26132 + }, + { + "epoch": 2.869866022402811, + "grad_norm": 2.109907865524292, + "learning_rate": 5e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7690860033035278, + "num_tokens": 676168235.0, + "step": 26133 + }, + { + "epoch": 2.869975840105425, + "grad_norm": 1.992951512336731, + "learning_rate": 5e-06, + "loss": 0.71, + "mean_token_accuracy": 0.7698918581008911, + "num_tokens": 676194192.0, + "step": 26134 + }, + { + "epoch": 2.8700856578080387, + "grad_norm": 2.213258743286133, + "learning_rate": 5e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7482209801673889, + "num_tokens": 676216056.0, + "step": 26135 + }, + { + "epoch": 2.8701954755106525, + "grad_norm": 1.9902561902999878, + "learning_rate": 5e-06, + "loss": 0.6143, + "mean_token_accuracy": 0.7925021648406982, + "num_tokens": 676242489.0, + "step": 26136 + }, + { + "epoch": 2.870305293213266, + "grad_norm": 2.148439884185791, + "learning_rate": 5e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7641153931617737, + "num_tokens": 676268683.0, + "step": 26137 + }, + { + "epoch": 2.8704151109158795, + "grad_norm": 1.903754472732544, + "learning_rate": 5e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7622660994529724, + "num_tokens": 676297758.0, + "step": 26138 + }, + { + "epoch": 2.8705249286184933, + "grad_norm": 2.1321115493774414, + "learning_rate": 5e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7332672476768494, + "num_tokens": 676323800.0, + "step": 26139 + }, + { + "epoch": 2.870634746321107, + "grad_norm": 2.3963239192962646, + "learning_rate": 5e-06, + "loss": 0.6653, + "mean_token_accuracy": 0.773841142654419, + "num_tokens": 676343296.0, + "step": 26140 + }, + { + "epoch": 2.8707445640237204, + "grad_norm": 2.5039446353912354, + "learning_rate": 5e-06, + "loss": 0.703, + "mean_token_accuracy": 0.7637948989868164, + "num_tokens": 676362820.0, + "step": 26141 + }, + { + "epoch": 2.8708543817263346, + "grad_norm": 1.8536778688430786, + "learning_rate": 5e-06, + "loss": 0.677, + "mean_token_accuracy": 0.7795383930206299, + "num_tokens": 676393096.0, + "step": 26142 + }, + { + "epoch": 2.870964199428948, + "grad_norm": 2.1710879802703857, + "learning_rate": 5e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7644418478012085, + "num_tokens": 676414966.0, + "step": 26143 + }, + { + "epoch": 2.8710740171315616, + "grad_norm": 1.8184398412704468, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.736785888671875, + "num_tokens": 676446421.0, + "step": 26144 + }, + { + "epoch": 2.8711838348341754, + "grad_norm": 2.0401813983917236, + "learning_rate": 5e-06, + "loss": 0.7338, + "mean_token_accuracy": 0.7566377520561218, + "num_tokens": 676471769.0, + "step": 26145 + }, + { + "epoch": 2.8712936525367887, + "grad_norm": 1.8951164484024048, + "learning_rate": 5e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7646664977073669, + "num_tokens": 676502145.0, + "step": 26146 + }, + { + "epoch": 2.8714034702394025, + "grad_norm": 2.157248020172119, + "learning_rate": 5e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.7692098021507263, + "num_tokens": 676526988.0, + "step": 26147 + }, + { + "epoch": 2.871513287942016, + "grad_norm": 2.023129940032959, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7537053823471069, + "num_tokens": 676553112.0, + "step": 26148 + }, + { + "epoch": 2.87162310564463, + "grad_norm": 2.067951202392578, + "learning_rate": 5e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7590180039405823, + "num_tokens": 676581020.0, + "step": 26149 + }, + { + "epoch": 2.8717329233472437, + "grad_norm": 2.1167843341827393, + "learning_rate": 5e-06, + "loss": 0.6545, + "mean_token_accuracy": 0.7819452285766602, + "num_tokens": 676603911.0, + "step": 26150 + }, + { + "epoch": 2.871842741049857, + "grad_norm": 1.9446828365325928, + "learning_rate": 5e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7510198950767517, + "num_tokens": 676632134.0, + "step": 26151 + }, + { + "epoch": 2.871952558752471, + "grad_norm": 1.9409565925598145, + "learning_rate": 5e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.764812707901001, + "num_tokens": 676660889.0, + "step": 26152 + }, + { + "epoch": 2.8720623764550846, + "grad_norm": 2.132979393005371, + "learning_rate": 5e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.745975136756897, + "num_tokens": 676687249.0, + "step": 26153 + }, + { + "epoch": 2.8721721941576983, + "grad_norm": 2.25474214553833, + "learning_rate": 5e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7819008827209473, + "num_tokens": 676709724.0, + "step": 26154 + }, + { + "epoch": 2.872282011860312, + "grad_norm": 2.0234227180480957, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7460904121398926, + "num_tokens": 676736593.0, + "step": 26155 + }, + { + "epoch": 2.8723918295629254, + "grad_norm": 1.9453946352005005, + "learning_rate": 5e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7406333684921265, + "num_tokens": 676764862.0, + "step": 26156 + }, + { + "epoch": 2.872501647265539, + "grad_norm": 2.1471898555755615, + "learning_rate": 5e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7668197751045227, + "num_tokens": 676787827.0, + "step": 26157 + }, + { + "epoch": 2.872611464968153, + "grad_norm": 2.059910535812378, + "learning_rate": 5e-06, + "loss": 0.6998, + "mean_token_accuracy": 0.7700525522232056, + "num_tokens": 676812303.0, + "step": 26158 + }, + { + "epoch": 2.8727212826707667, + "grad_norm": 2.1060726642608643, + "learning_rate": 5e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7725272178649902, + "num_tokens": 676836018.0, + "step": 26159 + }, + { + "epoch": 2.8728311003733804, + "grad_norm": 1.9655261039733887, + "learning_rate": 5e-06, + "loss": 0.6833, + "mean_token_accuracy": 0.7751792669296265, + "num_tokens": 676863520.0, + "step": 26160 + }, + { + "epoch": 2.8729409180759937, + "grad_norm": 2.162461280822754, + "learning_rate": 5e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.7790996432304382, + "num_tokens": 676885911.0, + "step": 26161 + }, + { + "epoch": 2.8730507357786075, + "grad_norm": 1.8465983867645264, + "learning_rate": 5e-06, + "loss": 0.7905, + "mean_token_accuracy": 0.7487533688545227, + "num_tokens": 676918337.0, + "step": 26162 + }, + { + "epoch": 2.8731605534812212, + "grad_norm": 1.9828547239303589, + "learning_rate": 5e-06, + "loss": 0.7504, + "mean_token_accuracy": 0.755986213684082, + "num_tokens": 676944525.0, + "step": 26163 + }, + { + "epoch": 2.873270371183835, + "grad_norm": 1.9489930868148804, + "learning_rate": 5e-06, + "loss": 0.6923, + "mean_token_accuracy": 0.7744564414024353, + "num_tokens": 676970084.0, + "step": 26164 + }, + { + "epoch": 2.8733801888864487, + "grad_norm": 1.9326064586639404, + "learning_rate": 5e-06, + "loss": 0.6433, + "mean_token_accuracy": 0.7845282554626465, + "num_tokens": 676996283.0, + "step": 26165 + }, + { + "epoch": 2.873490006589062, + "grad_norm": 2.137880563735962, + "learning_rate": 5e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7561771869659424, + "num_tokens": 677019589.0, + "step": 26166 + }, + { + "epoch": 2.873599824291676, + "grad_norm": 2.0180671215057373, + "learning_rate": 5e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.7763673067092896, + "num_tokens": 677046754.0, + "step": 26167 + }, + { + "epoch": 2.8737096419942896, + "grad_norm": 2.0996885299682617, + "learning_rate": 5e-06, + "loss": 0.7038, + "mean_token_accuracy": 0.7626966238021851, + "num_tokens": 677071375.0, + "step": 26168 + }, + { + "epoch": 2.873819459696903, + "grad_norm": 2.119553804397583, + "learning_rate": 5e-06, + "loss": 0.7125, + "mean_token_accuracy": 0.7748904228210449, + "num_tokens": 677096303.0, + "step": 26169 + }, + { + "epoch": 2.8739292773995166, + "grad_norm": 2.4304893016815186, + "learning_rate": 5e-06, + "loss": 0.7344, + "mean_token_accuracy": 0.7590582370758057, + "num_tokens": 677118487.0, + "step": 26170 + }, + { + "epoch": 2.8740390951021304, + "grad_norm": 2.1272683143615723, + "learning_rate": 5e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7538517117500305, + "num_tokens": 677144680.0, + "step": 26171 + }, + { + "epoch": 2.874148912804744, + "grad_norm": 1.9330122470855713, + "learning_rate": 5e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7549692392349243, + "num_tokens": 677172557.0, + "step": 26172 + }, + { + "epoch": 2.874258730507358, + "grad_norm": 2.2019455432891846, + "learning_rate": 5e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7620776891708374, + "num_tokens": 677195134.0, + "step": 26173 + }, + { + "epoch": 2.8743685482099712, + "grad_norm": 2.0943827629089355, + "learning_rate": 5e-06, + "loss": 0.6981, + "mean_token_accuracy": 0.7740275859832764, + "num_tokens": 677219777.0, + "step": 26174 + }, + { + "epoch": 2.874478365912585, + "grad_norm": 1.99211847782135, + "learning_rate": 5e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7527636289596558, + "num_tokens": 677248943.0, + "step": 26175 + }, + { + "epoch": 2.8745881836151987, + "grad_norm": 2.068883180618286, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7649339437484741, + "num_tokens": 677274789.0, + "step": 26176 + }, + { + "epoch": 2.8746980013178125, + "grad_norm": 1.9016687870025635, + "learning_rate": 5e-06, + "loss": 0.624, + "mean_token_accuracy": 0.7894123792648315, + "num_tokens": 677301525.0, + "step": 26177 + }, + { + "epoch": 2.8748078190204263, + "grad_norm": 2.1545186042785645, + "learning_rate": 5e-06, + "loss": 0.6804, + "mean_token_accuracy": 0.7787362337112427, + "num_tokens": 677323098.0, + "step": 26178 + }, + { + "epoch": 2.8749176367230396, + "grad_norm": 2.055485963821411, + "learning_rate": 5e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7687946557998657, + "num_tokens": 677348185.0, + "step": 26179 + }, + { + "epoch": 2.8750274544256533, + "grad_norm": 1.9529633522033691, + "learning_rate": 5e-06, + "loss": 0.7371, + "mean_token_accuracy": 0.7563693523406982, + "num_tokens": 677376524.0, + "step": 26180 + }, + { + "epoch": 2.875137272128267, + "grad_norm": 2.3383865356445312, + "learning_rate": 5e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7660847306251526, + "num_tokens": 677398577.0, + "step": 26181 + }, + { + "epoch": 2.875247089830881, + "grad_norm": 1.996350884437561, + "learning_rate": 5e-06, + "loss": 0.7009, + "mean_token_accuracy": 0.7670282125473022, + "num_tokens": 677426166.0, + "step": 26182 + }, + { + "epoch": 2.8753569075334946, + "grad_norm": 1.9734278917312622, + "learning_rate": 5e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.7736443877220154, + "num_tokens": 677451978.0, + "step": 26183 + }, + { + "epoch": 2.875466725236108, + "grad_norm": 1.8983948230743408, + "learning_rate": 5e-06, + "loss": 0.6798, + "mean_token_accuracy": 0.7814671993255615, + "num_tokens": 677476632.0, + "step": 26184 + }, + { + "epoch": 2.8755765429387217, + "grad_norm": 1.8715994358062744, + "learning_rate": 5e-06, + "loss": 0.7453, + "mean_token_accuracy": 0.7590077519416809, + "num_tokens": 677506392.0, + "step": 26185 + }, + { + "epoch": 2.8756863606413354, + "grad_norm": 2.2612898349761963, + "learning_rate": 5e-06, + "loss": 0.7053, + "mean_token_accuracy": 0.7662373185157776, + "num_tokens": 677530332.0, + "step": 26186 + }, + { + "epoch": 2.875796178343949, + "grad_norm": 2.305370569229126, + "learning_rate": 5e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7751047611236572, + "num_tokens": 677550647.0, + "step": 26187 + }, + { + "epoch": 2.875905996046563, + "grad_norm": 2.361416816711426, + "learning_rate": 5e-06, + "loss": 0.7324, + "mean_token_accuracy": 0.7606111764907837, + "num_tokens": 677572970.0, + "step": 26188 + }, + { + "epoch": 2.8760158137491763, + "grad_norm": 2.3054358959198, + "learning_rate": 5e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.77228182554245, + "num_tokens": 677593915.0, + "step": 26189 + }, + { + "epoch": 2.87612563145179, + "grad_norm": 1.9257614612579346, + "learning_rate": 5e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7446798086166382, + "num_tokens": 677624410.0, + "step": 26190 + }, + { + "epoch": 2.8762354491544038, + "grad_norm": 2.3310470581054688, + "learning_rate": 5e-06, + "loss": 0.6799, + "mean_token_accuracy": 0.7680415511131287, + "num_tokens": 677646130.0, + "step": 26191 + }, + { + "epoch": 2.876345266857017, + "grad_norm": 1.9369581937789917, + "learning_rate": 5e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.7479724287986755, + "num_tokens": 677677461.0, + "step": 26192 + }, + { + "epoch": 2.8764550845596313, + "grad_norm": 1.7737109661102295, + "learning_rate": 5e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7597231268882751, + "num_tokens": 677707878.0, + "step": 26193 + }, + { + "epoch": 2.8765649022622446, + "grad_norm": 1.906249761581421, + "learning_rate": 5e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7478935122489929, + "num_tokens": 677735962.0, + "step": 26194 + }, + { + "epoch": 2.8766747199648584, + "grad_norm": 2.022385835647583, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7659026384353638, + "num_tokens": 677763782.0, + "step": 26195 + }, + { + "epoch": 2.876784537667472, + "grad_norm": 1.9593777656555176, + "learning_rate": 5e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7679096460342407, + "num_tokens": 677791017.0, + "step": 26196 + }, + { + "epoch": 2.8768943553700854, + "grad_norm": 1.9503259658813477, + "learning_rate": 5e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.7594201564788818, + "num_tokens": 677819930.0, + "step": 26197 + }, + { + "epoch": 2.877004173072699, + "grad_norm": 1.9467027187347412, + "learning_rate": 5e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7508724331855774, + "num_tokens": 677848564.0, + "step": 26198 + }, + { + "epoch": 2.877113990775313, + "grad_norm": 1.8677445650100708, + "learning_rate": 5e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7484682202339172, + "num_tokens": 677877291.0, + "step": 26199 + }, + { + "epoch": 2.8772238084779267, + "grad_norm": 2.1725704669952393, + "learning_rate": 5e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.7799100875854492, + "num_tokens": 677899328.0, + "step": 26200 + }, + { + "epoch": 2.8773336261805404, + "grad_norm": 1.9241490364074707, + "learning_rate": 5e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7491061091423035, + "num_tokens": 677929258.0, + "step": 26201 + }, + { + "epoch": 2.8774434438831538, + "grad_norm": 1.9615885019302368, + "learning_rate": 5e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7632403373718262, + "num_tokens": 677953583.0, + "step": 26202 + }, + { + "epoch": 2.8775532615857675, + "grad_norm": 2.0421395301818848, + "learning_rate": 5e-06, + "loss": 0.717, + "mean_token_accuracy": 0.7662661075592041, + "num_tokens": 677979175.0, + "step": 26203 + }, + { + "epoch": 2.8776630792883813, + "grad_norm": 1.9044878482818604, + "learning_rate": 5e-06, + "loss": 0.7, + "mean_token_accuracy": 0.7677947282791138, + "num_tokens": 678007122.0, + "step": 26204 + }, + { + "epoch": 2.877772896990995, + "grad_norm": 2.1723763942718506, + "learning_rate": 5e-06, + "loss": 0.7072, + "mean_token_accuracy": 0.771353006362915, + "num_tokens": 678028935.0, + "step": 26205 + }, + { + "epoch": 2.877882714693609, + "grad_norm": 2.068523406982422, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7491300106048584, + "num_tokens": 678055017.0, + "step": 26206 + }, + { + "epoch": 2.877992532396222, + "grad_norm": 2.3077430725097656, + "learning_rate": 5e-06, + "loss": 0.7117, + "mean_token_accuracy": 0.7672528028488159, + "num_tokens": 678076020.0, + "step": 26207 + }, + { + "epoch": 2.878102350098836, + "grad_norm": 2.184687852859497, + "learning_rate": 5e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7641531229019165, + "num_tokens": 678098495.0, + "step": 26208 + }, + { + "epoch": 2.8782121678014496, + "grad_norm": 1.8919025659561157, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7694692611694336, + "num_tokens": 678129097.0, + "step": 26209 + }, + { + "epoch": 2.8783219855040634, + "grad_norm": 2.017449378967285, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.764011025428772, + "num_tokens": 678157819.0, + "step": 26210 + }, + { + "epoch": 2.878431803206677, + "grad_norm": 1.916374921798706, + "learning_rate": 5e-06, + "loss": 0.6838, + "mean_token_accuracy": 0.7795788645744324, + "num_tokens": 678186256.0, + "step": 26211 + }, + { + "epoch": 2.8785416209092904, + "grad_norm": 2.3100786209106445, + "learning_rate": 5e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.7598137259483337, + "num_tokens": 678207095.0, + "step": 26212 + }, + { + "epoch": 2.878651438611904, + "grad_norm": 1.948515772819519, + "learning_rate": 5e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7431012392044067, + "num_tokens": 678237557.0, + "step": 26213 + }, + { + "epoch": 2.878761256314518, + "grad_norm": 2.1221206188201904, + "learning_rate": 5e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7698041200637817, + "num_tokens": 678261125.0, + "step": 26214 + }, + { + "epoch": 2.8788710740171317, + "grad_norm": 1.956725835800171, + "learning_rate": 5e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7533246278762817, + "num_tokens": 678290949.0, + "step": 26215 + }, + { + "epoch": 2.8789808917197455, + "grad_norm": 1.9078984260559082, + "learning_rate": 5e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7763752937316895, + "num_tokens": 678319610.0, + "step": 26216 + }, + { + "epoch": 2.879090709422359, + "grad_norm": 2.1488585472106934, + "learning_rate": 5e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.7605109214782715, + "num_tokens": 678346136.0, + "step": 26217 + }, + { + "epoch": 2.8792005271249725, + "grad_norm": 2.0014069080352783, + "learning_rate": 5e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.761947512626648, + "num_tokens": 678375001.0, + "step": 26218 + }, + { + "epoch": 2.8793103448275863, + "grad_norm": 1.9234490394592285, + "learning_rate": 5e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7636390924453735, + "num_tokens": 678403249.0, + "step": 26219 + }, + { + "epoch": 2.8794201625301996, + "grad_norm": 1.819653868675232, + "learning_rate": 5e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7681423425674438, + "num_tokens": 678435317.0, + "step": 26220 + }, + { + "epoch": 2.8795299802328134, + "grad_norm": 2.179823637008667, + "learning_rate": 5e-06, + "loss": 0.7495, + "mean_token_accuracy": 0.7575970888137817, + "num_tokens": 678458809.0, + "step": 26221 + }, + { + "epoch": 2.879639797935427, + "grad_norm": 1.9632477760314941, + "learning_rate": 5e-06, + "loss": 0.7465, + "mean_token_accuracy": 0.756228506565094, + "num_tokens": 678487708.0, + "step": 26222 + }, + { + "epoch": 2.879749615638041, + "grad_norm": 1.6815378665924072, + "learning_rate": 5e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.7540308237075806, + "num_tokens": 678525869.0, + "step": 26223 + }, + { + "epoch": 2.8798594333406546, + "grad_norm": 2.055220365524292, + "learning_rate": 5e-06, + "loss": 0.7257, + "mean_token_accuracy": 0.767798662185669, + "num_tokens": 678551600.0, + "step": 26224 + }, + { + "epoch": 2.879969251043268, + "grad_norm": 2.1189451217651367, + "learning_rate": 5e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7760477662086487, + "num_tokens": 678574824.0, + "step": 26225 + }, + { + "epoch": 2.8800790687458817, + "grad_norm": 2.1245720386505127, + "learning_rate": 5e-06, + "loss": 0.7773, + "mean_token_accuracy": 0.7536711096763611, + "num_tokens": 678601551.0, + "step": 26226 + }, + { + "epoch": 2.8801888864484955, + "grad_norm": 2.3874547481536865, + "learning_rate": 5e-06, + "loss": 0.6686, + "mean_token_accuracy": 0.7787065505981445, + "num_tokens": 678620839.0, + "step": 26227 + }, + { + "epoch": 2.8802987041511092, + "grad_norm": 2.100318193435669, + "learning_rate": 5e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.7706527709960938, + "num_tokens": 678644737.0, + "step": 26228 + }, + { + "epoch": 2.880408521853723, + "grad_norm": 2.405557155609131, + "learning_rate": 5e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7578775882720947, + "num_tokens": 678666406.0, + "step": 26229 + }, + { + "epoch": 2.8805183395563363, + "grad_norm": 1.9195663928985596, + "learning_rate": 5e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7437528371810913, + "num_tokens": 678695649.0, + "step": 26230 + }, + { + "epoch": 2.88062815725895, + "grad_norm": 1.8884471654891968, + "learning_rate": 5e-06, + "loss": 0.7035, + "mean_token_accuracy": 0.7635068297386169, + "num_tokens": 678727963.0, + "step": 26231 + }, + { + "epoch": 2.880737974961564, + "grad_norm": 2.1757044792175293, + "learning_rate": 5e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7659165859222412, + "num_tokens": 678751060.0, + "step": 26232 + }, + { + "epoch": 2.8808477926641776, + "grad_norm": 2.0510730743408203, + "learning_rate": 5e-06, + "loss": 0.7127, + "mean_token_accuracy": 0.765163779258728, + "num_tokens": 678774678.0, + "step": 26233 + }, + { + "epoch": 2.8809576103667913, + "grad_norm": 2.4843926429748535, + "learning_rate": 5e-06, + "loss": 0.6019, + "mean_token_accuracy": 0.7993168830871582, + "num_tokens": 678792505.0, + "step": 26234 + }, + { + "epoch": 2.8810674280694046, + "grad_norm": 1.9788275957107544, + "learning_rate": 5e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7521311640739441, + "num_tokens": 678818200.0, + "step": 26235 + }, + { + "epoch": 2.8811772457720184, + "grad_norm": 2.243973731994629, + "learning_rate": 5e-06, + "loss": 0.7083, + "mean_token_accuracy": 0.7670023441314697, + "num_tokens": 678840454.0, + "step": 26236 + }, + { + "epoch": 2.881287063474632, + "grad_norm": 1.887252926826477, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.7531676292419434, + "num_tokens": 678870004.0, + "step": 26237 + }, + { + "epoch": 2.881396881177246, + "grad_norm": 2.155942678451538, + "learning_rate": 5e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7548488974571228, + "num_tokens": 678896376.0, + "step": 26238 + }, + { + "epoch": 2.8815066988798597, + "grad_norm": 2.2378740310668945, + "learning_rate": 5e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7502493858337402, + "num_tokens": 678921150.0, + "step": 26239 + }, + { + "epoch": 2.881616516582473, + "grad_norm": 2.401029109954834, + "learning_rate": 5e-06, + "loss": 0.6761, + "mean_token_accuracy": 0.780707836151123, + "num_tokens": 678940296.0, + "step": 26240 + }, + { + "epoch": 2.8817263342850867, + "grad_norm": 2.357257604598999, + "learning_rate": 5e-06, + "loss": 0.6446, + "mean_token_accuracy": 0.7813089489936829, + "num_tokens": 678958595.0, + "step": 26241 + }, + { + "epoch": 2.8818361519877005, + "grad_norm": 1.8979389667510986, + "learning_rate": 5e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.754435658454895, + "num_tokens": 678989769.0, + "step": 26242 + }, + { + "epoch": 2.881945969690314, + "grad_norm": 2.3945045471191406, + "learning_rate": 5e-06, + "loss": 0.659, + "mean_token_accuracy": 0.7815500497817993, + "num_tokens": 679009379.0, + "step": 26243 + }, + { + "epoch": 2.882055787392928, + "grad_norm": 1.8881511688232422, + "learning_rate": 5e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7481062412261963, + "num_tokens": 679041388.0, + "step": 26244 + }, + { + "epoch": 2.8821656050955413, + "grad_norm": 1.9478209018707275, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.7688279151916504, + "num_tokens": 679066616.0, + "step": 26245 + }, + { + "epoch": 2.882275422798155, + "grad_norm": 2.1068291664123535, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7593263387680054, + "num_tokens": 679090085.0, + "step": 26246 + }, + { + "epoch": 2.882385240500769, + "grad_norm": 2.129646062850952, + "learning_rate": 5e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7558642625808716, + "num_tokens": 679113825.0, + "step": 26247 + }, + { + "epoch": 2.882495058203382, + "grad_norm": 1.9446333646774292, + "learning_rate": 5e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7274763584136963, + "num_tokens": 679143212.0, + "step": 26248 + }, + { + "epoch": 2.882604875905996, + "grad_norm": 2.164855480194092, + "learning_rate": 5e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7539941668510437, + "num_tokens": 679167898.0, + "step": 26249 + }, + { + "epoch": 2.8827146936086097, + "grad_norm": 2.1262035369873047, + "learning_rate": 5e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7678074836730957, + "num_tokens": 679190856.0, + "step": 26250 + }, + { + "epoch": 2.8828245113112234, + "grad_norm": 1.8811577558517456, + "learning_rate": 5e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7408843040466309, + "num_tokens": 679223234.0, + "step": 26251 + }, + { + "epoch": 2.882934329013837, + "grad_norm": 1.9326024055480957, + "learning_rate": 5e-06, + "loss": 0.6607, + "mean_token_accuracy": 0.7789071202278137, + "num_tokens": 679250503.0, + "step": 26252 + }, + { + "epoch": 2.8830441467164505, + "grad_norm": 2.093062162399292, + "learning_rate": 5e-06, + "loss": 0.6429, + "mean_token_accuracy": 0.786837100982666, + "num_tokens": 679272858.0, + "step": 26253 + }, + { + "epoch": 2.8831539644190642, + "grad_norm": 2.118623971939087, + "learning_rate": 5e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7408006191253662, + "num_tokens": 679298723.0, + "step": 26254 + }, + { + "epoch": 2.883263782121678, + "grad_norm": 2.1111373901367188, + "learning_rate": 5e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7631669044494629, + "num_tokens": 679323559.0, + "step": 26255 + }, + { + "epoch": 2.8833735998242918, + "grad_norm": 2.0077311992645264, + "learning_rate": 5e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7649587392807007, + "num_tokens": 679348571.0, + "step": 26256 + }, + { + "epoch": 2.8834834175269055, + "grad_norm": 1.9344717264175415, + "learning_rate": 5e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7610650062561035, + "num_tokens": 679375842.0, + "step": 26257 + }, + { + "epoch": 2.883593235229519, + "grad_norm": 2.010549306869507, + "learning_rate": 5e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.7607497572898865, + "num_tokens": 679403523.0, + "step": 26258 + }, + { + "epoch": 2.8837030529321326, + "grad_norm": 2.195857524871826, + "learning_rate": 5e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.757298469543457, + "num_tokens": 679428807.0, + "step": 26259 + }, + { + "epoch": 2.8838128706347463, + "grad_norm": 2.3716964721679688, + "learning_rate": 5e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.7716790437698364, + "num_tokens": 679449601.0, + "step": 26260 + }, + { + "epoch": 2.88392268833736, + "grad_norm": 1.9654771089553833, + "learning_rate": 5e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.744249701499939, + "num_tokens": 679479259.0, + "step": 26261 + }, + { + "epoch": 2.884032506039974, + "grad_norm": 2.267826795578003, + "learning_rate": 5e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7726905345916748, + "num_tokens": 679501446.0, + "step": 26262 + }, + { + "epoch": 2.884142323742587, + "grad_norm": 2.0110507011413574, + "learning_rate": 5e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7452356219291687, + "num_tokens": 679527260.0, + "step": 26263 + }, + { + "epoch": 2.884252141445201, + "grad_norm": 2.5223891735076904, + "learning_rate": 5e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.7895177006721497, + "num_tokens": 679543773.0, + "step": 26264 + }, + { + "epoch": 2.8843619591478147, + "grad_norm": 1.9755032062530518, + "learning_rate": 5e-06, + "loss": 0.718, + "mean_token_accuracy": 0.7642762064933777, + "num_tokens": 679571619.0, + "step": 26265 + }, + { + "epoch": 2.8844717768504284, + "grad_norm": 2.0437021255493164, + "learning_rate": 5e-06, + "loss": 0.6863, + "mean_token_accuracy": 0.7755710482597351, + "num_tokens": 679595403.0, + "step": 26266 + }, + { + "epoch": 2.884581594553042, + "grad_norm": 2.0914063453674316, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7581056952476501, + "num_tokens": 679622393.0, + "step": 26267 + }, + { + "epoch": 2.8846914122556555, + "grad_norm": 1.8344923257827759, + "learning_rate": 5e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7379526495933533, + "num_tokens": 679655895.0, + "step": 26268 + }, + { + "epoch": 2.8848012299582693, + "grad_norm": 2.1143064498901367, + "learning_rate": 5e-06, + "loss": 0.7014, + "mean_token_accuracy": 0.7639912366867065, + "num_tokens": 679680666.0, + "step": 26269 + }, + { + "epoch": 2.884911047660883, + "grad_norm": 2.355050802230835, + "learning_rate": 5e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.771541178226471, + "num_tokens": 679700894.0, + "step": 26270 + }, + { + "epoch": 2.8850208653634963, + "grad_norm": 2.0758447647094727, + "learning_rate": 5e-06, + "loss": 0.7842, + "mean_token_accuracy": 0.7502584457397461, + "num_tokens": 679727134.0, + "step": 26271 + }, + { + "epoch": 2.88513068306611, + "grad_norm": 2.1973564624786377, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7694628834724426, + "num_tokens": 679752130.0, + "step": 26272 + }, + { + "epoch": 2.885240500768724, + "grad_norm": 2.0573320388793945, + "learning_rate": 5e-06, + "loss": 0.7315, + "mean_token_accuracy": 0.768951416015625, + "num_tokens": 679781586.0, + "step": 26273 + }, + { + "epoch": 2.8853503184713376, + "grad_norm": 2.000882863998413, + "learning_rate": 5e-06, + "loss": 0.7167, + "mean_token_accuracy": 0.7613872289657593, + "num_tokens": 679808619.0, + "step": 26274 + }, + { + "epoch": 2.8854601361739514, + "grad_norm": 1.9549064636230469, + "learning_rate": 5e-06, + "loss": 0.752, + "mean_token_accuracy": 0.7557864189147949, + "num_tokens": 679836932.0, + "step": 26275 + }, + { + "epoch": 2.8855699538765647, + "grad_norm": 2.445587635040283, + "learning_rate": 5e-06, + "loss": 0.6569, + "mean_token_accuracy": 0.782247006893158, + "num_tokens": 679856019.0, + "step": 26276 + }, + { + "epoch": 2.8856797715791784, + "grad_norm": 2.1734282970428467, + "learning_rate": 5e-06, + "loss": 0.7103, + "mean_token_accuracy": 0.7695918083190918, + "num_tokens": 679879365.0, + "step": 26277 + }, + { + "epoch": 2.885789589281792, + "grad_norm": 2.2055931091308594, + "learning_rate": 5e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7623646855354309, + "num_tokens": 679903835.0, + "step": 26278 + }, + { + "epoch": 2.885899406984406, + "grad_norm": 2.1704111099243164, + "learning_rate": 5e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7710109949111938, + "num_tokens": 679930774.0, + "step": 26279 + }, + { + "epoch": 2.8860092246870197, + "grad_norm": 1.8600560426712036, + "learning_rate": 5e-06, + "loss": 0.6714, + "mean_token_accuracy": 0.7762059569358826, + "num_tokens": 679960306.0, + "step": 26280 + }, + { + "epoch": 2.886119042389633, + "grad_norm": 2.170532464981079, + "learning_rate": 5e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7691929340362549, + "num_tokens": 679983899.0, + "step": 26281 + }, + { + "epoch": 2.8862288600922468, + "grad_norm": 1.9994360208511353, + "learning_rate": 5e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7653291821479797, + "num_tokens": 680009179.0, + "step": 26282 + }, + { + "epoch": 2.8863386777948605, + "grad_norm": 2.161494493484497, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7501868009567261, + "num_tokens": 680031872.0, + "step": 26283 + }, + { + "epoch": 2.8864484954974743, + "grad_norm": 1.964747428894043, + "learning_rate": 5e-06, + "loss": 0.6719, + "mean_token_accuracy": 0.7835858464241028, + "num_tokens": 680058332.0, + "step": 26284 + }, + { + "epoch": 2.886558313200088, + "grad_norm": 1.8635435104370117, + "learning_rate": 5e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7490899562835693, + "num_tokens": 680089395.0, + "step": 26285 + }, + { + "epoch": 2.8866681309027014, + "grad_norm": 1.927808165550232, + "learning_rate": 5e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7629613876342773, + "num_tokens": 680118770.0, + "step": 26286 + }, + { + "epoch": 2.886777948605315, + "grad_norm": 1.85699462890625, + "learning_rate": 5e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.739000678062439, + "num_tokens": 680149750.0, + "step": 26287 + }, + { + "epoch": 2.886887766307929, + "grad_norm": 2.1934967041015625, + "learning_rate": 5e-06, + "loss": 0.726, + "mean_token_accuracy": 0.7662025094032288, + "num_tokens": 680173907.0, + "step": 26288 + }, + { + "epoch": 2.8869975840105426, + "grad_norm": 1.941022276878357, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7732220888137817, + "num_tokens": 680198509.0, + "step": 26289 + }, + { + "epoch": 2.8871074017131564, + "grad_norm": 1.9522935152053833, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.758832573890686, + "num_tokens": 680227688.0, + "step": 26290 + }, + { + "epoch": 2.8872172194157697, + "grad_norm": 2.161193370819092, + "learning_rate": 5e-06, + "loss": 0.691, + "mean_token_accuracy": 0.7790440917015076, + "num_tokens": 680251284.0, + "step": 26291 + }, + { + "epoch": 2.8873270371183835, + "grad_norm": 2.4309825897216797, + "learning_rate": 5e-06, + "loss": 0.6535, + "mean_token_accuracy": 0.7797298431396484, + "num_tokens": 680269954.0, + "step": 26292 + }, + { + "epoch": 2.887436854820997, + "grad_norm": 2.1252379417419434, + "learning_rate": 5e-06, + "loss": 0.646, + "mean_token_accuracy": 0.783683180809021, + "num_tokens": 680293745.0, + "step": 26293 + }, + { + "epoch": 2.887546672523611, + "grad_norm": 2.1489880084991455, + "learning_rate": 5e-06, + "loss": 0.6652, + "mean_token_accuracy": 0.774875819683075, + "num_tokens": 680317778.0, + "step": 26294 + }, + { + "epoch": 2.8876564902262247, + "grad_norm": 1.9860891103744507, + "learning_rate": 5e-06, + "loss": 0.687, + "mean_token_accuracy": 0.768291711807251, + "num_tokens": 680346668.0, + "step": 26295 + }, + { + "epoch": 2.887766307928838, + "grad_norm": 2.0610904693603516, + "learning_rate": 5e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.7507936954498291, + "num_tokens": 680377390.0, + "step": 26296 + }, + { + "epoch": 2.887876125631452, + "grad_norm": 1.785279393196106, + "learning_rate": 5e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7405902147293091, + "num_tokens": 680413330.0, + "step": 26297 + }, + { + "epoch": 2.8879859433340656, + "grad_norm": 1.7732081413269043, + "learning_rate": 5e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7516423463821411, + "num_tokens": 680448309.0, + "step": 26298 + }, + { + "epoch": 2.888095761036679, + "grad_norm": 2.2637939453125, + "learning_rate": 5e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.763515830039978, + "num_tokens": 680471566.0, + "step": 26299 + }, + { + "epoch": 2.8882055787392926, + "grad_norm": 1.9325259923934937, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.7496647834777832, + "num_tokens": 680499838.0, + "step": 26300 + }, + { + "epoch": 2.8883153964419064, + "grad_norm": 2.1599888801574707, + "learning_rate": 5e-06, + "loss": 0.685, + "mean_token_accuracy": 0.7779357433319092, + "num_tokens": 680521981.0, + "step": 26301 + }, + { + "epoch": 2.88842521414452, + "grad_norm": 2.050689697265625, + "learning_rate": 5e-06, + "loss": 0.6888, + "mean_token_accuracy": 0.7700777649879456, + "num_tokens": 680547514.0, + "step": 26302 + }, + { + "epoch": 2.888535031847134, + "grad_norm": 1.955546498298645, + "learning_rate": 5e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.7659801244735718, + "num_tokens": 680572520.0, + "step": 26303 + }, + { + "epoch": 2.888644849549747, + "grad_norm": 2.0228354930877686, + "learning_rate": 5e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7364645600318909, + "num_tokens": 680599522.0, + "step": 26304 + }, + { + "epoch": 2.888754667252361, + "grad_norm": 1.9246610403060913, + "learning_rate": 5e-06, + "loss": 0.677, + "mean_token_accuracy": 0.7809481620788574, + "num_tokens": 680626150.0, + "step": 26305 + }, + { + "epoch": 2.8888644849549747, + "grad_norm": 1.9442698955535889, + "learning_rate": 5e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7438377141952515, + "num_tokens": 680653879.0, + "step": 26306 + }, + { + "epoch": 2.8889743026575885, + "grad_norm": 1.9104225635528564, + "learning_rate": 5e-06, + "loss": 0.7513, + "mean_token_accuracy": 0.7566120028495789, + "num_tokens": 680682393.0, + "step": 26307 + }, + { + "epoch": 2.8890841203602022, + "grad_norm": 2.2473902702331543, + "learning_rate": 5e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7710057497024536, + "num_tokens": 680706626.0, + "step": 26308 + }, + { + "epoch": 2.8891939380628155, + "grad_norm": 2.1819026470184326, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7734315395355225, + "num_tokens": 680729364.0, + "step": 26309 + }, + { + "epoch": 2.8893037557654293, + "grad_norm": 1.988303303718567, + "learning_rate": 5e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7498805522918701, + "num_tokens": 680756500.0, + "step": 26310 + }, + { + "epoch": 2.889413573468043, + "grad_norm": 2.184783935546875, + "learning_rate": 5e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7540132999420166, + "num_tokens": 680781644.0, + "step": 26311 + }, + { + "epoch": 2.889523391170657, + "grad_norm": 2.3025333881378174, + "learning_rate": 5e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.734157383441925, + "num_tokens": 680806747.0, + "step": 26312 + }, + { + "epoch": 2.8896332088732706, + "grad_norm": 2.061509132385254, + "learning_rate": 5e-06, + "loss": 0.6838, + "mean_token_accuracy": 0.7719988822937012, + "num_tokens": 680832656.0, + "step": 26313 + }, + { + "epoch": 2.889743026575884, + "grad_norm": 2.0944628715515137, + "learning_rate": 5e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7679682970046997, + "num_tokens": 680859109.0, + "step": 26314 + }, + { + "epoch": 2.8898528442784976, + "grad_norm": 2.1054792404174805, + "learning_rate": 5e-06, + "loss": 0.6726, + "mean_token_accuracy": 0.7797072529792786, + "num_tokens": 680884074.0, + "step": 26315 + }, + { + "epoch": 2.8899626619811114, + "grad_norm": 2.0533223152160645, + "learning_rate": 5e-06, + "loss": 0.7027, + "mean_token_accuracy": 0.7699216604232788, + "num_tokens": 680909324.0, + "step": 26316 + }, + { + "epoch": 2.890072479683725, + "grad_norm": 2.1576149463653564, + "learning_rate": 5e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.739672839641571, + "num_tokens": 680934509.0, + "step": 26317 + }, + { + "epoch": 2.890182297386339, + "grad_norm": 1.9503109455108643, + "learning_rate": 5e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7587295174598694, + "num_tokens": 680959926.0, + "step": 26318 + }, + { + "epoch": 2.8902921150889522, + "grad_norm": 1.8733278512954712, + "learning_rate": 5e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7671712636947632, + "num_tokens": 680989498.0, + "step": 26319 + }, + { + "epoch": 2.890401932791566, + "grad_norm": 1.9751472473144531, + "learning_rate": 5e-06, + "loss": 0.6249, + "mean_token_accuracy": 0.8010430932044983, + "num_tokens": 681016049.0, + "step": 26320 + }, + { + "epoch": 2.8905117504941797, + "grad_norm": 2.3231101036071777, + "learning_rate": 5e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7491154670715332, + "num_tokens": 681036843.0, + "step": 26321 + }, + { + "epoch": 2.890621568196793, + "grad_norm": 2.4662036895751953, + "learning_rate": 5e-06, + "loss": 0.7216, + "mean_token_accuracy": 0.7569059133529663, + "num_tokens": 681056971.0, + "step": 26322 + }, + { + "epoch": 2.8907313858994073, + "grad_norm": 1.8401997089385986, + "learning_rate": 5e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7376271486282349, + "num_tokens": 681088098.0, + "step": 26323 + }, + { + "epoch": 2.8908412036020206, + "grad_norm": 2.0516517162323, + "learning_rate": 5e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7575600147247314, + "num_tokens": 681113951.0, + "step": 26324 + }, + { + "epoch": 2.8909510213046343, + "grad_norm": 2.218501329421997, + "learning_rate": 5e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7380590438842773, + "num_tokens": 681142536.0, + "step": 26325 + }, + { + "epoch": 2.891060839007248, + "grad_norm": 2.2082126140594482, + "learning_rate": 5e-06, + "loss": 0.7142, + "mean_token_accuracy": 0.7780463695526123, + "num_tokens": 681165397.0, + "step": 26326 + }, + { + "epoch": 2.8911706567098614, + "grad_norm": 2.02750825881958, + "learning_rate": 5e-06, + "loss": 0.7201, + "mean_token_accuracy": 0.7635289430618286, + "num_tokens": 681192234.0, + "step": 26327 + }, + { + "epoch": 2.891280474412475, + "grad_norm": 2.325228214263916, + "learning_rate": 5e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7650533318519592, + "num_tokens": 681213048.0, + "step": 26328 + }, + { + "epoch": 2.891390292115089, + "grad_norm": 2.004136800765991, + "learning_rate": 5e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7436169385910034, + "num_tokens": 681242104.0, + "step": 26329 + }, + { + "epoch": 2.8915001098177027, + "grad_norm": 2.070507764816284, + "learning_rate": 5e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.7686243653297424, + "num_tokens": 681265966.0, + "step": 26330 + }, + { + "epoch": 2.8916099275203164, + "grad_norm": 1.8557161092758179, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7655870914459229, + "num_tokens": 681295068.0, + "step": 26331 + }, + { + "epoch": 2.8917197452229297, + "grad_norm": 2.106520891189575, + "learning_rate": 5e-06, + "loss": 0.7399, + "mean_token_accuracy": 0.7628386616706848, + "num_tokens": 681320597.0, + "step": 26332 + }, + { + "epoch": 2.8918295629255435, + "grad_norm": 2.339402198791504, + "learning_rate": 5e-06, + "loss": 0.64, + "mean_token_accuracy": 0.7841042280197144, + "num_tokens": 681338871.0, + "step": 26333 + }, + { + "epoch": 2.8919393806281573, + "grad_norm": 1.97737455368042, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7618794441223145, + "num_tokens": 681367337.0, + "step": 26334 + }, + { + "epoch": 2.892049198330771, + "grad_norm": 1.8878610134124756, + "learning_rate": 5e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7580409049987793, + "num_tokens": 681396320.0, + "step": 26335 + }, + { + "epoch": 2.8921590160333848, + "grad_norm": 1.9419015645980835, + "learning_rate": 5e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7286711931228638, + "num_tokens": 681426097.0, + "step": 26336 + }, + { + "epoch": 2.892268833735998, + "grad_norm": 2.059798240661621, + "learning_rate": 5e-06, + "loss": 0.7327, + "mean_token_accuracy": 0.7608398199081421, + "num_tokens": 681451874.0, + "step": 26337 + }, + { + "epoch": 2.892378651438612, + "grad_norm": 1.9252822399139404, + "learning_rate": 5e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7490297555923462, + "num_tokens": 681480914.0, + "step": 26338 + }, + { + "epoch": 2.8924884691412256, + "grad_norm": 2.120135545730591, + "learning_rate": 5e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.761501669883728, + "num_tokens": 681507011.0, + "step": 26339 + }, + { + "epoch": 2.8925982868438394, + "grad_norm": 2.1844825744628906, + "learning_rate": 5e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7497619986534119, + "num_tokens": 681534570.0, + "step": 26340 + }, + { + "epoch": 2.892708104546453, + "grad_norm": 2.362590789794922, + "learning_rate": 5e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7669891119003296, + "num_tokens": 681556847.0, + "step": 26341 + }, + { + "epoch": 2.8928179222490664, + "grad_norm": 2.3291139602661133, + "learning_rate": 5e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7734209299087524, + "num_tokens": 681575541.0, + "step": 26342 + }, + { + "epoch": 2.89292773995168, + "grad_norm": 2.2635445594787598, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7760222554206848, + "num_tokens": 681596476.0, + "step": 26343 + }, + { + "epoch": 2.893037557654294, + "grad_norm": 1.8862860202789307, + "learning_rate": 5e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7308288812637329, + "num_tokens": 681629160.0, + "step": 26344 + }, + { + "epoch": 2.8931473753569077, + "grad_norm": 2.1797738075256348, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7754040956497192, + "num_tokens": 681653502.0, + "step": 26345 + }, + { + "epoch": 2.8932571930595214, + "grad_norm": 1.8192723989486694, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7591941356658936, + "num_tokens": 681687556.0, + "step": 26346 + }, + { + "epoch": 2.8933670107621348, + "grad_norm": 2.1084115505218506, + "learning_rate": 5e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.770824670791626, + "num_tokens": 681712678.0, + "step": 26347 + }, + { + "epoch": 2.8934768284647485, + "grad_norm": 2.4193060398101807, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7629672288894653, + "num_tokens": 681733947.0, + "step": 26348 + }, + { + "epoch": 2.8935866461673623, + "grad_norm": 1.7457547187805176, + "learning_rate": 5e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7395100593566895, + "num_tokens": 681769850.0, + "step": 26349 + }, + { + "epoch": 2.8936964638699756, + "grad_norm": 1.9921836853027344, + "learning_rate": 5e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7483049631118774, + "num_tokens": 681797395.0, + "step": 26350 + }, + { + "epoch": 2.8938062815725893, + "grad_norm": 2.11759090423584, + "learning_rate": 5e-06, + "loss": 0.676, + "mean_token_accuracy": 0.7736668586730957, + "num_tokens": 681820836.0, + "step": 26351 + }, + { + "epoch": 2.893916099275203, + "grad_norm": 2.0903992652893066, + "learning_rate": 5e-06, + "loss": 0.6593, + "mean_token_accuracy": 0.7783404588699341, + "num_tokens": 681843982.0, + "step": 26352 + }, + { + "epoch": 2.894025916977817, + "grad_norm": 2.0479736328125, + "learning_rate": 5e-06, + "loss": 0.6821, + "mean_token_accuracy": 0.7747372388839722, + "num_tokens": 681870402.0, + "step": 26353 + }, + { + "epoch": 2.8941357346804306, + "grad_norm": 2.1678521633148193, + "learning_rate": 5e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7405304908752441, + "num_tokens": 681895307.0, + "step": 26354 + }, + { + "epoch": 2.894245552383044, + "grad_norm": 1.8977203369140625, + "learning_rate": 5e-06, + "loss": 0.6743, + "mean_token_accuracy": 0.7741085290908813, + "num_tokens": 681926361.0, + "step": 26355 + }, + { + "epoch": 2.8943553700856577, + "grad_norm": 1.9637473821640015, + "learning_rate": 5e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7466786503791809, + "num_tokens": 681955236.0, + "step": 26356 + }, + { + "epoch": 2.8944651877882714, + "grad_norm": 2.0296952724456787, + "learning_rate": 5e-06, + "loss": 0.7475, + "mean_token_accuracy": 0.7569953203201294, + "num_tokens": 681979416.0, + "step": 26357 + }, + { + "epoch": 2.894575005490885, + "grad_norm": 1.8485276699066162, + "learning_rate": 5e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7609117031097412, + "num_tokens": 682010864.0, + "step": 26358 + }, + { + "epoch": 2.894684823193499, + "grad_norm": 2.154080867767334, + "learning_rate": 5e-06, + "loss": 0.7233, + "mean_token_accuracy": 0.7590333819389343, + "num_tokens": 682035713.0, + "step": 26359 + }, + { + "epoch": 2.8947946408961123, + "grad_norm": 2.0097382068634033, + "learning_rate": 5e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7580509185791016, + "num_tokens": 682064849.0, + "step": 26360 + }, + { + "epoch": 2.894904458598726, + "grad_norm": 2.0393099784851074, + "learning_rate": 5e-06, + "loss": 0.7616, + "mean_token_accuracy": 0.7509266138076782, + "num_tokens": 682092648.0, + "step": 26361 + }, + { + "epoch": 2.89501427630134, + "grad_norm": 2.281738758087158, + "learning_rate": 5e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.7727481126785278, + "num_tokens": 682114076.0, + "step": 26362 + }, + { + "epoch": 2.8951240940039535, + "grad_norm": 1.9946578741073608, + "learning_rate": 5e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.761411726474762, + "num_tokens": 682140431.0, + "step": 26363 + }, + { + "epoch": 2.8952339117065673, + "grad_norm": 1.9862923622131348, + "learning_rate": 5e-06, + "loss": 0.7853, + "mean_token_accuracy": 0.7430307269096375, + "num_tokens": 682171781.0, + "step": 26364 + }, + { + "epoch": 2.8953437294091806, + "grad_norm": 2.1274213790893555, + "learning_rate": 5e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7464971542358398, + "num_tokens": 682197448.0, + "step": 26365 + }, + { + "epoch": 2.8954535471117944, + "grad_norm": 2.1861298084259033, + "learning_rate": 5e-06, + "loss": 0.7651, + "mean_token_accuracy": 0.7516906261444092, + "num_tokens": 682222809.0, + "step": 26366 + }, + { + "epoch": 2.895563364814408, + "grad_norm": 2.729504108428955, + "learning_rate": 5e-06, + "loss": 0.6298, + "mean_token_accuracy": 0.7973058223724365, + "num_tokens": 682240414.0, + "step": 26367 + }, + { + "epoch": 2.895673182517022, + "grad_norm": 2.191080093383789, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7829329967498779, + "num_tokens": 682263306.0, + "step": 26368 + }, + { + "epoch": 2.8957830002196356, + "grad_norm": 1.8157188892364502, + "learning_rate": 5e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7383216619491577, + "num_tokens": 682294544.0, + "step": 26369 + }, + { + "epoch": 2.895892817922249, + "grad_norm": 2.201686382293701, + "learning_rate": 5e-06, + "loss": 0.6538, + "mean_token_accuracy": 0.7838761210441589, + "num_tokens": 682316728.0, + "step": 26370 + }, + { + "epoch": 2.8960026356248627, + "grad_norm": 1.9669805765151978, + "learning_rate": 5e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7405399084091187, + "num_tokens": 682349679.0, + "step": 26371 + }, + { + "epoch": 2.8961124533274765, + "grad_norm": 2.1558797359466553, + "learning_rate": 5e-06, + "loss": 0.6919, + "mean_token_accuracy": 0.7718683481216431, + "num_tokens": 682372161.0, + "step": 26372 + }, + { + "epoch": 2.89622227103009, + "grad_norm": 2.0543155670166016, + "learning_rate": 5e-06, + "loss": 0.72, + "mean_token_accuracy": 0.764105498790741, + "num_tokens": 682397987.0, + "step": 26373 + }, + { + "epoch": 2.896332088732704, + "grad_norm": 1.8165926933288574, + "learning_rate": 5e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7387022972106934, + "num_tokens": 682431182.0, + "step": 26374 + }, + { + "epoch": 2.8964419064353173, + "grad_norm": 2.0309789180755615, + "learning_rate": 5e-06, + "loss": 0.6865, + "mean_token_accuracy": 0.7682919502258301, + "num_tokens": 682456553.0, + "step": 26375 + }, + { + "epoch": 2.896551724137931, + "grad_norm": 1.9839048385620117, + "learning_rate": 5e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7788844704627991, + "num_tokens": 682483213.0, + "step": 26376 + }, + { + "epoch": 2.896661541840545, + "grad_norm": 2.202512264251709, + "learning_rate": 5e-06, + "loss": 0.6771, + "mean_token_accuracy": 0.779572606086731, + "num_tokens": 682504979.0, + "step": 26377 + }, + { + "epoch": 2.896771359543158, + "grad_norm": 2.154569387435913, + "learning_rate": 5e-06, + "loss": 0.7205, + "mean_token_accuracy": 0.7669378519058228, + "num_tokens": 682530160.0, + "step": 26378 + }, + { + "epoch": 2.896881177245772, + "grad_norm": 2.357093572616577, + "learning_rate": 5e-06, + "loss": 0.6428, + "mean_token_accuracy": 0.7862915992736816, + "num_tokens": 682550901.0, + "step": 26379 + }, + { + "epoch": 2.8969909949483856, + "grad_norm": 1.738404393196106, + "learning_rate": 5e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7392733097076416, + "num_tokens": 682588760.0, + "step": 26380 + }, + { + "epoch": 2.8971008126509994, + "grad_norm": 2.0683956146240234, + "learning_rate": 5e-06, + "loss": 0.6889, + "mean_token_accuracy": 0.7732884287834167, + "num_tokens": 682616137.0, + "step": 26381 + }, + { + "epoch": 2.897210630353613, + "grad_norm": 2.0008480548858643, + "learning_rate": 5e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7587134838104248, + "num_tokens": 682644875.0, + "step": 26382 + }, + { + "epoch": 2.8973204480562265, + "grad_norm": 2.0930941104888916, + "learning_rate": 5e-06, + "loss": 0.6876, + "mean_token_accuracy": 0.7711607217788696, + "num_tokens": 682670133.0, + "step": 26383 + }, + { + "epoch": 2.89743026575884, + "grad_norm": 2.1343276500701904, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7696066498756409, + "num_tokens": 682696706.0, + "step": 26384 + }, + { + "epoch": 2.897540083461454, + "grad_norm": 2.184352159500122, + "learning_rate": 5e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7697868347167969, + "num_tokens": 682719128.0, + "step": 26385 + }, + { + "epoch": 2.8976499011640677, + "grad_norm": 2.1358001232147217, + "learning_rate": 5e-06, + "loss": 0.6519, + "mean_token_accuracy": 0.7819681763648987, + "num_tokens": 682741577.0, + "step": 26386 + }, + { + "epoch": 2.8977597188666815, + "grad_norm": 1.9887036085128784, + "learning_rate": 5e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7675613164901733, + "num_tokens": 682768294.0, + "step": 26387 + }, + { + "epoch": 2.897869536569295, + "grad_norm": 2.191263198852539, + "learning_rate": 5e-06, + "loss": 0.6747, + "mean_token_accuracy": 0.7773580551147461, + "num_tokens": 682789707.0, + "step": 26388 + }, + { + "epoch": 2.8979793542719086, + "grad_norm": 2.007432460784912, + "learning_rate": 5e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7385590076446533, + "num_tokens": 682820257.0, + "step": 26389 + }, + { + "epoch": 2.8980891719745223, + "grad_norm": 2.0956742763519287, + "learning_rate": 5e-06, + "loss": 0.6816, + "mean_token_accuracy": 0.7733683586120605, + "num_tokens": 682845396.0, + "step": 26390 + }, + { + "epoch": 2.898198989677136, + "grad_norm": 2.076923131942749, + "learning_rate": 5e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7492252588272095, + "num_tokens": 682874491.0, + "step": 26391 + }, + { + "epoch": 2.89830880737975, + "grad_norm": 2.318709373474121, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7777079343795776, + "num_tokens": 682894390.0, + "step": 26392 + }, + { + "epoch": 2.898418625082363, + "grad_norm": 2.106250286102295, + "learning_rate": 5e-06, + "loss": 0.7128, + "mean_token_accuracy": 0.7640612125396729, + "num_tokens": 682919340.0, + "step": 26393 + }, + { + "epoch": 2.898528442784977, + "grad_norm": 1.7923880815505981, + "learning_rate": 5e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7451251745223999, + "num_tokens": 682954208.0, + "step": 26394 + }, + { + "epoch": 2.8986382604875907, + "grad_norm": 2.0963802337646484, + "learning_rate": 5e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.7503482699394226, + "num_tokens": 682980113.0, + "step": 26395 + }, + { + "epoch": 2.8987480781902044, + "grad_norm": 1.902042269706726, + "learning_rate": 5e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7531259655952454, + "num_tokens": 683009577.0, + "step": 26396 + }, + { + "epoch": 2.898857895892818, + "grad_norm": 1.9260042905807495, + "learning_rate": 5e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7686727046966553, + "num_tokens": 683036978.0, + "step": 26397 + }, + { + "epoch": 2.8989677135954315, + "grad_norm": 1.9686583280563354, + "learning_rate": 5e-06, + "loss": 0.7954, + "mean_token_accuracy": 0.750982403755188, + "num_tokens": 683064839.0, + "step": 26398 + }, + { + "epoch": 2.8990775312980452, + "grad_norm": 2.3651013374328613, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7711538672447205, + "num_tokens": 683085750.0, + "step": 26399 + }, + { + "epoch": 2.899187349000659, + "grad_norm": 2.041254997253418, + "learning_rate": 5e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7690260410308838, + "num_tokens": 683109694.0, + "step": 26400 + }, + { + "epoch": 2.8992971667032723, + "grad_norm": 1.9676053524017334, + "learning_rate": 5e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7410786151885986, + "num_tokens": 683139865.0, + "step": 26401 + }, + { + "epoch": 2.899406984405886, + "grad_norm": 2.087244987487793, + "learning_rate": 5e-06, + "loss": 0.7019, + "mean_token_accuracy": 0.7672326564788818, + "num_tokens": 683165957.0, + "step": 26402 + }, + { + "epoch": 2.8995168021085, + "grad_norm": 1.9321541786193848, + "learning_rate": 5e-06, + "loss": 0.7084, + "mean_token_accuracy": 0.7695006728172302, + "num_tokens": 683192910.0, + "step": 26403 + }, + { + "epoch": 2.8996266198111136, + "grad_norm": 2.2415082454681396, + "learning_rate": 5e-06, + "loss": 0.6043, + "mean_token_accuracy": 0.7935551404953003, + "num_tokens": 683213225.0, + "step": 26404 + }, + { + "epoch": 2.8997364375137273, + "grad_norm": 1.9701249599456787, + "learning_rate": 5e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7735168933868408, + "num_tokens": 683238292.0, + "step": 26405 + }, + { + "epoch": 2.8998462552163407, + "grad_norm": 2.2357330322265625, + "learning_rate": 5e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.7572978138923645, + "num_tokens": 683260835.0, + "step": 26406 + }, + { + "epoch": 2.8999560729189544, + "grad_norm": 2.065767526626587, + "learning_rate": 5e-06, + "loss": 0.7334, + "mean_token_accuracy": 0.7633470296859741, + "num_tokens": 683284918.0, + "step": 26407 + }, + { + "epoch": 2.900065890621568, + "grad_norm": 2.1689505577087402, + "learning_rate": 5e-06, + "loss": 0.6651, + "mean_token_accuracy": 0.7738426923751831, + "num_tokens": 683307842.0, + "step": 26408 + }, + { + "epoch": 2.900175708324182, + "grad_norm": 1.9624146223068237, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7522380352020264, + "num_tokens": 683336199.0, + "step": 26409 + }, + { + "epoch": 2.9002855260267957, + "grad_norm": 2.113535165786743, + "learning_rate": 5e-06, + "loss": 0.7531, + "mean_token_accuracy": 0.7525588870048523, + "num_tokens": 683363601.0, + "step": 26410 + }, + { + "epoch": 2.900395343729409, + "grad_norm": 1.9796355962753296, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.754315972328186, + "num_tokens": 683393555.0, + "step": 26411 + }, + { + "epoch": 2.9005051614320227, + "grad_norm": 1.9162123203277588, + "learning_rate": 5e-06, + "loss": 0.7915, + "mean_token_accuracy": 0.7373600602149963, + "num_tokens": 683422691.0, + "step": 26412 + }, + { + "epoch": 2.9006149791346365, + "grad_norm": 1.8709475994110107, + "learning_rate": 5e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7671034336090088, + "num_tokens": 683456714.0, + "step": 26413 + }, + { + "epoch": 2.9007247968372503, + "grad_norm": 1.9899183511734009, + "learning_rate": 5e-06, + "loss": 0.7022, + "mean_token_accuracy": 0.7703536748886108, + "num_tokens": 683482495.0, + "step": 26414 + }, + { + "epoch": 2.900834614539864, + "grad_norm": 2.0476412773132324, + "learning_rate": 5e-06, + "loss": 0.6647, + "mean_token_accuracy": 0.7727478742599487, + "num_tokens": 683507581.0, + "step": 26415 + }, + { + "epoch": 2.9009444322424773, + "grad_norm": 2.0400514602661133, + "learning_rate": 5e-06, + "loss": 0.6873, + "mean_token_accuracy": 0.7771554589271545, + "num_tokens": 683534959.0, + "step": 26416 + }, + { + "epoch": 2.901054249945091, + "grad_norm": 1.9178563356399536, + "learning_rate": 5e-06, + "loss": 0.7133, + "mean_token_accuracy": 0.7613775730133057, + "num_tokens": 683561767.0, + "step": 26417 + }, + { + "epoch": 2.901164067647705, + "grad_norm": 2.109283208847046, + "learning_rate": 5e-06, + "loss": 0.7221, + "mean_token_accuracy": 0.7656811475753784, + "num_tokens": 683586088.0, + "step": 26418 + }, + { + "epoch": 2.9012738853503186, + "grad_norm": 2.2388834953308105, + "learning_rate": 5e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7538634538650513, + "num_tokens": 683609269.0, + "step": 26419 + }, + { + "epoch": 2.9013837030529324, + "grad_norm": 2.303947687149048, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7738476991653442, + "num_tokens": 683631699.0, + "step": 26420 + }, + { + "epoch": 2.9014935207555457, + "grad_norm": 1.9129798412322998, + "learning_rate": 5e-06, + "loss": 0.723, + "mean_token_accuracy": 0.7686535716056824, + "num_tokens": 683660311.0, + "step": 26421 + }, + { + "epoch": 2.9016033384581594, + "grad_norm": 2.2778160572052, + "learning_rate": 5e-06, + "loss": 0.63, + "mean_token_accuracy": 0.7890293598175049, + "num_tokens": 683680919.0, + "step": 26422 + }, + { + "epoch": 2.901713156160773, + "grad_norm": 2.595607280731201, + "learning_rate": 5e-06, + "loss": 0.6594, + "mean_token_accuracy": 0.7817689180374146, + "num_tokens": 683698534.0, + "step": 26423 + }, + { + "epoch": 2.9018229738633865, + "grad_norm": 2.094649314880371, + "learning_rate": 5e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7636348009109497, + "num_tokens": 683723648.0, + "step": 26424 + }, + { + "epoch": 2.9019327915660007, + "grad_norm": 2.1626155376434326, + "learning_rate": 5e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7679075002670288, + "num_tokens": 683746101.0, + "step": 26425 + }, + { + "epoch": 2.902042609268614, + "grad_norm": 2.4470510482788086, + "learning_rate": 5e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.7668455243110657, + "num_tokens": 683766264.0, + "step": 26426 + }, + { + "epoch": 2.9021524269712278, + "grad_norm": 2.2156782150268555, + "learning_rate": 5e-06, + "loss": 0.7193, + "mean_token_accuracy": 0.7682480812072754, + "num_tokens": 683789934.0, + "step": 26427 + }, + { + "epoch": 2.9022622446738415, + "grad_norm": 2.234543561935425, + "learning_rate": 5e-06, + "loss": 0.731, + "mean_token_accuracy": 0.7609846591949463, + "num_tokens": 683814403.0, + "step": 26428 + }, + { + "epoch": 2.902372062376455, + "grad_norm": 1.9690735340118408, + "learning_rate": 5e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7465277910232544, + "num_tokens": 683844111.0, + "step": 26429 + }, + { + "epoch": 2.9024818800790686, + "grad_norm": 2.2638418674468994, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7709337472915649, + "num_tokens": 683864565.0, + "step": 26430 + }, + { + "epoch": 2.9025916977816824, + "grad_norm": 2.0871307849884033, + "learning_rate": 5e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7489532232284546, + "num_tokens": 683890742.0, + "step": 26431 + }, + { + "epoch": 2.902701515484296, + "grad_norm": 2.246885061264038, + "learning_rate": 5e-06, + "loss": 0.6991, + "mean_token_accuracy": 0.7687357068061829, + "num_tokens": 683913039.0, + "step": 26432 + }, + { + "epoch": 2.90281133318691, + "grad_norm": 1.8082499504089355, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7563775777816772, + "num_tokens": 683946192.0, + "step": 26433 + }, + { + "epoch": 2.902921150889523, + "grad_norm": 2.096221685409546, + "learning_rate": 5e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7274891138076782, + "num_tokens": 683974200.0, + "step": 26434 + }, + { + "epoch": 2.903030968592137, + "grad_norm": 1.96175217628479, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.765990674495697, + "num_tokens": 683999988.0, + "step": 26435 + }, + { + "epoch": 2.9031407862947507, + "grad_norm": 2.2212836742401123, + "learning_rate": 5e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.75816810131073, + "num_tokens": 684024174.0, + "step": 26436 + }, + { + "epoch": 2.9032506039973645, + "grad_norm": 2.0312089920043945, + "learning_rate": 5e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.754019021987915, + "num_tokens": 684047359.0, + "step": 26437 + }, + { + "epoch": 2.903360421699978, + "grad_norm": 1.9826624393463135, + "learning_rate": 5e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7639108896255493, + "num_tokens": 684074209.0, + "step": 26438 + }, + { + "epoch": 2.9034702394025915, + "grad_norm": 2.068190097808838, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7441267967224121, + "num_tokens": 684102521.0, + "step": 26439 + }, + { + "epoch": 2.9035800571052053, + "grad_norm": 2.5178282260894775, + "learning_rate": 5e-06, + "loss": 0.6712, + "mean_token_accuracy": 0.7782053351402283, + "num_tokens": 684120721.0, + "step": 26440 + }, + { + "epoch": 2.903689874807819, + "grad_norm": 2.061635971069336, + "learning_rate": 5e-06, + "loss": 0.7043, + "mean_token_accuracy": 0.7641801834106445, + "num_tokens": 684145027.0, + "step": 26441 + }, + { + "epoch": 2.903799692510433, + "grad_norm": 2.11173415184021, + "learning_rate": 5e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.789219081401825, + "num_tokens": 684168071.0, + "step": 26442 + }, + { + "epoch": 2.9039095102130466, + "grad_norm": 2.0746734142303467, + "learning_rate": 5e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7492842674255371, + "num_tokens": 684194253.0, + "step": 26443 + }, + { + "epoch": 2.90401932791566, + "grad_norm": 2.0266916751861572, + "learning_rate": 5e-06, + "loss": 0.7213, + "mean_token_accuracy": 0.7657087445259094, + "num_tokens": 684221224.0, + "step": 26444 + }, + { + "epoch": 2.9041291456182736, + "grad_norm": 2.028348207473755, + "learning_rate": 5e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7637122273445129, + "num_tokens": 684246743.0, + "step": 26445 + }, + { + "epoch": 2.9042389633208874, + "grad_norm": 1.9410518407821655, + "learning_rate": 5e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7408138513565063, + "num_tokens": 684279060.0, + "step": 26446 + }, + { + "epoch": 2.904348781023501, + "grad_norm": 2.1587395668029785, + "learning_rate": 5e-06, + "loss": 0.7059, + "mean_token_accuracy": 0.7665190696716309, + "num_tokens": 684301859.0, + "step": 26447 + }, + { + "epoch": 2.904458598726115, + "grad_norm": 2.166989326477051, + "learning_rate": 5e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7533120512962341, + "num_tokens": 684324466.0, + "step": 26448 + }, + { + "epoch": 2.904568416428728, + "grad_norm": 2.059551954269409, + "learning_rate": 5e-06, + "loss": 0.6732, + "mean_token_accuracy": 0.7790415287017822, + "num_tokens": 684350543.0, + "step": 26449 + }, + { + "epoch": 2.904678234131342, + "grad_norm": 2.057830810546875, + "learning_rate": 5e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7614288330078125, + "num_tokens": 684376808.0, + "step": 26450 + }, + { + "epoch": 2.9047880518339557, + "grad_norm": 2.0946407318115234, + "learning_rate": 5e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7655137777328491, + "num_tokens": 684400227.0, + "step": 26451 + }, + { + "epoch": 2.904897869536569, + "grad_norm": 1.8679908514022827, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7400549650192261, + "num_tokens": 684430854.0, + "step": 26452 + }, + { + "epoch": 2.905007687239183, + "grad_norm": 2.0560615062713623, + "learning_rate": 5e-06, + "loss": 0.6421, + "mean_token_accuracy": 0.7857645153999329, + "num_tokens": 684455835.0, + "step": 26453 + }, + { + "epoch": 2.9051175049417965, + "grad_norm": 2.3935866355895996, + "learning_rate": 5e-06, + "loss": 0.6342, + "mean_token_accuracy": 0.785063624382019, + "num_tokens": 684476878.0, + "step": 26454 + }, + { + "epoch": 2.9052273226444103, + "grad_norm": 1.9778631925582886, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7672261595726013, + "num_tokens": 684504343.0, + "step": 26455 + }, + { + "epoch": 2.905337140347024, + "grad_norm": 1.9969972372055054, + "learning_rate": 5e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.749483048915863, + "num_tokens": 684531345.0, + "step": 26456 + }, + { + "epoch": 2.9054469580496374, + "grad_norm": 2.0703468322753906, + "learning_rate": 5e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7616616487503052, + "num_tokens": 684558222.0, + "step": 26457 + }, + { + "epoch": 2.905556775752251, + "grad_norm": 2.1467294692993164, + "learning_rate": 5e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7645963430404663, + "num_tokens": 684583322.0, + "step": 26458 + }, + { + "epoch": 2.905666593454865, + "grad_norm": 2.1158533096313477, + "learning_rate": 5e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.772249698638916, + "num_tokens": 684608891.0, + "step": 26459 + }, + { + "epoch": 2.9057764111574786, + "grad_norm": 2.311232805252075, + "learning_rate": 5e-06, + "loss": 0.7115, + "mean_token_accuracy": 0.7612143754959106, + "num_tokens": 684629755.0, + "step": 26460 + }, + { + "epoch": 2.9058862288600924, + "grad_norm": 2.2220537662506104, + "learning_rate": 5e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7411075830459595, + "num_tokens": 684655690.0, + "step": 26461 + }, + { + "epoch": 2.9059960465627057, + "grad_norm": 1.9341121912002563, + "learning_rate": 5e-06, + "loss": 0.776, + "mean_token_accuracy": 0.7461850047111511, + "num_tokens": 684683986.0, + "step": 26462 + }, + { + "epoch": 2.9061058642653195, + "grad_norm": 2.0304322242736816, + "learning_rate": 5e-06, + "loss": 0.7435, + "mean_token_accuracy": 0.7618162631988525, + "num_tokens": 684711925.0, + "step": 26463 + }, + { + "epoch": 2.9062156819679332, + "grad_norm": 2.06278657913208, + "learning_rate": 5e-06, + "loss": 0.7962, + "mean_token_accuracy": 0.7395892143249512, + "num_tokens": 684739745.0, + "step": 26464 + }, + { + "epoch": 2.906325499670547, + "grad_norm": 1.9553332328796387, + "learning_rate": 5e-06, + "loss": 0.6843, + "mean_token_accuracy": 0.7723550796508789, + "num_tokens": 684766419.0, + "step": 26465 + }, + { + "epoch": 2.9064353173731607, + "grad_norm": 2.4707322120666504, + "learning_rate": 5e-06, + "loss": 0.6844, + "mean_token_accuracy": 0.7742375135421753, + "num_tokens": 684783304.0, + "step": 26466 + }, + { + "epoch": 2.906545135075774, + "grad_norm": 2.064387798309326, + "learning_rate": 5e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7672978043556213, + "num_tokens": 684808464.0, + "step": 26467 + }, + { + "epoch": 2.906654952778388, + "grad_norm": 2.1068923473358154, + "learning_rate": 5e-06, + "loss": 0.7029, + "mean_token_accuracy": 0.7608286142349243, + "num_tokens": 684832596.0, + "step": 26468 + }, + { + "epoch": 2.9067647704810016, + "grad_norm": 2.0634374618530273, + "learning_rate": 5e-06, + "loss": 0.5977, + "mean_token_accuracy": 0.7962560057640076, + "num_tokens": 684857403.0, + "step": 26469 + }, + { + "epoch": 2.9068745881836153, + "grad_norm": 1.918570637702942, + "learning_rate": 5e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7554171681404114, + "num_tokens": 684885670.0, + "step": 26470 + }, + { + "epoch": 2.906984405886229, + "grad_norm": 2.166074514389038, + "learning_rate": 5e-06, + "loss": 0.7585, + "mean_token_accuracy": 0.7494763731956482, + "num_tokens": 684912432.0, + "step": 26471 + }, + { + "epoch": 2.9070942235888424, + "grad_norm": 1.9184973239898682, + "learning_rate": 5e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7548689842224121, + "num_tokens": 684942107.0, + "step": 26472 + }, + { + "epoch": 2.907204041291456, + "grad_norm": 2.01005220413208, + "learning_rate": 5e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7572545409202576, + "num_tokens": 684968164.0, + "step": 26473 + }, + { + "epoch": 2.90731385899407, + "grad_norm": 2.0132548809051514, + "learning_rate": 5e-06, + "loss": 0.7584, + "mean_token_accuracy": 0.7501856684684753, + "num_tokens": 684998790.0, + "step": 26474 + }, + { + "epoch": 2.9074236766966837, + "grad_norm": 2.0930190086364746, + "learning_rate": 5e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7497896552085876, + "num_tokens": 685024204.0, + "step": 26475 + }, + { + "epoch": 2.9075334943992974, + "grad_norm": 2.2521145343780518, + "learning_rate": 5e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7661422491073608, + "num_tokens": 685047135.0, + "step": 26476 + }, + { + "epoch": 2.9076433121019107, + "grad_norm": 1.9406551122665405, + "learning_rate": 5e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7490667104721069, + "num_tokens": 685074789.0, + "step": 26477 + }, + { + "epoch": 2.9077531298045245, + "grad_norm": 2.1487691402435303, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7526615858078003, + "num_tokens": 685100691.0, + "step": 26478 + }, + { + "epoch": 2.9078629475071383, + "grad_norm": 1.9352930784225464, + "learning_rate": 5e-06, + "loss": 0.7522, + "mean_token_accuracy": 0.7564237117767334, + "num_tokens": 685129207.0, + "step": 26479 + }, + { + "epoch": 2.9079727652097516, + "grad_norm": 1.8935354948043823, + "learning_rate": 5e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.741344153881073, + "num_tokens": 685157119.0, + "step": 26480 + }, + { + "epoch": 2.9080825829123653, + "grad_norm": 2.231154680252075, + "learning_rate": 5e-06, + "loss": 0.736, + "mean_token_accuracy": 0.765967071056366, + "num_tokens": 685179248.0, + "step": 26481 + }, + { + "epoch": 2.908192400614979, + "grad_norm": 1.9683942794799805, + "learning_rate": 5e-06, + "loss": 0.7654, + "mean_token_accuracy": 0.7470006942749023, + "num_tokens": 685208491.0, + "step": 26482 + }, + { + "epoch": 2.908302218317593, + "grad_norm": 1.9006659984588623, + "learning_rate": 5e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7562781572341919, + "num_tokens": 685237949.0, + "step": 26483 + }, + { + "epoch": 2.9084120360202066, + "grad_norm": 2.0839266777038574, + "learning_rate": 5e-06, + "loss": 0.686, + "mean_token_accuracy": 0.7770628929138184, + "num_tokens": 685261824.0, + "step": 26484 + }, + { + "epoch": 2.90852185372282, + "grad_norm": 1.7989953756332397, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7585827112197876, + "num_tokens": 685295314.0, + "step": 26485 + }, + { + "epoch": 2.9086316714254337, + "grad_norm": 2.0938093662261963, + "learning_rate": 5e-06, + "loss": 0.667, + "mean_token_accuracy": 0.7797175645828247, + "num_tokens": 685320361.0, + "step": 26486 + }, + { + "epoch": 2.9087414891280474, + "grad_norm": 2.394042491912842, + "learning_rate": 5e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7562337517738342, + "num_tokens": 685340811.0, + "step": 26487 + }, + { + "epoch": 2.908851306830661, + "grad_norm": 2.251396894454956, + "learning_rate": 5e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7619205713272095, + "num_tokens": 685363296.0, + "step": 26488 + }, + { + "epoch": 2.908961124533275, + "grad_norm": 1.930267572402954, + "learning_rate": 5e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7430108785629272, + "num_tokens": 685391994.0, + "step": 26489 + }, + { + "epoch": 2.9090709422358882, + "grad_norm": 2.083939790725708, + "learning_rate": 5e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7470645308494568, + "num_tokens": 685419149.0, + "step": 26490 + }, + { + "epoch": 2.909180759938502, + "grad_norm": 1.9465361833572388, + "learning_rate": 5e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7353259325027466, + "num_tokens": 685448597.0, + "step": 26491 + }, + { + "epoch": 2.9092905776411158, + "grad_norm": 1.967793345451355, + "learning_rate": 5e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.767605185508728, + "num_tokens": 685476483.0, + "step": 26492 + }, + { + "epoch": 2.9094003953437295, + "grad_norm": 1.9997859001159668, + "learning_rate": 5e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7385122776031494, + "num_tokens": 685506252.0, + "step": 26493 + }, + { + "epoch": 2.9095102130463433, + "grad_norm": 2.18735408782959, + "learning_rate": 5e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7701701521873474, + "num_tokens": 685528968.0, + "step": 26494 + }, + { + "epoch": 2.9096200307489566, + "grad_norm": 2.053657054901123, + "learning_rate": 5e-06, + "loss": 0.6461, + "mean_token_accuracy": 0.7831292152404785, + "num_tokens": 685555676.0, + "step": 26495 + }, + { + "epoch": 2.9097298484515703, + "grad_norm": 1.9812815189361572, + "learning_rate": 5e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7506896257400513, + "num_tokens": 685584618.0, + "step": 26496 + }, + { + "epoch": 2.909839666154184, + "grad_norm": 2.016913652420044, + "learning_rate": 5e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7657296061515808, + "num_tokens": 685610716.0, + "step": 26497 + }, + { + "epoch": 2.909949483856798, + "grad_norm": 1.717451572418213, + "learning_rate": 5e-06, + "loss": 0.722, + "mean_token_accuracy": 0.7586637139320374, + "num_tokens": 685644020.0, + "step": 26498 + }, + { + "epoch": 2.9100593015594116, + "grad_norm": 1.9392062425613403, + "learning_rate": 5e-06, + "loss": 0.6713, + "mean_token_accuracy": 0.7786116003990173, + "num_tokens": 685671574.0, + "step": 26499 + }, + { + "epoch": 2.910169119262025, + "grad_norm": 2.1501379013061523, + "learning_rate": 5e-06, + "loss": 0.5764, + "mean_token_accuracy": 0.8060824871063232, + "num_tokens": 685692518.0, + "step": 26500 + }, + { + "epoch": 2.9102789369646387, + "grad_norm": 2.4032156467437744, + "learning_rate": 5e-06, + "loss": 0.546, + "mean_token_accuracy": 0.8152220249176025, + "num_tokens": 685710179.0, + "step": 26501 + }, + { + "epoch": 2.9103887546672524, + "grad_norm": 2.052488327026367, + "learning_rate": 5e-06, + "loss": 0.716, + "mean_token_accuracy": 0.7581162452697754, + "num_tokens": 685737113.0, + "step": 26502 + }, + { + "epoch": 2.9104985723698658, + "grad_norm": 1.9692810773849487, + "learning_rate": 5e-06, + "loss": 0.6711, + "mean_token_accuracy": 0.7820276021957397, + "num_tokens": 685761982.0, + "step": 26503 + }, + { + "epoch": 2.91060839007248, + "grad_norm": 2.2954490184783936, + "learning_rate": 5e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7518811225891113, + "num_tokens": 685784701.0, + "step": 26504 + }, + { + "epoch": 2.9107182077750933, + "grad_norm": 2.1100358963012695, + "learning_rate": 5e-06, + "loss": 0.6726, + "mean_token_accuracy": 0.7776548266410828, + "num_tokens": 685808692.0, + "step": 26505 + }, + { + "epoch": 2.910828025477707, + "grad_norm": 1.793717384338379, + "learning_rate": 5e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7396003007888794, + "num_tokens": 685843875.0, + "step": 26506 + }, + { + "epoch": 2.910937843180321, + "grad_norm": 1.9453576803207397, + "learning_rate": 5e-06, + "loss": 0.7078, + "mean_token_accuracy": 0.7723086476325989, + "num_tokens": 685871006.0, + "step": 26507 + }, + { + "epoch": 2.911047660882934, + "grad_norm": 1.9498159885406494, + "learning_rate": 5e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7659921646118164, + "num_tokens": 685898644.0, + "step": 26508 + }, + { + "epoch": 2.911157478585548, + "grad_norm": 1.8571085929870605, + "learning_rate": 5e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7728425860404968, + "num_tokens": 685928481.0, + "step": 26509 + }, + { + "epoch": 2.9112672962881616, + "grad_norm": 1.9287382364273071, + "learning_rate": 5e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7524977922439575, + "num_tokens": 685955733.0, + "step": 26510 + }, + { + "epoch": 2.9113771139907754, + "grad_norm": 2.140399217605591, + "learning_rate": 5e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7709479331970215, + "num_tokens": 685979497.0, + "step": 26511 + }, + { + "epoch": 2.911486931693389, + "grad_norm": 2.223839044570923, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7546970844268799, + "num_tokens": 686004005.0, + "step": 26512 + }, + { + "epoch": 2.9115967493960024, + "grad_norm": 2.336060047149658, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7771936655044556, + "num_tokens": 686026313.0, + "step": 26513 + }, + { + "epoch": 2.911706567098616, + "grad_norm": 1.9857745170593262, + "learning_rate": 5e-06, + "loss": 0.6978, + "mean_token_accuracy": 0.764164388179779, + "num_tokens": 686052662.0, + "step": 26514 + }, + { + "epoch": 2.91181638480123, + "grad_norm": 2.1817009449005127, + "learning_rate": 5e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7594119906425476, + "num_tokens": 686077154.0, + "step": 26515 + }, + { + "epoch": 2.9119262025038437, + "grad_norm": 2.0082104206085205, + "learning_rate": 5e-06, + "loss": 0.6935, + "mean_token_accuracy": 0.7729281187057495, + "num_tokens": 686104475.0, + "step": 26516 + }, + { + "epoch": 2.9120360202064575, + "grad_norm": 2.033158779144287, + "learning_rate": 5e-06, + "loss": 0.6534, + "mean_token_accuracy": 0.7849127054214478, + "num_tokens": 686129777.0, + "step": 26517 + }, + { + "epoch": 2.912145837909071, + "grad_norm": 2.2187395095825195, + "learning_rate": 5e-06, + "loss": 0.6913, + "mean_token_accuracy": 0.7740682363510132, + "num_tokens": 686150689.0, + "step": 26518 + }, + { + "epoch": 2.9122556556116845, + "grad_norm": 1.8036761283874512, + "learning_rate": 5e-06, + "loss": 0.7102, + "mean_token_accuracy": 0.7658947706222534, + "num_tokens": 686181244.0, + "step": 26519 + }, + { + "epoch": 2.9123654733142983, + "grad_norm": 2.2256150245666504, + "learning_rate": 5e-06, + "loss": 0.7644, + "mean_token_accuracy": 0.7552006244659424, + "num_tokens": 686206770.0, + "step": 26520 + }, + { + "epoch": 2.912475291016912, + "grad_norm": 2.140509843826294, + "learning_rate": 5e-06, + "loss": 0.6124, + "mean_token_accuracy": 0.7943629026412964, + "num_tokens": 686229386.0, + "step": 26521 + }, + { + "epoch": 2.912585108719526, + "grad_norm": 2.2806644439697266, + "learning_rate": 5e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.745287299156189, + "num_tokens": 686255128.0, + "step": 26522 + }, + { + "epoch": 2.912694926422139, + "grad_norm": 1.9405326843261719, + "learning_rate": 5e-06, + "loss": 0.7094, + "mean_token_accuracy": 0.7777865529060364, + "num_tokens": 686285706.0, + "step": 26523 + }, + { + "epoch": 2.912804744124753, + "grad_norm": 2.2021002769470215, + "learning_rate": 5e-06, + "loss": 0.7057, + "mean_token_accuracy": 0.771902322769165, + "num_tokens": 686308352.0, + "step": 26524 + }, + { + "epoch": 2.9129145618273666, + "grad_norm": 2.162773370742798, + "learning_rate": 5e-06, + "loss": 0.6661, + "mean_token_accuracy": 0.781977653503418, + "num_tokens": 686332079.0, + "step": 26525 + }, + { + "epoch": 2.9130243795299804, + "grad_norm": 2.1043355464935303, + "learning_rate": 5e-06, + "loss": 0.699, + "mean_token_accuracy": 0.7717888355255127, + "num_tokens": 686354565.0, + "step": 26526 + }, + { + "epoch": 2.913134197232594, + "grad_norm": 2.245028018951416, + "learning_rate": 5e-06, + "loss": 0.6785, + "mean_token_accuracy": 0.769533097743988, + "num_tokens": 686375893.0, + "step": 26527 + }, + { + "epoch": 2.9132440149352075, + "grad_norm": 2.252424955368042, + "learning_rate": 5e-06, + "loss": 0.6003, + "mean_token_accuracy": 0.7973803877830505, + "num_tokens": 686397110.0, + "step": 26528 + }, + { + "epoch": 2.913353832637821, + "grad_norm": 1.9546691179275513, + "learning_rate": 5e-06, + "loss": 0.7138, + "mean_token_accuracy": 0.7819889187812805, + "num_tokens": 686424505.0, + "step": 26529 + }, + { + "epoch": 2.913463650340435, + "grad_norm": 2.5894575119018555, + "learning_rate": 5e-06, + "loss": 0.6377, + "mean_token_accuracy": 0.782105028629303, + "num_tokens": 686441845.0, + "step": 26530 + }, + { + "epoch": 2.9135734680430483, + "grad_norm": 1.8326035737991333, + "learning_rate": 5e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7462546825408936, + "num_tokens": 686473059.0, + "step": 26531 + }, + { + "epoch": 2.913683285745662, + "grad_norm": 2.4217021465301514, + "learning_rate": 5e-06, + "loss": 0.7028, + "mean_token_accuracy": 0.7668309211730957, + "num_tokens": 686493222.0, + "step": 26532 + }, + { + "epoch": 2.913793103448276, + "grad_norm": 2.166417360305786, + "learning_rate": 5e-06, + "loss": 0.6673, + "mean_token_accuracy": 0.7767172455787659, + "num_tokens": 686517089.0, + "step": 26533 + }, + { + "epoch": 2.9139029211508896, + "grad_norm": 2.148120880126953, + "learning_rate": 5e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7519845366477966, + "num_tokens": 686542755.0, + "step": 26534 + }, + { + "epoch": 2.9140127388535033, + "grad_norm": 2.1906211376190186, + "learning_rate": 5e-06, + "loss": 0.6819, + "mean_token_accuracy": 0.7720155715942383, + "num_tokens": 686564023.0, + "step": 26535 + }, + { + "epoch": 2.9141225565561166, + "grad_norm": 2.0768260955810547, + "learning_rate": 5e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.755942702293396, + "num_tokens": 686589621.0, + "step": 26536 + }, + { + "epoch": 2.9142323742587304, + "grad_norm": 2.057659149169922, + "learning_rate": 5e-06, + "loss": 0.78, + "mean_token_accuracy": 0.7469512820243835, + "num_tokens": 686617584.0, + "step": 26537 + }, + { + "epoch": 2.914342191961344, + "grad_norm": 1.908782958984375, + "learning_rate": 5e-06, + "loss": 0.7466, + "mean_token_accuracy": 0.7559224367141724, + "num_tokens": 686646000.0, + "step": 26538 + }, + { + "epoch": 2.914452009663958, + "grad_norm": 2.2388505935668945, + "learning_rate": 5e-06, + "loss": 0.6369, + "mean_token_accuracy": 0.7912293672561646, + "num_tokens": 686665264.0, + "step": 26539 + }, + { + "epoch": 2.9145618273665717, + "grad_norm": 1.9093364477157593, + "learning_rate": 5e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7454333305358887, + "num_tokens": 686693967.0, + "step": 26540 + }, + { + "epoch": 2.914671645069185, + "grad_norm": 1.982772946357727, + "learning_rate": 5e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7620909214019775, + "num_tokens": 686722169.0, + "step": 26541 + }, + { + "epoch": 2.9147814627717987, + "grad_norm": 2.030893325805664, + "learning_rate": 5e-06, + "loss": 0.773, + "mean_token_accuracy": 0.749001145362854, + "num_tokens": 686748331.0, + "step": 26542 + }, + { + "epoch": 2.9148912804744125, + "grad_norm": 2.3674895763397217, + "learning_rate": 5e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7452073097229004, + "num_tokens": 686770361.0, + "step": 26543 + }, + { + "epoch": 2.9150010981770262, + "grad_norm": 2.2670915126800537, + "learning_rate": 5e-06, + "loss": 0.6512, + "mean_token_accuracy": 0.7933223247528076, + "num_tokens": 686793225.0, + "step": 26544 + }, + { + "epoch": 2.91511091587964, + "grad_norm": 1.8098480701446533, + "learning_rate": 5e-06, + "loss": 0.7296, + "mean_token_accuracy": 0.7594183683395386, + "num_tokens": 686824549.0, + "step": 26545 + }, + { + "epoch": 2.9152207335822533, + "grad_norm": 2.2063887119293213, + "learning_rate": 5e-06, + "loss": 0.7116, + "mean_token_accuracy": 0.7702770829200745, + "num_tokens": 686849238.0, + "step": 26546 + }, + { + "epoch": 2.915330551284867, + "grad_norm": 2.2096476554870605, + "learning_rate": 5e-06, + "loss": 0.6921, + "mean_token_accuracy": 0.7662150263786316, + "num_tokens": 686871520.0, + "step": 26547 + }, + { + "epoch": 2.915440368987481, + "grad_norm": 2.1816203594207764, + "learning_rate": 5e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7718878388404846, + "num_tokens": 686893940.0, + "step": 26548 + }, + { + "epoch": 2.9155501866900946, + "grad_norm": 1.9556105136871338, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7507126927375793, + "num_tokens": 686920236.0, + "step": 26549 + }, + { + "epoch": 2.9156600043927083, + "grad_norm": 2.2973573207855225, + "learning_rate": 5e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7451034784317017, + "num_tokens": 686943467.0, + "step": 26550 + }, + { + "epoch": 2.9157698220953217, + "grad_norm": 2.0924551486968994, + "learning_rate": 5e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7723173499107361, + "num_tokens": 686968046.0, + "step": 26551 + }, + { + "epoch": 2.9158796397979354, + "grad_norm": 2.1648473739624023, + "learning_rate": 5e-06, + "loss": 0.692, + "mean_token_accuracy": 0.775886058807373, + "num_tokens": 686990738.0, + "step": 26552 + }, + { + "epoch": 2.915989457500549, + "grad_norm": 1.9655368328094482, + "learning_rate": 5e-06, + "loss": 0.759, + "mean_token_accuracy": 0.7577627301216125, + "num_tokens": 687019449.0, + "step": 26553 + }, + { + "epoch": 2.9160992752031625, + "grad_norm": 1.9644789695739746, + "learning_rate": 5e-06, + "loss": 0.7096, + "mean_token_accuracy": 0.767227053642273, + "num_tokens": 687046214.0, + "step": 26554 + }, + { + "epoch": 2.9162090929057767, + "grad_norm": 2.1663663387298584, + "learning_rate": 5e-06, + "loss": 0.7145, + "mean_token_accuracy": 0.7634374499320984, + "num_tokens": 687070741.0, + "step": 26555 + }, + { + "epoch": 2.91631891060839, + "grad_norm": 2.3147146701812744, + "learning_rate": 5e-06, + "loss": 0.6954, + "mean_token_accuracy": 0.7781285643577576, + "num_tokens": 687092035.0, + "step": 26556 + }, + { + "epoch": 2.9164287283110037, + "grad_norm": 1.757343053817749, + "learning_rate": 5e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.7604475021362305, + "num_tokens": 687125265.0, + "step": 26557 + }, + { + "epoch": 2.9165385460136175, + "grad_norm": 2.327993392944336, + "learning_rate": 5e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.7739936113357544, + "num_tokens": 687146869.0, + "step": 26558 + }, + { + "epoch": 2.916648363716231, + "grad_norm": 2.0423483848571777, + "learning_rate": 5e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.7731878757476807, + "num_tokens": 687172614.0, + "step": 26559 + }, + { + "epoch": 2.9167581814188446, + "grad_norm": 2.060940980911255, + "learning_rate": 5e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7349290251731873, + "num_tokens": 687199143.0, + "step": 26560 + }, + { + "epoch": 2.9168679991214583, + "grad_norm": 2.3398165702819824, + "learning_rate": 5e-06, + "loss": 0.7214, + "mean_token_accuracy": 0.7676454782485962, + "num_tokens": 687220961.0, + "step": 26561 + }, + { + "epoch": 2.916977816824072, + "grad_norm": 2.0553181171417236, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7593674659729004, + "num_tokens": 687245221.0, + "step": 26562 + }, + { + "epoch": 2.917087634526686, + "grad_norm": 2.363765001296997, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7728444337844849, + "num_tokens": 687266497.0, + "step": 26563 + }, + { + "epoch": 2.917197452229299, + "grad_norm": 2.31890869140625, + "learning_rate": 5e-06, + "loss": 0.6929, + "mean_token_accuracy": 0.7696887850761414, + "num_tokens": 687287514.0, + "step": 26564 + }, + { + "epoch": 2.917307269931913, + "grad_norm": 1.9308644533157349, + "learning_rate": 5e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.736629843711853, + "num_tokens": 687314771.0, + "step": 26565 + }, + { + "epoch": 2.9174170876345267, + "grad_norm": 2.089200735092163, + "learning_rate": 5e-06, + "loss": 0.7087, + "mean_token_accuracy": 0.7757264971733093, + "num_tokens": 687339706.0, + "step": 26566 + }, + { + "epoch": 2.9175269053371404, + "grad_norm": 1.8031527996063232, + "learning_rate": 5e-06, + "loss": 0.7687, + "mean_token_accuracy": 0.7475607395172119, + "num_tokens": 687370476.0, + "step": 26567 + }, + { + "epoch": 2.917636723039754, + "grad_norm": 2.388237953186035, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7709087133407593, + "num_tokens": 687390937.0, + "step": 26568 + }, + { + "epoch": 2.9177465407423675, + "grad_norm": 2.1643879413604736, + "learning_rate": 5e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.7848142385482788, + "num_tokens": 687413002.0, + "step": 26569 + }, + { + "epoch": 2.9178563584449813, + "grad_norm": 2.2210347652435303, + "learning_rate": 5e-06, + "loss": 0.7158, + "mean_token_accuracy": 0.7747248411178589, + "num_tokens": 687436130.0, + "step": 26570 + }, + { + "epoch": 2.917966176147595, + "grad_norm": 2.490260601043701, + "learning_rate": 5e-06, + "loss": 0.7048, + "mean_token_accuracy": 0.7765810489654541, + "num_tokens": 687455561.0, + "step": 26571 + }, + { + "epoch": 2.9180759938502088, + "grad_norm": 2.077366590499878, + "learning_rate": 5e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7614932656288147, + "num_tokens": 687483427.0, + "step": 26572 + }, + { + "epoch": 2.9181858115528225, + "grad_norm": 2.0453898906707764, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7648036479949951, + "num_tokens": 687509248.0, + "step": 26573 + }, + { + "epoch": 2.918295629255436, + "grad_norm": 2.3141238689422607, + "learning_rate": 5e-06, + "loss": 0.655, + "mean_token_accuracy": 0.783562183380127, + "num_tokens": 687532293.0, + "step": 26574 + }, + { + "epoch": 2.9184054469580496, + "grad_norm": 2.0315401554107666, + "learning_rate": 5e-06, + "loss": 0.736, + "mean_token_accuracy": 0.7568029761314392, + "num_tokens": 687557448.0, + "step": 26575 + }, + { + "epoch": 2.9185152646606634, + "grad_norm": 2.129878282546997, + "learning_rate": 5e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7574033737182617, + "num_tokens": 687582932.0, + "step": 26576 + }, + { + "epoch": 2.918625082363277, + "grad_norm": 1.9641495943069458, + "learning_rate": 5e-06, + "loss": 0.7918, + "mean_token_accuracy": 0.7357642650604248, + "num_tokens": 687614840.0, + "step": 26577 + }, + { + "epoch": 2.918734900065891, + "grad_norm": 2.0835752487182617, + "learning_rate": 5e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.7862056493759155, + "num_tokens": 687639896.0, + "step": 26578 + }, + { + "epoch": 2.918844717768504, + "grad_norm": 2.077589988708496, + "learning_rate": 5e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7687785029411316, + "num_tokens": 687665191.0, + "step": 26579 + }, + { + "epoch": 2.918954535471118, + "grad_norm": 2.0319418907165527, + "learning_rate": 5e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7522672414779663, + "num_tokens": 687696403.0, + "step": 26580 + }, + { + "epoch": 2.9190643531737317, + "grad_norm": 2.1411259174346924, + "learning_rate": 5e-06, + "loss": 0.6997, + "mean_token_accuracy": 0.7646811008453369, + "num_tokens": 687719646.0, + "step": 26581 + }, + { + "epoch": 2.919174170876345, + "grad_norm": 2.380988836288452, + "learning_rate": 5e-06, + "loss": 0.6354, + "mean_token_accuracy": 0.782518208026886, + "num_tokens": 687737951.0, + "step": 26582 + }, + { + "epoch": 2.9192839885789588, + "grad_norm": 1.974250316619873, + "learning_rate": 5e-06, + "loss": 0.7297, + "mean_token_accuracy": 0.7641169428825378, + "num_tokens": 687765008.0, + "step": 26583 + }, + { + "epoch": 2.9193938062815725, + "grad_norm": 2.0565598011016846, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7653619050979614, + "num_tokens": 687787982.0, + "step": 26584 + }, + { + "epoch": 2.9195036239841863, + "grad_norm": 1.9528306722640991, + "learning_rate": 5e-06, + "loss": 0.7878, + "mean_token_accuracy": 0.7445968389511108, + "num_tokens": 687818008.0, + "step": 26585 + }, + { + "epoch": 2.9196134416868, + "grad_norm": 1.8845294713974, + "learning_rate": 5e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7524204254150391, + "num_tokens": 687848032.0, + "step": 26586 + }, + { + "epoch": 2.9197232593894134, + "grad_norm": 1.8303848505020142, + "learning_rate": 5e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7536134719848633, + "num_tokens": 687880300.0, + "step": 26587 + }, + { + "epoch": 2.919833077092027, + "grad_norm": 2.070148229598999, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7643482685089111, + "num_tokens": 687905960.0, + "step": 26588 + }, + { + "epoch": 2.919942894794641, + "grad_norm": 2.2168080806732178, + "learning_rate": 5e-06, + "loss": 0.6988, + "mean_token_accuracy": 0.7706162333488464, + "num_tokens": 687928070.0, + "step": 26589 + }, + { + "epoch": 2.9200527124972546, + "grad_norm": 2.254380941390991, + "learning_rate": 5e-06, + "loss": 0.6386, + "mean_token_accuracy": 0.7869143486022949, + "num_tokens": 687949205.0, + "step": 26590 + }, + { + "epoch": 2.9201625301998684, + "grad_norm": 2.1042275428771973, + "learning_rate": 5e-06, + "loss": 0.7561, + "mean_token_accuracy": 0.7537322640419006, + "num_tokens": 687974118.0, + "step": 26591 + }, + { + "epoch": 2.9202723479024817, + "grad_norm": 1.938301920890808, + "learning_rate": 5e-06, + "loss": 0.7331, + "mean_token_accuracy": 0.7568030953407288, + "num_tokens": 688004482.0, + "step": 26592 + }, + { + "epoch": 2.9203821656050954, + "grad_norm": 2.306814432144165, + "learning_rate": 5e-06, + "loss": 0.6239, + "mean_token_accuracy": 0.7894412875175476, + "num_tokens": 688027166.0, + "step": 26593 + }, + { + "epoch": 2.920491983307709, + "grad_norm": 2.0390985012054443, + "learning_rate": 5e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7561190724372864, + "num_tokens": 688058706.0, + "step": 26594 + }, + { + "epoch": 2.920601801010323, + "grad_norm": 1.976853370666504, + "learning_rate": 5e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7550212144851685, + "num_tokens": 688086386.0, + "step": 26595 + }, + { + "epoch": 2.9207116187129367, + "grad_norm": 2.0996475219726562, + "learning_rate": 5e-06, + "loss": 0.6865, + "mean_token_accuracy": 0.7749361991882324, + "num_tokens": 688109309.0, + "step": 26596 + }, + { + "epoch": 2.92082143641555, + "grad_norm": 2.223139524459839, + "learning_rate": 5e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7670363187789917, + "num_tokens": 688131281.0, + "step": 26597 + }, + { + "epoch": 2.920931254118164, + "grad_norm": 2.018505334854126, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7549219131469727, + "num_tokens": 688157317.0, + "step": 26598 + }, + { + "epoch": 2.9210410718207775, + "grad_norm": 1.9579877853393555, + "learning_rate": 5e-06, + "loss": 0.7218, + "mean_token_accuracy": 0.7590672373771667, + "num_tokens": 688183596.0, + "step": 26599 + }, + { + "epoch": 2.9211508895233913, + "grad_norm": 2.0506865978240967, + "learning_rate": 5e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7589707374572754, + "num_tokens": 688207627.0, + "step": 26600 + }, + { + "epoch": 2.921260707226005, + "grad_norm": 2.067525863647461, + "learning_rate": 5e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7593190670013428, + "num_tokens": 688234592.0, + "step": 26601 + }, + { + "epoch": 2.9213705249286184, + "grad_norm": 2.1748642921447754, + "learning_rate": 5e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.7732135653495789, + "num_tokens": 688257192.0, + "step": 26602 + }, + { + "epoch": 2.921480342631232, + "grad_norm": 1.9096665382385254, + "learning_rate": 5e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7555224895477295, + "num_tokens": 688284975.0, + "step": 26603 + }, + { + "epoch": 2.921590160333846, + "grad_norm": 1.9579441547393799, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.749717116355896, + "num_tokens": 688312979.0, + "step": 26604 + }, + { + "epoch": 2.921699978036459, + "grad_norm": 1.9756290912628174, + "learning_rate": 5e-06, + "loss": 0.6764, + "mean_token_accuracy": 0.7725677490234375, + "num_tokens": 688341247.0, + "step": 26605 + }, + { + "epoch": 2.9218097957390734, + "grad_norm": 1.9104483127593994, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7603148221969604, + "num_tokens": 688369835.0, + "step": 26606 + }, + { + "epoch": 2.9219196134416867, + "grad_norm": 2.14150333404541, + "learning_rate": 5e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.747407853603363, + "num_tokens": 688394835.0, + "step": 26607 + }, + { + "epoch": 2.9220294311443005, + "grad_norm": 2.0333592891693115, + "learning_rate": 5e-06, + "loss": 0.6789, + "mean_token_accuracy": 0.7683478593826294, + "num_tokens": 688418172.0, + "step": 26608 + }, + { + "epoch": 2.9221392488469142, + "grad_norm": 1.8883882761001587, + "learning_rate": 5e-06, + "loss": 0.6557, + "mean_token_accuracy": 0.7778524160385132, + "num_tokens": 688445558.0, + "step": 26609 + }, + { + "epoch": 2.9222490665495275, + "grad_norm": 1.9537545442581177, + "learning_rate": 5e-06, + "loss": 0.784, + "mean_token_accuracy": 0.7471422553062439, + "num_tokens": 688475964.0, + "step": 26610 + }, + { + "epoch": 2.9223588842521413, + "grad_norm": 2.2064597606658936, + "learning_rate": 5e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.7555376291275024, + "num_tokens": 688500209.0, + "step": 26611 + }, + { + "epoch": 2.922468701954755, + "grad_norm": 1.928298830986023, + "learning_rate": 5e-06, + "loss": 0.6581, + "mean_token_accuracy": 0.7804652452468872, + "num_tokens": 688526921.0, + "step": 26612 + }, + { + "epoch": 2.922578519657369, + "grad_norm": 1.9868969917297363, + "learning_rate": 5e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.7620422840118408, + "num_tokens": 688552930.0, + "step": 26613 + }, + { + "epoch": 2.9226883373599826, + "grad_norm": 1.9486446380615234, + "learning_rate": 5e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7579125165939331, + "num_tokens": 688579212.0, + "step": 26614 + }, + { + "epoch": 2.922798155062596, + "grad_norm": 1.7018150091171265, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7523646354675293, + "num_tokens": 688612218.0, + "step": 26615 + }, + { + "epoch": 2.9229079727652096, + "grad_norm": 2.110757827758789, + "learning_rate": 5e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7452478408813477, + "num_tokens": 688638057.0, + "step": 26616 + }, + { + "epoch": 2.9230177904678234, + "grad_norm": 1.9762318134307861, + "learning_rate": 5e-06, + "loss": 0.7393, + "mean_token_accuracy": 0.7579504251480103, + "num_tokens": 688668253.0, + "step": 26617 + }, + { + "epoch": 2.923127608170437, + "grad_norm": 1.8046091794967651, + "learning_rate": 5e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7412126064300537, + "num_tokens": 688700289.0, + "step": 26618 + }, + { + "epoch": 2.923237425873051, + "grad_norm": 2.090308666229248, + "learning_rate": 5e-06, + "loss": 0.7069, + "mean_token_accuracy": 0.7679055333137512, + "num_tokens": 688727342.0, + "step": 26619 + }, + { + "epoch": 2.9233472435756642, + "grad_norm": 1.8784891366958618, + "learning_rate": 5e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7420183420181274, + "num_tokens": 688756232.0, + "step": 26620 + }, + { + "epoch": 2.923457061278278, + "grad_norm": 2.1558408737182617, + "learning_rate": 5e-06, + "loss": 0.739, + "mean_token_accuracy": 0.7554702758789062, + "num_tokens": 688781858.0, + "step": 26621 + }, + { + "epoch": 2.9235668789808917, + "grad_norm": 2.137718439102173, + "learning_rate": 5e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7745393514633179, + "num_tokens": 688804825.0, + "step": 26622 + }, + { + "epoch": 2.9236766966835055, + "grad_norm": 2.171215534210205, + "learning_rate": 5e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.7678880095481873, + "num_tokens": 688826976.0, + "step": 26623 + }, + { + "epoch": 2.9237865143861193, + "grad_norm": 2.1936464309692383, + "learning_rate": 5e-06, + "loss": 0.658, + "mean_token_accuracy": 0.7791993618011475, + "num_tokens": 688848423.0, + "step": 26624 + }, + { + "epoch": 2.9238963320887326, + "grad_norm": 1.8448655605316162, + "learning_rate": 5e-06, + "loss": 0.7777, + "mean_token_accuracy": 0.7494561076164246, + "num_tokens": 688878267.0, + "step": 26625 + }, + { + "epoch": 2.9240061497913463, + "grad_norm": 2.2453622817993164, + "learning_rate": 5e-06, + "loss": 0.6963, + "mean_token_accuracy": 0.7730177640914917, + "num_tokens": 688900271.0, + "step": 26626 + }, + { + "epoch": 2.92411596749396, + "grad_norm": 2.0869312286376953, + "learning_rate": 5e-06, + "loss": 0.7388, + "mean_token_accuracy": 0.7534101009368896, + "num_tokens": 688927580.0, + "step": 26627 + }, + { + "epoch": 2.924225785196574, + "grad_norm": 2.043674945831299, + "learning_rate": 5e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7738454341888428, + "num_tokens": 688955211.0, + "step": 26628 + }, + { + "epoch": 2.9243356028991876, + "grad_norm": 2.2193076610565186, + "learning_rate": 5e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7673977613449097, + "num_tokens": 688976667.0, + "step": 26629 + }, + { + "epoch": 2.924445420601801, + "grad_norm": 2.2492763996124268, + "learning_rate": 5e-06, + "loss": 0.7119, + "mean_token_accuracy": 0.7768290042877197, + "num_tokens": 688998475.0, + "step": 26630 + }, + { + "epoch": 2.9245552383044147, + "grad_norm": 1.9823757410049438, + "learning_rate": 5e-06, + "loss": 0.748, + "mean_token_accuracy": 0.758084774017334, + "num_tokens": 689028259.0, + "step": 26631 + }, + { + "epoch": 2.9246650560070284, + "grad_norm": 2.1208114624023438, + "learning_rate": 5e-06, + "loss": 0.6206, + "mean_token_accuracy": 0.7864651679992676, + "num_tokens": 689050541.0, + "step": 26632 + }, + { + "epoch": 2.9247748737096417, + "grad_norm": 2.187284469604492, + "learning_rate": 5e-06, + "loss": 0.6459, + "mean_token_accuracy": 0.7837860584259033, + "num_tokens": 689072875.0, + "step": 26633 + }, + { + "epoch": 2.924884691412256, + "grad_norm": 2.337719678878784, + "learning_rate": 5e-06, + "loss": 0.6735, + "mean_token_accuracy": 0.77159184217453, + "num_tokens": 689094720.0, + "step": 26634 + }, + { + "epoch": 2.9249945091148692, + "grad_norm": 2.13908314704895, + "learning_rate": 5e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7512333393096924, + "num_tokens": 689119744.0, + "step": 26635 + }, + { + "epoch": 2.925104326817483, + "grad_norm": 2.240200996398926, + "learning_rate": 5e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.7866916060447693, + "num_tokens": 689142119.0, + "step": 26636 + }, + { + "epoch": 2.9252141445200968, + "grad_norm": 2.0013577938079834, + "learning_rate": 5e-06, + "loss": 0.7176, + "mean_token_accuracy": 0.7625154256820679, + "num_tokens": 689168266.0, + "step": 26637 + }, + { + "epoch": 2.92532396222271, + "grad_norm": 2.411186933517456, + "learning_rate": 5e-06, + "loss": 0.6794, + "mean_token_accuracy": 0.7753502130508423, + "num_tokens": 689187287.0, + "step": 26638 + }, + { + "epoch": 2.925433779925324, + "grad_norm": 2.1549642086029053, + "learning_rate": 5e-06, + "loss": 0.7379, + "mean_token_accuracy": 0.770524263381958, + "num_tokens": 689210422.0, + "step": 26639 + }, + { + "epoch": 2.9255435976279376, + "grad_norm": 2.0851829051971436, + "learning_rate": 5e-06, + "loss": 0.6539, + "mean_token_accuracy": 0.7809637784957886, + "num_tokens": 689232852.0, + "step": 26640 + }, + { + "epoch": 2.9256534153305513, + "grad_norm": 1.9766896963119507, + "learning_rate": 5e-06, + "loss": 0.7118, + "mean_token_accuracy": 0.768040657043457, + "num_tokens": 689257436.0, + "step": 26641 + }, + { + "epoch": 2.925763233033165, + "grad_norm": 2.1601529121398926, + "learning_rate": 5e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7617700099945068, + "num_tokens": 689281577.0, + "step": 26642 + }, + { + "epoch": 2.9258730507357784, + "grad_norm": 2.1336987018585205, + "learning_rate": 5e-06, + "loss": 0.6448, + "mean_token_accuracy": 0.7842628955841064, + "num_tokens": 689304102.0, + "step": 26643 + }, + { + "epoch": 2.925982868438392, + "grad_norm": 2.2934672832489014, + "learning_rate": 5e-06, + "loss": 0.7533, + "mean_token_accuracy": 0.75026535987854, + "num_tokens": 689327389.0, + "step": 26644 + }, + { + "epoch": 2.926092686141006, + "grad_norm": 2.0370352268218994, + "learning_rate": 5e-06, + "loss": 0.7137, + "mean_token_accuracy": 0.7681323885917664, + "num_tokens": 689353702.0, + "step": 26645 + }, + { + "epoch": 2.9262025038436197, + "grad_norm": 2.5476667881011963, + "learning_rate": 5e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7574923038482666, + "num_tokens": 689374143.0, + "step": 26646 + }, + { + "epoch": 2.9263123215462334, + "grad_norm": 1.9993239641189575, + "learning_rate": 5e-06, + "loss": 0.7341, + "mean_token_accuracy": 0.7571843266487122, + "num_tokens": 689403670.0, + "step": 26647 + }, + { + "epoch": 2.9264221392488468, + "grad_norm": 2.2019238471984863, + "learning_rate": 5e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7543621063232422, + "num_tokens": 689431473.0, + "step": 26648 + }, + { + "epoch": 2.9265319569514605, + "grad_norm": 2.1059412956237793, + "learning_rate": 5e-06, + "loss": 0.6928, + "mean_token_accuracy": 0.7722365260124207, + "num_tokens": 689455168.0, + "step": 26649 + }, + { + "epoch": 2.9266417746540743, + "grad_norm": 2.495732545852661, + "learning_rate": 5e-06, + "loss": 0.6601, + "mean_token_accuracy": 0.7866936922073364, + "num_tokens": 689474164.0, + "step": 26650 + }, + { + "epoch": 2.926751592356688, + "grad_norm": 1.8606858253479004, + "learning_rate": 5e-06, + "loss": 0.6761, + "mean_token_accuracy": 0.7793935537338257, + "num_tokens": 689503520.0, + "step": 26651 + }, + { + "epoch": 2.926861410059302, + "grad_norm": 2.2172021865844727, + "learning_rate": 5e-06, + "loss": 0.6915, + "mean_token_accuracy": 0.7683541774749756, + "num_tokens": 689527908.0, + "step": 26652 + }, + { + "epoch": 2.926971227761915, + "grad_norm": 2.105337142944336, + "learning_rate": 5e-06, + "loss": 0.6885, + "mean_token_accuracy": 0.7678302526473999, + "num_tokens": 689551458.0, + "step": 26653 + }, + { + "epoch": 2.927081045464529, + "grad_norm": 2.1367404460906982, + "learning_rate": 5e-06, + "loss": 0.7088, + "mean_token_accuracy": 0.7656574249267578, + "num_tokens": 689576253.0, + "step": 26654 + }, + { + "epoch": 2.9271908631671426, + "grad_norm": 1.918221116065979, + "learning_rate": 5e-06, + "loss": 0.6841, + "mean_token_accuracy": 0.7690063714981079, + "num_tokens": 689603724.0, + "step": 26655 + }, + { + "epoch": 2.9273006808697564, + "grad_norm": 2.0087316036224365, + "learning_rate": 5e-06, + "loss": 0.744, + "mean_token_accuracy": 0.7585188150405884, + "num_tokens": 689630467.0, + "step": 26656 + }, + { + "epoch": 2.92741049857237, + "grad_norm": 1.8735007047653198, + "learning_rate": 5e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.7553159594535828, + "num_tokens": 689658599.0, + "step": 26657 + }, + { + "epoch": 2.9275203162749834, + "grad_norm": 1.9681203365325928, + "learning_rate": 5e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.733790397644043, + "num_tokens": 689685907.0, + "step": 26658 + }, + { + "epoch": 2.927630133977597, + "grad_norm": 1.7963240146636963, + "learning_rate": 5e-06, + "loss": 0.6803, + "mean_token_accuracy": 0.7728797793388367, + "num_tokens": 689712748.0, + "step": 26659 + }, + { + "epoch": 2.927739951680211, + "grad_norm": 1.920879602432251, + "learning_rate": 5e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7390908002853394, + "num_tokens": 689740283.0, + "step": 26660 + }, + { + "epoch": 2.9278497693828243, + "grad_norm": 2.125673770904541, + "learning_rate": 5e-06, + "loss": 0.6831, + "mean_token_accuracy": 0.7725557684898376, + "num_tokens": 689763226.0, + "step": 26661 + }, + { + "epoch": 2.927959587085438, + "grad_norm": 1.9929630756378174, + "learning_rate": 5e-06, + "loss": 0.7408, + "mean_token_accuracy": 0.7582978010177612, + "num_tokens": 689789286.0, + "step": 26662 + }, + { + "epoch": 2.9280694047880518, + "grad_norm": 2.074735164642334, + "learning_rate": 5e-06, + "loss": 0.6556, + "mean_token_accuracy": 0.7824186086654663, + "num_tokens": 689813187.0, + "step": 26663 + }, + { + "epoch": 2.9281792224906655, + "grad_norm": 2.085559368133545, + "learning_rate": 5e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7372262477874756, + "num_tokens": 689838737.0, + "step": 26664 + }, + { + "epoch": 2.9282890401932793, + "grad_norm": 2.3033833503723145, + "learning_rate": 5e-06, + "loss": 0.6437, + "mean_token_accuracy": 0.7878302931785583, + "num_tokens": 689859680.0, + "step": 26665 + }, + { + "epoch": 2.9283988578958926, + "grad_norm": 2.0314369201660156, + "learning_rate": 5e-06, + "loss": 0.758, + "mean_token_accuracy": 0.7513411641120911, + "num_tokens": 689887794.0, + "step": 26666 + }, + { + "epoch": 2.9285086755985064, + "grad_norm": 2.0182414054870605, + "learning_rate": 5e-06, + "loss": 0.7361, + "mean_token_accuracy": 0.7593584060668945, + "num_tokens": 689914280.0, + "step": 26667 + }, + { + "epoch": 2.92861849330112, + "grad_norm": 1.9818432331085205, + "learning_rate": 5e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7436881065368652, + "num_tokens": 689944721.0, + "step": 26668 + }, + { + "epoch": 2.928728311003734, + "grad_norm": 1.8544367551803589, + "learning_rate": 5e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7448694705963135, + "num_tokens": 689975362.0, + "step": 26669 + }, + { + "epoch": 2.9288381287063476, + "grad_norm": 2.250882863998413, + "learning_rate": 5e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7728144526481628, + "num_tokens": 689997547.0, + "step": 26670 + }, + { + "epoch": 2.928947946408961, + "grad_norm": 1.9524009227752686, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7682952284812927, + "num_tokens": 690025929.0, + "step": 26671 + }, + { + "epoch": 2.9290577641115747, + "grad_norm": 2.020561456680298, + "learning_rate": 5e-06, + "loss": 0.6738, + "mean_token_accuracy": 0.7778232097625732, + "num_tokens": 690052526.0, + "step": 26672 + }, + { + "epoch": 2.9291675818141885, + "grad_norm": 1.9466571807861328, + "learning_rate": 5e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7221611738204956, + "num_tokens": 690083171.0, + "step": 26673 + }, + { + "epoch": 2.929277399516802, + "grad_norm": 2.132361888885498, + "learning_rate": 5e-06, + "loss": 0.6633, + "mean_token_accuracy": 0.7794493436813354, + "num_tokens": 690105415.0, + "step": 26674 + }, + { + "epoch": 2.929387217219416, + "grad_norm": 2.246713399887085, + "learning_rate": 5e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7629238963127136, + "num_tokens": 690128596.0, + "step": 26675 + }, + { + "epoch": 2.9294970349220293, + "grad_norm": 2.312932014465332, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7555882930755615, + "num_tokens": 690150871.0, + "step": 26676 + }, + { + "epoch": 2.929606852624643, + "grad_norm": 2.008521318435669, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7542397379875183, + "num_tokens": 690178700.0, + "step": 26677 + }, + { + "epoch": 2.929716670327257, + "grad_norm": 2.217064380645752, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7429057955741882, + "num_tokens": 690205438.0, + "step": 26678 + }, + { + "epoch": 2.9298264880298706, + "grad_norm": 2.307243824005127, + "learning_rate": 5e-06, + "loss": 0.6692, + "mean_token_accuracy": 0.7740707397460938, + "num_tokens": 690227674.0, + "step": 26679 + }, + { + "epoch": 2.9299363057324843, + "grad_norm": 2.0699872970581055, + "learning_rate": 5e-06, + "loss": 0.7258, + "mean_token_accuracy": 0.7617030143737793, + "num_tokens": 690252710.0, + "step": 26680 + }, + { + "epoch": 2.9300461234350976, + "grad_norm": 2.096405029296875, + "learning_rate": 5e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7735957503318787, + "num_tokens": 690276996.0, + "step": 26681 + }, + { + "epoch": 2.9301559411377114, + "grad_norm": 2.1336450576782227, + "learning_rate": 5e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7514777183532715, + "num_tokens": 690301680.0, + "step": 26682 + }, + { + "epoch": 2.930265758840325, + "grad_norm": 1.9237310886383057, + "learning_rate": 5e-06, + "loss": 0.7122, + "mean_token_accuracy": 0.7757033705711365, + "num_tokens": 690327796.0, + "step": 26683 + }, + { + "epoch": 2.9303755765429385, + "grad_norm": 2.0402321815490723, + "learning_rate": 5e-06, + "loss": 0.6168, + "mean_token_accuracy": 0.7945504188537598, + "num_tokens": 690351568.0, + "step": 26684 + }, + { + "epoch": 2.9304853942455527, + "grad_norm": 2.2577996253967285, + "learning_rate": 5e-06, + "loss": 0.6884, + "mean_token_accuracy": 0.7722086310386658, + "num_tokens": 690374183.0, + "step": 26685 + }, + { + "epoch": 2.930595211948166, + "grad_norm": 1.7897590398788452, + "learning_rate": 5e-06, + "loss": 0.764, + "mean_token_accuracy": 0.7515897154808044, + "num_tokens": 690407755.0, + "step": 26686 + }, + { + "epoch": 2.9307050296507797, + "grad_norm": 1.822371244430542, + "learning_rate": 5e-06, + "loss": 0.5822, + "mean_token_accuracy": 0.8085551857948303, + "num_tokens": 690436208.0, + "step": 26687 + }, + { + "epoch": 2.9308148473533935, + "grad_norm": 2.195080041885376, + "learning_rate": 5e-06, + "loss": 0.77, + "mean_token_accuracy": 0.7493358850479126, + "num_tokens": 690461096.0, + "step": 26688 + }, + { + "epoch": 2.930924665056007, + "grad_norm": 2.156306743621826, + "learning_rate": 5e-06, + "loss": 0.6822, + "mean_token_accuracy": 0.7730534672737122, + "num_tokens": 690485322.0, + "step": 26689 + }, + { + "epoch": 2.9310344827586206, + "grad_norm": 2.0352048873901367, + "learning_rate": 5e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7419443130493164, + "num_tokens": 690513213.0, + "step": 26690 + }, + { + "epoch": 2.9311443004612343, + "grad_norm": 2.0834157466888428, + "learning_rate": 5e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.764220654964447, + "num_tokens": 690539597.0, + "step": 26691 + }, + { + "epoch": 2.931254118163848, + "grad_norm": 2.3701670169830322, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7700906991958618, + "num_tokens": 690561165.0, + "step": 26692 + }, + { + "epoch": 2.931363935866462, + "grad_norm": 1.9746133089065552, + "learning_rate": 5e-06, + "loss": 0.6259, + "mean_token_accuracy": 0.791943371295929, + "num_tokens": 690586242.0, + "step": 26693 + }, + { + "epoch": 2.931473753569075, + "grad_norm": 1.9384037256240845, + "learning_rate": 5e-06, + "loss": 0.7181, + "mean_token_accuracy": 0.7655189037322998, + "num_tokens": 690614656.0, + "step": 26694 + }, + { + "epoch": 2.931583571271689, + "grad_norm": 2.0244741439819336, + "learning_rate": 5e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7584440112113953, + "num_tokens": 690640067.0, + "step": 26695 + }, + { + "epoch": 2.9316933889743026, + "grad_norm": 2.1587791442871094, + "learning_rate": 5e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7693006992340088, + "num_tokens": 690662041.0, + "step": 26696 + }, + { + "epoch": 2.9318032066769164, + "grad_norm": 2.3838696479797363, + "learning_rate": 5e-06, + "loss": 0.7056, + "mean_token_accuracy": 0.777114987373352, + "num_tokens": 690683097.0, + "step": 26697 + }, + { + "epoch": 2.93191302437953, + "grad_norm": 2.0605244636535645, + "learning_rate": 5e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7634915113449097, + "num_tokens": 690709909.0, + "step": 26698 + }, + { + "epoch": 2.9320228420821435, + "grad_norm": 1.9375746250152588, + "learning_rate": 5e-06, + "loss": 0.7317, + "mean_token_accuracy": 0.7610530853271484, + "num_tokens": 690736243.0, + "step": 26699 + }, + { + "epoch": 2.9321326597847572, + "grad_norm": 2.214535713195801, + "learning_rate": 5e-06, + "loss": 0.7234, + "mean_token_accuracy": 0.763462245464325, + "num_tokens": 690757740.0, + "step": 26700 + }, + { + "epoch": 2.932242477487371, + "grad_norm": 1.8760119676589966, + "learning_rate": 5e-06, + "loss": 0.7337, + "mean_token_accuracy": 0.7594777345657349, + "num_tokens": 690787211.0, + "step": 26701 + }, + { + "epoch": 2.9323522951899847, + "grad_norm": 1.9464665651321411, + "learning_rate": 5e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7470638751983643, + "num_tokens": 690819061.0, + "step": 26702 + }, + { + "epoch": 2.9324621128925985, + "grad_norm": 1.9130911827087402, + "learning_rate": 5e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7559852600097656, + "num_tokens": 690847974.0, + "step": 26703 + }, + { + "epoch": 2.932571930595212, + "grad_norm": 2.1823296546936035, + "learning_rate": 5e-06, + "loss": 0.6685, + "mean_token_accuracy": 0.7779958248138428, + "num_tokens": 690870427.0, + "step": 26704 + }, + { + "epoch": 2.9326817482978256, + "grad_norm": 1.9519942998886108, + "learning_rate": 5e-06, + "loss": 0.7448, + "mean_token_accuracy": 0.7648731470108032, + "num_tokens": 690897408.0, + "step": 26705 + }, + { + "epoch": 2.9327915660004393, + "grad_norm": 2.188443183898926, + "learning_rate": 5e-06, + "loss": 0.6603, + "mean_token_accuracy": 0.7834773063659668, + "num_tokens": 690920901.0, + "step": 26706 + }, + { + "epoch": 2.932901383703053, + "grad_norm": 2.0301544666290283, + "learning_rate": 5e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7283629775047302, + "num_tokens": 690949608.0, + "step": 26707 + }, + { + "epoch": 2.933011201405667, + "grad_norm": 2.0276331901550293, + "learning_rate": 5e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7560482621192932, + "num_tokens": 690975695.0, + "step": 26708 + }, + { + "epoch": 2.93312101910828, + "grad_norm": 2.0781307220458984, + "learning_rate": 5e-06, + "loss": 0.7406, + "mean_token_accuracy": 0.7635793685913086, + "num_tokens": 690999010.0, + "step": 26709 + }, + { + "epoch": 2.933230836810894, + "grad_norm": 2.3226096630096436, + "learning_rate": 5e-06, + "loss": 0.6013, + "mean_token_accuracy": 0.7970017194747925, + "num_tokens": 691018242.0, + "step": 26710 + }, + { + "epoch": 2.9333406545135077, + "grad_norm": 2.019986867904663, + "learning_rate": 5e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7711086869239807, + "num_tokens": 691046821.0, + "step": 26711 + }, + { + "epoch": 2.933450472216121, + "grad_norm": 2.203890323638916, + "learning_rate": 5e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7349880933761597, + "num_tokens": 691070933.0, + "step": 26712 + }, + { + "epoch": 2.9335602899187347, + "grad_norm": 1.9952075481414795, + "learning_rate": 5e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7414689660072327, + "num_tokens": 691098026.0, + "step": 26713 + }, + { + "epoch": 2.9336701076213485, + "grad_norm": 1.8680979013442993, + "learning_rate": 5e-06, + "loss": 0.7267, + "mean_token_accuracy": 0.7548768520355225, + "num_tokens": 691129089.0, + "step": 26714 + }, + { + "epoch": 2.9337799253239623, + "grad_norm": 1.8099873065948486, + "learning_rate": 5e-06, + "loss": 0.5994, + "mean_token_accuracy": 0.7963211536407471, + "num_tokens": 691157191.0, + "step": 26715 + }, + { + "epoch": 2.933889743026576, + "grad_norm": 2.422088146209717, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7680862545967102, + "num_tokens": 691177790.0, + "step": 26716 + }, + { + "epoch": 2.9339995607291893, + "grad_norm": 2.5502572059631348, + "learning_rate": 5e-06, + "loss": 0.6472, + "mean_token_accuracy": 0.7858132123947144, + "num_tokens": 691196438.0, + "step": 26717 + }, + { + "epoch": 2.934109378431803, + "grad_norm": 2.1275973320007324, + "learning_rate": 5e-06, + "loss": 0.7432, + "mean_token_accuracy": 0.7591537833213806, + "num_tokens": 691223008.0, + "step": 26718 + }, + { + "epoch": 2.934219196134417, + "grad_norm": 2.125933885574341, + "learning_rate": 5e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.742168664932251, + "num_tokens": 691249002.0, + "step": 26719 + }, + { + "epoch": 2.9343290138370306, + "grad_norm": 2.285770893096924, + "learning_rate": 5e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7605305910110474, + "num_tokens": 691272228.0, + "step": 26720 + }, + { + "epoch": 2.9344388315396444, + "grad_norm": 1.9425595998764038, + "learning_rate": 5e-06, + "loss": 0.6966, + "mean_token_accuracy": 0.7727899551391602, + "num_tokens": 691299861.0, + "step": 26721 + }, + { + "epoch": 2.9345486492422577, + "grad_norm": 2.3716163635253906, + "learning_rate": 5e-06, + "loss": 0.6551, + "mean_token_accuracy": 0.7801707983016968, + "num_tokens": 691319724.0, + "step": 26722 + }, + { + "epoch": 2.9346584669448714, + "grad_norm": 1.934239387512207, + "learning_rate": 5e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7445250749588013, + "num_tokens": 691348686.0, + "step": 26723 + }, + { + "epoch": 2.934768284647485, + "grad_norm": 2.0332000255584717, + "learning_rate": 5e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7503947019577026, + "num_tokens": 691375680.0, + "step": 26724 + }, + { + "epoch": 2.934878102350099, + "grad_norm": 2.1747584342956543, + "learning_rate": 5e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.762266218662262, + "num_tokens": 691401828.0, + "step": 26725 + }, + { + "epoch": 2.9349879200527127, + "grad_norm": 2.304263114929199, + "learning_rate": 5e-06, + "loss": 0.6025, + "mean_token_accuracy": 0.7978512048721313, + "num_tokens": 691420888.0, + "step": 26726 + }, + { + "epoch": 2.935097737755326, + "grad_norm": 2.011510133743286, + "learning_rate": 5e-06, + "loss": 0.72, + "mean_token_accuracy": 0.7606270909309387, + "num_tokens": 691446638.0, + "step": 26727 + }, + { + "epoch": 2.9352075554579398, + "grad_norm": 2.1848702430725098, + "learning_rate": 5e-06, + "loss": 0.679, + "mean_token_accuracy": 0.7730941772460938, + "num_tokens": 691469675.0, + "step": 26728 + }, + { + "epoch": 2.9353173731605535, + "grad_norm": 1.767815351486206, + "learning_rate": 5e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7509536743164062, + "num_tokens": 691503754.0, + "step": 26729 + }, + { + "epoch": 2.9354271908631673, + "grad_norm": 2.0828747749328613, + "learning_rate": 5e-06, + "loss": 0.7627, + "mean_token_accuracy": 0.7557883262634277, + "num_tokens": 691531718.0, + "step": 26730 + }, + { + "epoch": 2.935537008565781, + "grad_norm": 2.1424078941345215, + "learning_rate": 5e-06, + "loss": 0.7073, + "mean_token_accuracy": 0.7674782872200012, + "num_tokens": 691554994.0, + "step": 26731 + }, + { + "epoch": 2.9356468262683943, + "grad_norm": 2.099365711212158, + "learning_rate": 5e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7290923595428467, + "num_tokens": 691581998.0, + "step": 26732 + }, + { + "epoch": 2.935756643971008, + "grad_norm": 2.267155885696411, + "learning_rate": 5e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.746559739112854, + "num_tokens": 691603830.0, + "step": 26733 + }, + { + "epoch": 2.935866461673622, + "grad_norm": 1.8688175678253174, + "learning_rate": 5e-06, + "loss": 0.774, + "mean_token_accuracy": 0.7554909586906433, + "num_tokens": 691632427.0, + "step": 26734 + }, + { + "epoch": 2.935976279376235, + "grad_norm": 1.9602501392364502, + "learning_rate": 5e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.753859281539917, + "num_tokens": 691659257.0, + "step": 26735 + }, + { + "epoch": 2.9360860970788494, + "grad_norm": 2.4628617763519287, + "learning_rate": 5e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.7681834697723389, + "num_tokens": 691678736.0, + "step": 26736 + }, + { + "epoch": 2.9361959147814627, + "grad_norm": 2.022634506225586, + "learning_rate": 5e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7647908926010132, + "num_tokens": 691705207.0, + "step": 26737 + }, + { + "epoch": 2.9363057324840764, + "grad_norm": 2.1818764209747314, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7602726817131042, + "num_tokens": 691727246.0, + "step": 26738 + }, + { + "epoch": 2.93641555018669, + "grad_norm": 1.9383914470672607, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7554911971092224, + "num_tokens": 691756917.0, + "step": 26739 + }, + { + "epoch": 2.9365253678893035, + "grad_norm": 1.7181882858276367, + "learning_rate": 5e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7407695651054382, + "num_tokens": 691790655.0, + "step": 26740 + }, + { + "epoch": 2.9366351855919173, + "grad_norm": 2.0576043128967285, + "learning_rate": 5e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7548882961273193, + "num_tokens": 691815741.0, + "step": 26741 + }, + { + "epoch": 2.936745003294531, + "grad_norm": 1.8569060564041138, + "learning_rate": 5e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.763204038143158, + "num_tokens": 691847207.0, + "step": 26742 + }, + { + "epoch": 2.936854820997145, + "grad_norm": 2.1249821186065674, + "learning_rate": 5e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.745693564414978, + "num_tokens": 691872376.0, + "step": 26743 + }, + { + "epoch": 2.9369646386997585, + "grad_norm": 2.6526730060577393, + "learning_rate": 5e-06, + "loss": 0.6436, + "mean_token_accuracy": 0.785013735294342, + "num_tokens": 691889157.0, + "step": 26744 + }, + { + "epoch": 2.937074456402372, + "grad_norm": 2.3142409324645996, + "learning_rate": 5e-06, + "loss": 0.7001, + "mean_token_accuracy": 0.764020562171936, + "num_tokens": 691911615.0, + "step": 26745 + }, + { + "epoch": 2.9371842741049856, + "grad_norm": 2.1090574264526367, + "learning_rate": 5e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.7584515810012817, + "num_tokens": 691935280.0, + "step": 26746 + }, + { + "epoch": 2.9372940918075994, + "grad_norm": 2.072444200515747, + "learning_rate": 5e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7548959255218506, + "num_tokens": 691962314.0, + "step": 26747 + }, + { + "epoch": 2.937403909510213, + "grad_norm": 2.1887786388397217, + "learning_rate": 5e-06, + "loss": 0.6605, + "mean_token_accuracy": 0.7803936004638672, + "num_tokens": 691984239.0, + "step": 26748 + }, + { + "epoch": 2.937513727212827, + "grad_norm": 1.9757225513458252, + "learning_rate": 5e-06, + "loss": 0.6933, + "mean_token_accuracy": 0.7690591812133789, + "num_tokens": 692010572.0, + "step": 26749 + }, + { + "epoch": 2.93762354491544, + "grad_norm": 1.9744025468826294, + "learning_rate": 5e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7560458183288574, + "num_tokens": 692038730.0, + "step": 26750 + }, + { + "epoch": 2.937733362618054, + "grad_norm": 2.116842746734619, + "learning_rate": 5e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.7687587141990662, + "num_tokens": 692062015.0, + "step": 26751 + }, + { + "epoch": 2.9378431803206677, + "grad_norm": 2.1953136920928955, + "learning_rate": 5e-06, + "loss": 0.6807, + "mean_token_accuracy": 0.7819418907165527, + "num_tokens": 692084689.0, + "step": 26752 + }, + { + "epoch": 2.9379529980232815, + "grad_norm": 1.9779781103134155, + "learning_rate": 5e-06, + "loss": 0.7224, + "mean_token_accuracy": 0.7632535099983215, + "num_tokens": 692113993.0, + "step": 26753 + }, + { + "epoch": 2.9380628157258952, + "grad_norm": 1.9953994750976562, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.7559477090835571, + "num_tokens": 692140877.0, + "step": 26754 + }, + { + "epoch": 2.9381726334285085, + "grad_norm": 2.2433652877807617, + "learning_rate": 5e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.7708727121353149, + "num_tokens": 692164336.0, + "step": 26755 + }, + { + "epoch": 2.9382824511311223, + "grad_norm": 2.2189412117004395, + "learning_rate": 5e-06, + "loss": 0.7219, + "mean_token_accuracy": 0.7598909139633179, + "num_tokens": 692187559.0, + "step": 26756 + }, + { + "epoch": 2.938392268833736, + "grad_norm": 2.0183677673339844, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7633645534515381, + "num_tokens": 692214493.0, + "step": 26757 + }, + { + "epoch": 2.93850208653635, + "grad_norm": 2.0386815071105957, + "learning_rate": 5e-06, + "loss": 0.6234, + "mean_token_accuracy": 0.7871714234352112, + "num_tokens": 692239472.0, + "step": 26758 + }, + { + "epoch": 2.9386119042389636, + "grad_norm": 2.1016645431518555, + "learning_rate": 5e-06, + "loss": 0.7648, + "mean_token_accuracy": 0.752872109413147, + "num_tokens": 692266804.0, + "step": 26759 + }, + { + "epoch": 2.938721721941577, + "grad_norm": 1.9147895574569702, + "learning_rate": 5e-06, + "loss": 0.73, + "mean_token_accuracy": 0.7704092860221863, + "num_tokens": 692295024.0, + "step": 26760 + }, + { + "epoch": 2.9388315396441906, + "grad_norm": 2.2385404109954834, + "learning_rate": 5e-06, + "loss": 0.6801, + "mean_token_accuracy": 0.7794543504714966, + "num_tokens": 692320042.0, + "step": 26761 + }, + { + "epoch": 2.9389413573468044, + "grad_norm": 1.8656612634658813, + "learning_rate": 5e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7453535199165344, + "num_tokens": 692352286.0, + "step": 26762 + }, + { + "epoch": 2.9390511750494177, + "grad_norm": 2.1300852298736572, + "learning_rate": 5e-06, + "loss": 0.7579, + "mean_token_accuracy": 0.7541150450706482, + "num_tokens": 692377449.0, + "step": 26763 + }, + { + "epoch": 2.9391609927520315, + "grad_norm": 2.4691720008850098, + "learning_rate": 5e-06, + "loss": 0.6371, + "mean_token_accuracy": 0.7820249795913696, + "num_tokens": 692396911.0, + "step": 26764 + }, + { + "epoch": 2.939270810454645, + "grad_norm": 1.9909775257110596, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7561199069023132, + "num_tokens": 692425338.0, + "step": 26765 + }, + { + "epoch": 2.939380628157259, + "grad_norm": 2.2871780395507812, + "learning_rate": 5e-06, + "loss": 0.6625, + "mean_token_accuracy": 0.7777200937271118, + "num_tokens": 692447906.0, + "step": 26766 + }, + { + "epoch": 2.9394904458598727, + "grad_norm": 2.236715793609619, + "learning_rate": 5e-06, + "loss": 0.7519, + "mean_token_accuracy": 0.7639719247817993, + "num_tokens": 692473095.0, + "step": 26767 + }, + { + "epoch": 2.939600263562486, + "grad_norm": 2.487348794937134, + "learning_rate": 5e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7727236151695251, + "num_tokens": 692490462.0, + "step": 26768 + }, + { + "epoch": 2.9397100812651, + "grad_norm": 2.0267438888549805, + "learning_rate": 5e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7478477358818054, + "num_tokens": 692517500.0, + "step": 26769 + }, + { + "epoch": 2.9398198989677136, + "grad_norm": 2.035672903060913, + "learning_rate": 5e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7534748911857605, + "num_tokens": 692547632.0, + "step": 26770 + }, + { + "epoch": 2.9399297166703273, + "grad_norm": 2.2158379554748535, + "learning_rate": 5e-06, + "loss": 0.6917, + "mean_token_accuracy": 0.7768310904502869, + "num_tokens": 692569636.0, + "step": 26771 + }, + { + "epoch": 2.940039534372941, + "grad_norm": 2.052910804748535, + "learning_rate": 5e-06, + "loss": 0.6851, + "mean_token_accuracy": 0.7769525647163391, + "num_tokens": 692595855.0, + "step": 26772 + }, + { + "epoch": 2.9401493520755544, + "grad_norm": 2.503523111343384, + "learning_rate": 5e-06, + "loss": 0.7042, + "mean_token_accuracy": 0.7625271081924438, + "num_tokens": 692616408.0, + "step": 26773 + }, + { + "epoch": 2.940259169778168, + "grad_norm": 2.053696632385254, + "learning_rate": 5e-06, + "loss": 0.7277, + "mean_token_accuracy": 0.7571243643760681, + "num_tokens": 692642156.0, + "step": 26774 + }, + { + "epoch": 2.940368987480782, + "grad_norm": 2.457151174545288, + "learning_rate": 5e-06, + "loss": 0.6281, + "mean_token_accuracy": 0.7887061238288879, + "num_tokens": 692661361.0, + "step": 26775 + }, + { + "epoch": 2.9404788051833957, + "grad_norm": 2.036221981048584, + "learning_rate": 5e-06, + "loss": 0.6755, + "mean_token_accuracy": 0.7728142738342285, + "num_tokens": 692686388.0, + "step": 26776 + }, + { + "epoch": 2.9405886228860094, + "grad_norm": 2.2568917274475098, + "learning_rate": 5e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7539818286895752, + "num_tokens": 692708097.0, + "step": 26777 + }, + { + "epoch": 2.9406984405886227, + "grad_norm": 2.082526922225952, + "learning_rate": 5e-06, + "loss": 0.6937, + "mean_token_accuracy": 0.7806347608566284, + "num_tokens": 692731357.0, + "step": 26778 + }, + { + "epoch": 2.9408082582912365, + "grad_norm": 1.912543535232544, + "learning_rate": 5e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7616568207740784, + "num_tokens": 692761527.0, + "step": 26779 + }, + { + "epoch": 2.9409180759938502, + "grad_norm": 2.1830782890319824, + "learning_rate": 5e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.76215660572052, + "num_tokens": 692786284.0, + "step": 26780 + }, + { + "epoch": 2.941027893696464, + "grad_norm": 1.8311153650283813, + "learning_rate": 5e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7281752824783325, + "num_tokens": 692822111.0, + "step": 26781 + }, + { + "epoch": 2.9411377113990778, + "grad_norm": 2.120408535003662, + "learning_rate": 5e-06, + "loss": 0.6402, + "mean_token_accuracy": 0.7895278930664062, + "num_tokens": 692844848.0, + "step": 26782 + }, + { + "epoch": 2.941247529101691, + "grad_norm": 1.9850229024887085, + "learning_rate": 5e-06, + "loss": 0.6691, + "mean_token_accuracy": 0.7771356105804443, + "num_tokens": 692870704.0, + "step": 26783 + }, + { + "epoch": 2.941357346804305, + "grad_norm": 2.2549753189086914, + "learning_rate": 5e-06, + "loss": 0.6408, + "mean_token_accuracy": 0.7953416705131531, + "num_tokens": 692892003.0, + "step": 26784 + }, + { + "epoch": 2.9414671645069186, + "grad_norm": 2.0045831203460693, + "learning_rate": 5e-06, + "loss": 0.7623, + "mean_token_accuracy": 0.7522728443145752, + "num_tokens": 692921318.0, + "step": 26785 + }, + { + "epoch": 2.9415769822095323, + "grad_norm": 1.8985939025878906, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7418092489242554, + "num_tokens": 692951839.0, + "step": 26786 + }, + { + "epoch": 2.941686799912146, + "grad_norm": 1.901336669921875, + "learning_rate": 5e-06, + "loss": 0.6979, + "mean_token_accuracy": 0.7671810984611511, + "num_tokens": 692981196.0, + "step": 26787 + }, + { + "epoch": 2.9417966176147594, + "grad_norm": 2.0524797439575195, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7597583532333374, + "num_tokens": 693011865.0, + "step": 26788 + }, + { + "epoch": 2.941906435317373, + "grad_norm": 2.3962111473083496, + "learning_rate": 5e-06, + "loss": 0.6726, + "mean_token_accuracy": 0.7761912941932678, + "num_tokens": 693032189.0, + "step": 26789 + }, + { + "epoch": 2.942016253019987, + "grad_norm": 2.072242021560669, + "learning_rate": 5e-06, + "loss": 0.6972, + "mean_token_accuracy": 0.7641933560371399, + "num_tokens": 693059097.0, + "step": 26790 + }, + { + "epoch": 2.9421260707226002, + "grad_norm": 1.9628145694732666, + "learning_rate": 5e-06, + "loss": 0.7269, + "mean_token_accuracy": 0.7593309879302979, + "num_tokens": 693086265.0, + "step": 26791 + }, + { + "epoch": 2.942235888425214, + "grad_norm": 2.1352381706237793, + "learning_rate": 5e-06, + "loss": 0.6983, + "mean_token_accuracy": 0.7643406391143799, + "num_tokens": 693110151.0, + "step": 26792 + }, + { + "epoch": 2.9423457061278278, + "grad_norm": 2.000946044921875, + "learning_rate": 5e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7656641006469727, + "num_tokens": 693137449.0, + "step": 26793 + }, + { + "epoch": 2.9424555238304415, + "grad_norm": 1.9967912435531616, + "learning_rate": 5e-06, + "loss": 0.6456, + "mean_token_accuracy": 0.7858765721321106, + "num_tokens": 693163389.0, + "step": 26794 + }, + { + "epoch": 2.9425653415330553, + "grad_norm": 2.2580552101135254, + "learning_rate": 5e-06, + "loss": 0.7044, + "mean_token_accuracy": 0.7663214206695557, + "num_tokens": 693186999.0, + "step": 26795 + }, + { + "epoch": 2.9426751592356686, + "grad_norm": 1.9863778352737427, + "learning_rate": 5e-06, + "loss": 0.6749, + "mean_token_accuracy": 0.7776560187339783, + "num_tokens": 693213459.0, + "step": 26796 + }, + { + "epoch": 2.9427849769382823, + "grad_norm": 1.9566928148269653, + "learning_rate": 5e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7637157440185547, + "num_tokens": 693240555.0, + "step": 26797 + }, + { + "epoch": 2.942894794640896, + "grad_norm": 1.9824345111846924, + "learning_rate": 5e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7340865135192871, + "num_tokens": 693269099.0, + "step": 26798 + }, + { + "epoch": 2.94300461234351, + "grad_norm": 2.1913065910339355, + "learning_rate": 5e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7522474527359009, + "num_tokens": 693293347.0, + "step": 26799 + }, + { + "epoch": 2.9431144300461236, + "grad_norm": 2.240316390991211, + "learning_rate": 5e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.7652560472488403, + "num_tokens": 693317212.0, + "step": 26800 + }, + { + "epoch": 2.943224247748737, + "grad_norm": 1.9474109411239624, + "learning_rate": 5e-06, + "loss": 0.7236, + "mean_token_accuracy": 0.7639839053153992, + "num_tokens": 693343822.0, + "step": 26801 + }, + { + "epoch": 2.9433340654513507, + "grad_norm": 2.380253791809082, + "learning_rate": 5e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.7759876847267151, + "num_tokens": 693364132.0, + "step": 26802 + }, + { + "epoch": 2.9434438831539644, + "grad_norm": 1.9275434017181396, + "learning_rate": 5e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7523318529129028, + "num_tokens": 693392228.0, + "step": 26803 + }, + { + "epoch": 2.943553700856578, + "grad_norm": 1.9211063385009766, + "learning_rate": 5e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7436100244522095, + "num_tokens": 693421907.0, + "step": 26804 + }, + { + "epoch": 2.943663518559192, + "grad_norm": 2.0292370319366455, + "learning_rate": 5e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.7772811651229858, + "num_tokens": 693446414.0, + "step": 26805 + }, + { + "epoch": 2.9437733362618053, + "grad_norm": 2.340099811553955, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7670847177505493, + "num_tokens": 693470018.0, + "step": 26806 + }, + { + "epoch": 2.943883153964419, + "grad_norm": 2.1561899185180664, + "learning_rate": 5e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7658345103263855, + "num_tokens": 693493739.0, + "step": 26807 + }, + { + "epoch": 2.9439929716670328, + "grad_norm": 1.921439290046692, + "learning_rate": 5e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7540359497070312, + "num_tokens": 693523634.0, + "step": 26808 + }, + { + "epoch": 2.9441027893696465, + "grad_norm": 2.2372796535491943, + "learning_rate": 5e-06, + "loss": 0.6773, + "mean_token_accuracy": 0.7712422013282776, + "num_tokens": 693546208.0, + "step": 26809 + }, + { + "epoch": 2.9442126070722603, + "grad_norm": 2.0277771949768066, + "learning_rate": 5e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7550806999206543, + "num_tokens": 693571162.0, + "step": 26810 + }, + { + "epoch": 2.9443224247748736, + "grad_norm": 2.128049373626709, + "learning_rate": 5e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7434470653533936, + "num_tokens": 693595547.0, + "step": 26811 + }, + { + "epoch": 2.9444322424774874, + "grad_norm": 1.9744917154312134, + "learning_rate": 5e-06, + "loss": 0.611, + "mean_token_accuracy": 0.7938041687011719, + "num_tokens": 693619851.0, + "step": 26812 + }, + { + "epoch": 2.944542060180101, + "grad_norm": 2.1779232025146484, + "learning_rate": 5e-06, + "loss": 0.7195, + "mean_token_accuracy": 0.7675334811210632, + "num_tokens": 693642676.0, + "step": 26813 + }, + { + "epoch": 2.9446518778827144, + "grad_norm": 1.8531137704849243, + "learning_rate": 5e-06, + "loss": 0.7005, + "mean_token_accuracy": 0.7668205499649048, + "num_tokens": 693674488.0, + "step": 26814 + }, + { + "epoch": 2.9447616955853286, + "grad_norm": 2.077174425125122, + "learning_rate": 5e-06, + "loss": 0.6944, + "mean_token_accuracy": 0.7788500189781189, + "num_tokens": 693699083.0, + "step": 26815 + }, + { + "epoch": 2.944871513287942, + "grad_norm": 2.0291225910186768, + "learning_rate": 5e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7364900708198547, + "num_tokens": 693726703.0, + "step": 26816 + }, + { + "epoch": 2.9449813309905557, + "grad_norm": 2.243659496307373, + "learning_rate": 5e-06, + "loss": 0.6787, + "mean_token_accuracy": 0.774060845375061, + "num_tokens": 693747751.0, + "step": 26817 + }, + { + "epoch": 2.9450911486931695, + "grad_norm": 2.000342607498169, + "learning_rate": 5e-06, + "loss": 0.6642, + "mean_token_accuracy": 0.7787361145019531, + "num_tokens": 693773055.0, + "step": 26818 + }, + { + "epoch": 2.9452009663957828, + "grad_norm": 2.07423996925354, + "learning_rate": 5e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7477371692657471, + "num_tokens": 693801399.0, + "step": 26819 + }, + { + "epoch": 2.9453107840983965, + "grad_norm": 1.9294407367706299, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7610600590705872, + "num_tokens": 693829284.0, + "step": 26820 + }, + { + "epoch": 2.9454206018010103, + "grad_norm": 2.1678354740142822, + "learning_rate": 5e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7586072087287903, + "num_tokens": 693854305.0, + "step": 26821 + }, + { + "epoch": 2.945530419503624, + "grad_norm": 2.172927141189575, + "learning_rate": 5e-06, + "loss": 0.6906, + "mean_token_accuracy": 0.7654726505279541, + "num_tokens": 693877240.0, + "step": 26822 + }, + { + "epoch": 2.945640237206238, + "grad_norm": 1.8377089500427246, + "learning_rate": 5e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7449392080307007, + "num_tokens": 693909101.0, + "step": 26823 + }, + { + "epoch": 2.945750054908851, + "grad_norm": 1.967465877532959, + "learning_rate": 5e-06, + "loss": 0.7063, + "mean_token_accuracy": 0.7701658010482788, + "num_tokens": 693935840.0, + "step": 26824 + }, + { + "epoch": 2.945859872611465, + "grad_norm": 1.9597774744033813, + "learning_rate": 5e-06, + "loss": 0.7387, + "mean_token_accuracy": 0.7600860595703125, + "num_tokens": 693963751.0, + "step": 26825 + }, + { + "epoch": 2.9459696903140786, + "grad_norm": 2.13337779045105, + "learning_rate": 5e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7539786100387573, + "num_tokens": 693989291.0, + "step": 26826 + }, + { + "epoch": 2.9460795080166924, + "grad_norm": 1.7125247716903687, + "learning_rate": 5e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7467044591903687, + "num_tokens": 694023170.0, + "step": 26827 + }, + { + "epoch": 2.946189325719306, + "grad_norm": 1.952254056930542, + "learning_rate": 5e-06, + "loss": 0.7562, + "mean_token_accuracy": 0.753944456577301, + "num_tokens": 694051523.0, + "step": 26828 + }, + { + "epoch": 2.9462991434219195, + "grad_norm": 2.060044765472412, + "learning_rate": 5e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7721024751663208, + "num_tokens": 694076647.0, + "step": 26829 + }, + { + "epoch": 2.946408961124533, + "grad_norm": 1.8224557638168335, + "learning_rate": 5e-06, + "loss": 0.6942, + "mean_token_accuracy": 0.7675777077674866, + "num_tokens": 694110040.0, + "step": 26830 + }, + { + "epoch": 2.946518778827147, + "grad_norm": 2.01621150970459, + "learning_rate": 5e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7430226802825928, + "num_tokens": 694139135.0, + "step": 26831 + }, + { + "epoch": 2.9466285965297607, + "grad_norm": 2.09706449508667, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7443872690200806, + "num_tokens": 694164315.0, + "step": 26832 + }, + { + "epoch": 2.9467384142323745, + "grad_norm": 2.0424375534057617, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7581647634506226, + "num_tokens": 694190870.0, + "step": 26833 + }, + { + "epoch": 2.946848231934988, + "grad_norm": 2.1975314617156982, + "learning_rate": 5e-06, + "loss": 0.6578, + "mean_token_accuracy": 0.7853611707687378, + "num_tokens": 694211258.0, + "step": 26834 + }, + { + "epoch": 2.9469580496376016, + "grad_norm": 2.358762264251709, + "learning_rate": 5e-06, + "loss": 0.6664, + "mean_token_accuracy": 0.7808102369308472, + "num_tokens": 694231912.0, + "step": 26835 + }, + { + "epoch": 2.9470678673402153, + "grad_norm": 2.064887762069702, + "learning_rate": 5e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7464103698730469, + "num_tokens": 694257645.0, + "step": 26836 + }, + { + "epoch": 2.947177685042829, + "grad_norm": 2.3181333541870117, + "learning_rate": 5e-06, + "loss": 0.6808, + "mean_token_accuracy": 0.7778500914573669, + "num_tokens": 694276503.0, + "step": 26837 + }, + { + "epoch": 2.947287502745443, + "grad_norm": 1.895522952079773, + "learning_rate": 5e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7409162521362305, + "num_tokens": 694308641.0, + "step": 26838 + }, + { + "epoch": 2.947397320448056, + "grad_norm": 2.408958911895752, + "learning_rate": 5e-06, + "loss": 0.6511, + "mean_token_accuracy": 0.7827900052070618, + "num_tokens": 694328186.0, + "step": 26839 + }, + { + "epoch": 2.94750713815067, + "grad_norm": 2.1488914489746094, + "learning_rate": 5e-06, + "loss": 0.6845, + "mean_token_accuracy": 0.7797659635543823, + "num_tokens": 694351952.0, + "step": 26840 + }, + { + "epoch": 2.9476169558532836, + "grad_norm": 2.306126356124878, + "learning_rate": 5e-06, + "loss": 0.6761, + "mean_token_accuracy": 0.7725030183792114, + "num_tokens": 694373561.0, + "step": 26841 + }, + { + "epoch": 2.947726773555897, + "grad_norm": 2.073770761489868, + "learning_rate": 5e-06, + "loss": 0.7595, + "mean_token_accuracy": 0.7555012702941895, + "num_tokens": 694398939.0, + "step": 26842 + }, + { + "epoch": 2.9478365912585107, + "grad_norm": 2.1586685180664062, + "learning_rate": 5e-06, + "loss": 0.7322, + "mean_token_accuracy": 0.7602626085281372, + "num_tokens": 694421450.0, + "step": 26843 + }, + { + "epoch": 2.9479464089611245, + "grad_norm": 2.0159499645233154, + "learning_rate": 5e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7766199111938477, + "num_tokens": 694447975.0, + "step": 26844 + }, + { + "epoch": 2.9480562266637382, + "grad_norm": 1.9626414775848389, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7589258551597595, + "num_tokens": 694477186.0, + "step": 26845 + }, + { + "epoch": 2.948166044366352, + "grad_norm": 2.0956923961639404, + "learning_rate": 5e-06, + "loss": 0.6948, + "mean_token_accuracy": 0.7669464349746704, + "num_tokens": 694502055.0, + "step": 26846 + }, + { + "epoch": 2.9482758620689653, + "grad_norm": 2.0445234775543213, + "learning_rate": 5e-06, + "loss": 0.647, + "mean_token_accuracy": 0.7812238931655884, + "num_tokens": 694526146.0, + "step": 26847 + }, + { + "epoch": 2.948385679771579, + "grad_norm": 2.2075092792510986, + "learning_rate": 5e-06, + "loss": 0.7278, + "mean_token_accuracy": 0.7589712142944336, + "num_tokens": 694550937.0, + "step": 26848 + }, + { + "epoch": 2.948495497474193, + "grad_norm": 1.943664312362671, + "learning_rate": 5e-06, + "loss": 0.7744, + "mean_token_accuracy": 0.749399721622467, + "num_tokens": 694581296.0, + "step": 26849 + }, + { + "epoch": 2.9486053151768066, + "grad_norm": 1.8559445142745972, + "learning_rate": 5e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7402881383895874, + "num_tokens": 694613400.0, + "step": 26850 + }, + { + "epoch": 2.9487151328794203, + "grad_norm": 2.338778257369995, + "learning_rate": 5e-06, + "loss": 0.6862, + "mean_token_accuracy": 0.7706090211868286, + "num_tokens": 694631657.0, + "step": 26851 + }, + { + "epoch": 2.9488249505820336, + "grad_norm": 2.0099432468414307, + "learning_rate": 5e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7539557814598083, + "num_tokens": 694661036.0, + "step": 26852 + }, + { + "epoch": 2.9489347682846474, + "grad_norm": 2.17632794380188, + "learning_rate": 5e-06, + "loss": 0.6445, + "mean_token_accuracy": 0.7785183191299438, + "num_tokens": 694681399.0, + "step": 26853 + }, + { + "epoch": 2.949044585987261, + "grad_norm": 2.038529396057129, + "learning_rate": 5e-06, + "loss": 0.7074, + "mean_token_accuracy": 0.7672385573387146, + "num_tokens": 694708100.0, + "step": 26854 + }, + { + "epoch": 2.949154403689875, + "grad_norm": 2.0383191108703613, + "learning_rate": 5e-06, + "loss": 0.7283, + "mean_token_accuracy": 0.7595603466033936, + "num_tokens": 694734464.0, + "step": 26855 + }, + { + "epoch": 2.9492642213924887, + "grad_norm": 2.068179130554199, + "learning_rate": 5e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7665976285934448, + "num_tokens": 694760762.0, + "step": 26856 + }, + { + "epoch": 2.949374039095102, + "grad_norm": 2.139277458190918, + "learning_rate": 5e-06, + "loss": 0.656, + "mean_token_accuracy": 0.7864229679107666, + "num_tokens": 694783917.0, + "step": 26857 + }, + { + "epoch": 2.9494838567977157, + "grad_norm": 1.8505996465682983, + "learning_rate": 5e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7548059225082397, + "num_tokens": 694813945.0, + "step": 26858 + }, + { + "epoch": 2.9495936745003295, + "grad_norm": 1.8711484670639038, + "learning_rate": 5e-06, + "loss": 0.7962, + "mean_token_accuracy": 0.7443740367889404, + "num_tokens": 694847458.0, + "step": 26859 + }, + { + "epoch": 2.9497034922029433, + "grad_norm": 1.9982174634933472, + "learning_rate": 5e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.759466290473938, + "num_tokens": 694874445.0, + "step": 26860 + }, + { + "epoch": 2.949813309905557, + "grad_norm": 1.9032964706420898, + "learning_rate": 5e-06, + "loss": 0.7281, + "mean_token_accuracy": 0.7578942179679871, + "num_tokens": 694905678.0, + "step": 26861 + }, + { + "epoch": 2.9499231276081703, + "grad_norm": 2.163892984390259, + "learning_rate": 5e-06, + "loss": 0.6481, + "mean_token_accuracy": 0.7850831747055054, + "num_tokens": 694928672.0, + "step": 26862 + }, + { + "epoch": 2.950032945310784, + "grad_norm": 2.1325459480285645, + "learning_rate": 5e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7383265495300293, + "num_tokens": 694953567.0, + "step": 26863 + }, + { + "epoch": 2.950142763013398, + "grad_norm": 2.0924248695373535, + "learning_rate": 5e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7675632834434509, + "num_tokens": 694977912.0, + "step": 26864 + }, + { + "epoch": 2.950252580716011, + "grad_norm": 2.145278215408325, + "learning_rate": 5e-06, + "loss": 0.6743, + "mean_token_accuracy": 0.77871173620224, + "num_tokens": 695002020.0, + "step": 26865 + }, + { + "epoch": 2.9503623984186254, + "grad_norm": 2.1721298694610596, + "learning_rate": 5e-06, + "loss": 0.6956, + "mean_token_accuracy": 0.7699131965637207, + "num_tokens": 695024434.0, + "step": 26866 + }, + { + "epoch": 2.9504722161212387, + "grad_norm": 1.9927641153335571, + "learning_rate": 5e-06, + "loss": 0.6817, + "mean_token_accuracy": 0.7723729610443115, + "num_tokens": 695049873.0, + "step": 26867 + }, + { + "epoch": 2.9505820338238524, + "grad_norm": 2.0167529582977295, + "learning_rate": 5e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7297787666320801, + "num_tokens": 695077918.0, + "step": 26868 + }, + { + "epoch": 2.950691851526466, + "grad_norm": 1.9629379510879517, + "learning_rate": 5e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7289775609970093, + "num_tokens": 695106288.0, + "step": 26869 + }, + { + "epoch": 2.9508016692290795, + "grad_norm": 2.1993982791900635, + "learning_rate": 5e-06, + "loss": 0.7394, + "mean_token_accuracy": 0.760733962059021, + "num_tokens": 695132974.0, + "step": 26870 + }, + { + "epoch": 2.9509114869316933, + "grad_norm": 2.059422254562378, + "learning_rate": 5e-06, + "loss": 0.7055, + "mean_token_accuracy": 0.7655739784240723, + "num_tokens": 695157190.0, + "step": 26871 + }, + { + "epoch": 2.951021304634307, + "grad_norm": 1.8684203624725342, + "learning_rate": 5e-06, + "loss": 0.6504, + "mean_token_accuracy": 0.779783308506012, + "num_tokens": 695187462.0, + "step": 26872 + }, + { + "epoch": 2.9511311223369208, + "grad_norm": 2.027491331100464, + "learning_rate": 5e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7618324756622314, + "num_tokens": 695213784.0, + "step": 26873 + }, + { + "epoch": 2.9512409400395345, + "grad_norm": 2.2676548957824707, + "learning_rate": 5e-06, + "loss": 0.7135, + "mean_token_accuracy": 0.761813759803772, + "num_tokens": 695234886.0, + "step": 26874 + }, + { + "epoch": 2.951350757742148, + "grad_norm": 2.031497001647949, + "learning_rate": 5e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7447572350502014, + "num_tokens": 695261415.0, + "step": 26875 + }, + { + "epoch": 2.9514605754447616, + "grad_norm": 2.0841314792633057, + "learning_rate": 5e-06, + "loss": 0.769, + "mean_token_accuracy": 0.7547132968902588, + "num_tokens": 695285394.0, + "step": 26876 + }, + { + "epoch": 2.9515703931473753, + "grad_norm": 1.91056227684021, + "learning_rate": 5e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7667538523674011, + "num_tokens": 695312810.0, + "step": 26877 + }, + { + "epoch": 2.951680210849989, + "grad_norm": 2.02034854888916, + "learning_rate": 5e-06, + "loss": 0.771, + "mean_token_accuracy": 0.7553097009658813, + "num_tokens": 695340280.0, + "step": 26878 + }, + { + "epoch": 2.951790028552603, + "grad_norm": 1.908271074295044, + "learning_rate": 5e-06, + "loss": 0.6974, + "mean_token_accuracy": 0.7765918374061584, + "num_tokens": 695367733.0, + "step": 26879 + }, + { + "epoch": 2.951899846255216, + "grad_norm": 2.020615577697754, + "learning_rate": 5e-06, + "loss": 0.7954, + "mean_token_accuracy": 0.7468187212944031, + "num_tokens": 695394430.0, + "step": 26880 + }, + { + "epoch": 2.95200966395783, + "grad_norm": 2.165968894958496, + "learning_rate": 5e-06, + "loss": 0.6258, + "mean_token_accuracy": 0.7890281677246094, + "num_tokens": 695415803.0, + "step": 26881 + }, + { + "epoch": 2.9521194816604437, + "grad_norm": 1.9033875465393066, + "learning_rate": 5e-06, + "loss": 0.7184, + "mean_token_accuracy": 0.7653418779373169, + "num_tokens": 695443523.0, + "step": 26882 + }, + { + "epoch": 2.9522292993630574, + "grad_norm": 1.8825855255126953, + "learning_rate": 5e-06, + "loss": 0.7108, + "mean_token_accuracy": 0.7702484726905823, + "num_tokens": 695471394.0, + "step": 26883 + }, + { + "epoch": 2.952339117065671, + "grad_norm": 1.8094631433486938, + "learning_rate": 5e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7384041547775269, + "num_tokens": 695503600.0, + "step": 26884 + }, + { + "epoch": 2.9524489347682845, + "grad_norm": 2.0170280933380127, + "learning_rate": 5e-06, + "loss": 0.68, + "mean_token_accuracy": 0.7736230492591858, + "num_tokens": 695530803.0, + "step": 26885 + }, + { + "epoch": 2.9525587524708983, + "grad_norm": 2.1539270877838135, + "learning_rate": 5e-06, + "loss": 0.7052, + "mean_token_accuracy": 0.7697044610977173, + "num_tokens": 695554538.0, + "step": 26886 + }, + { + "epoch": 2.952668570173512, + "grad_norm": 2.0548629760742188, + "learning_rate": 5e-06, + "loss": 0.6358, + "mean_token_accuracy": 0.7857528924942017, + "num_tokens": 695578877.0, + "step": 26887 + }, + { + "epoch": 2.952778387876126, + "grad_norm": 2.2124555110931396, + "learning_rate": 5e-06, + "loss": 0.712, + "mean_token_accuracy": 0.7668002843856812, + "num_tokens": 695601405.0, + "step": 26888 + }, + { + "epoch": 2.9528882055787395, + "grad_norm": 1.955721378326416, + "learning_rate": 5e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.7804891467094421, + "num_tokens": 695625963.0, + "step": 26889 + }, + { + "epoch": 2.952998023281353, + "grad_norm": 2.063880681991577, + "learning_rate": 5e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7717773914337158, + "num_tokens": 695652702.0, + "step": 26890 + }, + { + "epoch": 2.9531078409839666, + "grad_norm": 2.029362678527832, + "learning_rate": 5e-06, + "loss": 0.7529, + "mean_token_accuracy": 0.7601546049118042, + "num_tokens": 695680481.0, + "step": 26891 + }, + { + "epoch": 2.9532176586865804, + "grad_norm": 1.8387036323547363, + "learning_rate": 5e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7409510612487793, + "num_tokens": 695714842.0, + "step": 26892 + }, + { + "epoch": 2.9533274763891937, + "grad_norm": 2.1271772384643555, + "learning_rate": 5e-06, + "loss": 0.7031, + "mean_token_accuracy": 0.7741751670837402, + "num_tokens": 695739738.0, + "step": 26893 + }, + { + "epoch": 2.9534372940918074, + "grad_norm": 2.13546085357666, + "learning_rate": 5e-06, + "loss": 0.6571, + "mean_token_accuracy": 0.7745940685272217, + "num_tokens": 695762466.0, + "step": 26894 + }, + { + "epoch": 2.953547111794421, + "grad_norm": 2.1280415058135986, + "learning_rate": 5e-06, + "loss": 0.6252, + "mean_token_accuracy": 0.7893673777580261, + "num_tokens": 695783821.0, + "step": 26895 + }, + { + "epoch": 2.953656929497035, + "grad_norm": 2.0092599391937256, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7359081506729126, + "num_tokens": 695813539.0, + "step": 26896 + }, + { + "epoch": 2.9537667471996487, + "grad_norm": 1.9084248542785645, + "learning_rate": 5e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7432405352592468, + "num_tokens": 695843914.0, + "step": 26897 + }, + { + "epoch": 2.953876564902262, + "grad_norm": 1.8464605808258057, + "learning_rate": 5e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7596668004989624, + "num_tokens": 695873274.0, + "step": 26898 + }, + { + "epoch": 2.953986382604876, + "grad_norm": 2.2185580730438232, + "learning_rate": 5e-06, + "loss": 0.6809, + "mean_token_accuracy": 0.7712427377700806, + "num_tokens": 695897006.0, + "step": 26899 + }, + { + "epoch": 2.9540962003074895, + "grad_norm": 1.9891300201416016, + "learning_rate": 5e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.7556554079055786, + "num_tokens": 695925177.0, + "step": 26900 + }, + { + "epoch": 2.9542060180101033, + "grad_norm": 1.9390193223953247, + "learning_rate": 5e-06, + "loss": 0.7299, + "mean_token_accuracy": 0.7583154439926147, + "num_tokens": 695953940.0, + "step": 26901 + }, + { + "epoch": 2.954315835712717, + "grad_norm": 2.1280057430267334, + "learning_rate": 5e-06, + "loss": 0.7222, + "mean_token_accuracy": 0.7615064382553101, + "num_tokens": 695980492.0, + "step": 26902 + }, + { + "epoch": 2.9544256534153304, + "grad_norm": 2.016740322113037, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7716937065124512, + "num_tokens": 696006927.0, + "step": 26903 + }, + { + "epoch": 2.954535471117944, + "grad_norm": 1.9743226766586304, + "learning_rate": 5e-06, + "loss": 0.7508, + "mean_token_accuracy": 0.7536145448684692, + "num_tokens": 696037792.0, + "step": 26904 + }, + { + "epoch": 2.954645288820558, + "grad_norm": 2.0122692584991455, + "learning_rate": 5e-06, + "loss": 0.7309, + "mean_token_accuracy": 0.762030839920044, + "num_tokens": 696062829.0, + "step": 26905 + }, + { + "epoch": 2.9547551065231716, + "grad_norm": 2.038330316543579, + "learning_rate": 5e-06, + "loss": 0.6968, + "mean_token_accuracy": 0.7684513330459595, + "num_tokens": 696088401.0, + "step": 26906 + }, + { + "epoch": 2.9548649242257854, + "grad_norm": 1.9493190050125122, + "learning_rate": 5e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7315404415130615, + "num_tokens": 696119328.0, + "step": 26907 + }, + { + "epoch": 2.9549747419283987, + "grad_norm": 2.3233726024627686, + "learning_rate": 5e-06, + "loss": 0.6795, + "mean_token_accuracy": 0.7771969437599182, + "num_tokens": 696140217.0, + "step": 26908 + }, + { + "epoch": 2.9550845596310125, + "grad_norm": 2.1055984497070312, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7835798263549805, + "num_tokens": 696164797.0, + "step": 26909 + }, + { + "epoch": 2.955194377333626, + "grad_norm": 1.8629940748214722, + "learning_rate": 5e-06, + "loss": 0.7036, + "mean_token_accuracy": 0.7677030563354492, + "num_tokens": 696193042.0, + "step": 26910 + }, + { + "epoch": 2.95530419503624, + "grad_norm": 1.7566348314285278, + "learning_rate": 5e-06, + "loss": 0.7087, + "mean_token_accuracy": 0.7659684419631958, + "num_tokens": 696222574.0, + "step": 26911 + }, + { + "epoch": 2.9554140127388537, + "grad_norm": 2.1629364490509033, + "learning_rate": 5e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7645111680030823, + "num_tokens": 696246308.0, + "step": 26912 + }, + { + "epoch": 2.955523830441467, + "grad_norm": 2.196294069290161, + "learning_rate": 5e-06, + "loss": 0.7444, + "mean_token_accuracy": 0.7516014575958252, + "num_tokens": 696272854.0, + "step": 26913 + }, + { + "epoch": 2.955633648144081, + "grad_norm": 2.301621437072754, + "learning_rate": 5e-06, + "loss": 0.6452, + "mean_token_accuracy": 0.7886499762535095, + "num_tokens": 696292741.0, + "step": 26914 + }, + { + "epoch": 2.9557434658466946, + "grad_norm": 2.166438579559326, + "learning_rate": 5e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7589120268821716, + "num_tokens": 696318138.0, + "step": 26915 + }, + { + "epoch": 2.955853283549308, + "grad_norm": 2.0017409324645996, + "learning_rate": 5e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7537579536437988, + "num_tokens": 696346744.0, + "step": 26916 + }, + { + "epoch": 2.955963101251922, + "grad_norm": 2.0177855491638184, + "learning_rate": 5e-06, + "loss": 0.7775, + "mean_token_accuracy": 0.7484245300292969, + "num_tokens": 696376997.0, + "step": 26917 + }, + { + "epoch": 2.9560729189545354, + "grad_norm": 2.1133840084075928, + "learning_rate": 5e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.7572383880615234, + "num_tokens": 696404126.0, + "step": 26918 + }, + { + "epoch": 2.956182736657149, + "grad_norm": 2.0515806674957275, + "learning_rate": 5e-06, + "loss": 0.6987, + "mean_token_accuracy": 0.7723275423049927, + "num_tokens": 696430396.0, + "step": 26919 + }, + { + "epoch": 2.956292554359763, + "grad_norm": 2.071788787841797, + "learning_rate": 5e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7607353925704956, + "num_tokens": 696456211.0, + "step": 26920 + }, + { + "epoch": 2.956402372062376, + "grad_norm": 2.0359928607940674, + "learning_rate": 5e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7555079460144043, + "num_tokens": 696483194.0, + "step": 26921 + }, + { + "epoch": 2.95651218976499, + "grad_norm": 2.124847173690796, + "learning_rate": 5e-06, + "loss": 0.6867, + "mean_token_accuracy": 0.768546462059021, + "num_tokens": 696505732.0, + "step": 26922 + }, + { + "epoch": 2.9566220074676037, + "grad_norm": 2.2719528675079346, + "learning_rate": 5e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.7414751052856445, + "num_tokens": 696530938.0, + "step": 26923 + }, + { + "epoch": 2.9567318251702175, + "grad_norm": 1.7573137283325195, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7343600988388062, + "num_tokens": 696564639.0, + "step": 26924 + }, + { + "epoch": 2.9568416428728312, + "grad_norm": 2.01867413520813, + "learning_rate": 5e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7498887777328491, + "num_tokens": 696592845.0, + "step": 26925 + }, + { + "epoch": 2.9569514605754446, + "grad_norm": 2.3017001152038574, + "learning_rate": 5e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.7683057188987732, + "num_tokens": 696616091.0, + "step": 26926 + }, + { + "epoch": 2.9570612782780583, + "grad_norm": 2.060892105102539, + "learning_rate": 5e-06, + "loss": 0.6934, + "mean_token_accuracy": 0.7695850729942322, + "num_tokens": 696641624.0, + "step": 26927 + }, + { + "epoch": 2.957171095980672, + "grad_norm": 2.0537726879119873, + "learning_rate": 5e-06, + "loss": 0.7413, + "mean_token_accuracy": 0.7630615830421448, + "num_tokens": 696668370.0, + "step": 26928 + }, + { + "epoch": 2.957280913683286, + "grad_norm": 2.0225753784179688, + "learning_rate": 5e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.775047779083252, + "num_tokens": 696694003.0, + "step": 26929 + }, + { + "epoch": 2.9573907313858996, + "grad_norm": 2.2160234451293945, + "learning_rate": 5e-06, + "loss": 0.7173, + "mean_token_accuracy": 0.7633353471755981, + "num_tokens": 696715978.0, + "step": 26930 + }, + { + "epoch": 2.957500549088513, + "grad_norm": 2.2309117317199707, + "learning_rate": 5e-06, + "loss": 0.6668, + "mean_token_accuracy": 0.7804170250892639, + "num_tokens": 696738732.0, + "step": 26931 + }, + { + "epoch": 2.9576103667911267, + "grad_norm": 2.0235848426818848, + "learning_rate": 5e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7393587827682495, + "num_tokens": 696767357.0, + "step": 26932 + }, + { + "epoch": 2.9577201844937404, + "grad_norm": 1.9466174840927124, + "learning_rate": 5e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.7421483993530273, + "num_tokens": 696797954.0, + "step": 26933 + }, + { + "epoch": 2.957830002196354, + "grad_norm": 2.0218067169189453, + "learning_rate": 5e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.760707437992096, + "num_tokens": 696824510.0, + "step": 26934 + }, + { + "epoch": 2.957939819898968, + "grad_norm": 2.1078739166259766, + "learning_rate": 5e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7564138174057007, + "num_tokens": 696850263.0, + "step": 26935 + }, + { + "epoch": 2.9580496376015812, + "grad_norm": 1.7906112670898438, + "learning_rate": 5e-06, + "loss": 0.7208, + "mean_token_accuracy": 0.7570260167121887, + "num_tokens": 696878886.0, + "step": 26936 + }, + { + "epoch": 2.958159455304195, + "grad_norm": 1.9690046310424805, + "learning_rate": 5e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7519274950027466, + "num_tokens": 696904535.0, + "step": 26937 + }, + { + "epoch": 2.9582692730068088, + "grad_norm": 2.319125175476074, + "learning_rate": 5e-06, + "loss": 0.6818, + "mean_token_accuracy": 0.7764111757278442, + "num_tokens": 696925123.0, + "step": 26938 + }, + { + "epoch": 2.9583790907094225, + "grad_norm": 2.139899253845215, + "learning_rate": 5e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7427033185958862, + "num_tokens": 696951577.0, + "step": 26939 + }, + { + "epoch": 2.9584889084120363, + "grad_norm": 1.8744730949401855, + "learning_rate": 5e-06, + "loss": 0.7307, + "mean_token_accuracy": 0.7616013288497925, + "num_tokens": 696982385.0, + "step": 26940 + }, + { + "epoch": 2.9585987261146496, + "grad_norm": 2.395076036453247, + "learning_rate": 5e-06, + "loss": 0.6762, + "mean_token_accuracy": 0.7686200141906738, + "num_tokens": 697002367.0, + "step": 26941 + }, + { + "epoch": 2.9587085438172633, + "grad_norm": 1.9458039999008179, + "learning_rate": 5e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7521268129348755, + "num_tokens": 697030487.0, + "step": 26942 + }, + { + "epoch": 2.958818361519877, + "grad_norm": 2.0193488597869873, + "learning_rate": 5e-06, + "loss": 0.7526, + "mean_token_accuracy": 0.7665772438049316, + "num_tokens": 697057030.0, + "step": 26943 + }, + { + "epoch": 2.9589281792224904, + "grad_norm": 2.1613919734954834, + "learning_rate": 5e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.7609009742736816, + "num_tokens": 697079749.0, + "step": 26944 + }, + { + "epoch": 2.959037996925104, + "grad_norm": 2.0216641426086426, + "learning_rate": 5e-06, + "loss": 0.7624, + "mean_token_accuracy": 0.7513007521629333, + "num_tokens": 697105571.0, + "step": 26945 + }, + { + "epoch": 2.959147814627718, + "grad_norm": 1.9714246988296509, + "learning_rate": 5e-06, + "loss": 0.6308, + "mean_token_accuracy": 0.7867097854614258, + "num_tokens": 697130342.0, + "step": 26946 + }, + { + "epoch": 2.9592576323303317, + "grad_norm": 2.003012180328369, + "learning_rate": 5e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.7414959669113159, + "num_tokens": 697159475.0, + "step": 26947 + }, + { + "epoch": 2.9593674500329454, + "grad_norm": 1.8614318370819092, + "learning_rate": 5e-06, + "loss": 0.7594, + "mean_token_accuracy": 0.7472993731498718, + "num_tokens": 697190750.0, + "step": 26948 + }, + { + "epoch": 2.9594772677355587, + "grad_norm": 2.1342995166778564, + "learning_rate": 5e-06, + "loss": 0.7491, + "mean_token_accuracy": 0.7649237513542175, + "num_tokens": 697216399.0, + "step": 26949 + }, + { + "epoch": 2.9595870854381725, + "grad_norm": 2.296483039855957, + "learning_rate": 5e-06, + "loss": 0.7348, + "mean_token_accuracy": 0.7684926390647888, + "num_tokens": 697237429.0, + "step": 26950 + }, + { + "epoch": 2.9596969031407863, + "grad_norm": 1.893899917602539, + "learning_rate": 5e-06, + "loss": 0.7407, + "mean_token_accuracy": 0.7575257420539856, + "num_tokens": 697267823.0, + "step": 26951 + }, + { + "epoch": 2.9598067208434, + "grad_norm": 2.2894186973571777, + "learning_rate": 5e-06, + "loss": 0.6582, + "mean_token_accuracy": 0.7806723713874817, + "num_tokens": 697289000.0, + "step": 26952 + }, + { + "epoch": 2.9599165385460138, + "grad_norm": 2.3126394748687744, + "learning_rate": 5e-06, + "loss": 0.7018, + "mean_token_accuracy": 0.769917368888855, + "num_tokens": 697310694.0, + "step": 26953 + }, + { + "epoch": 2.960026356248627, + "grad_norm": 1.9326694011688232, + "learning_rate": 5e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7601557970046997, + "num_tokens": 697338686.0, + "step": 26954 + }, + { + "epoch": 2.960136173951241, + "grad_norm": 1.9243050813674927, + "learning_rate": 5e-06, + "loss": 0.688, + "mean_token_accuracy": 0.7777308225631714, + "num_tokens": 697367551.0, + "step": 26955 + }, + { + "epoch": 2.9602459916538546, + "grad_norm": 2.152240514755249, + "learning_rate": 5e-06, + "loss": 0.6898, + "mean_token_accuracy": 0.7690404057502747, + "num_tokens": 697391622.0, + "step": 26956 + }, + { + "epoch": 2.9603558093564684, + "grad_norm": 2.242452383041382, + "learning_rate": 5e-06, + "loss": 0.6272, + "mean_token_accuracy": 0.7871559858322144, + "num_tokens": 697413300.0, + "step": 26957 + }, + { + "epoch": 2.960465627059082, + "grad_norm": 1.9127758741378784, + "learning_rate": 5e-06, + "loss": 0.74, + "mean_token_accuracy": 0.7566075325012207, + "num_tokens": 697441782.0, + "step": 26958 + }, + { + "epoch": 2.9605754447616954, + "grad_norm": 2.181333065032959, + "learning_rate": 5e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.752091109752655, + "num_tokens": 697465249.0, + "step": 26959 + }, + { + "epoch": 2.960685262464309, + "grad_norm": 2.124373435974121, + "learning_rate": 5e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7717555165290833, + "num_tokens": 697490981.0, + "step": 26960 + }, + { + "epoch": 2.960795080166923, + "grad_norm": 2.035090923309326, + "learning_rate": 5e-06, + "loss": 0.7574, + "mean_token_accuracy": 0.75102698802948, + "num_tokens": 697517917.0, + "step": 26961 + }, + { + "epoch": 2.9609048978695367, + "grad_norm": 1.9400054216384888, + "learning_rate": 5e-06, + "loss": 0.6792, + "mean_token_accuracy": 0.7746514678001404, + "num_tokens": 697544472.0, + "step": 26962 + }, + { + "epoch": 2.9610147155721505, + "grad_norm": 2.205462694168091, + "learning_rate": 5e-06, + "loss": 0.6553, + "mean_token_accuracy": 0.7754825949668884, + "num_tokens": 697567320.0, + "step": 26963 + }, + { + "epoch": 2.9611245332747638, + "grad_norm": 1.9098780155181885, + "learning_rate": 5e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7671955227851868, + "num_tokens": 697593403.0, + "step": 26964 + }, + { + "epoch": 2.9612343509773775, + "grad_norm": 2.081885576248169, + "learning_rate": 5e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7515552639961243, + "num_tokens": 697619732.0, + "step": 26965 + }, + { + "epoch": 2.9613441686799913, + "grad_norm": 1.95322847366333, + "learning_rate": 5e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.742891788482666, + "num_tokens": 697649109.0, + "step": 26966 + }, + { + "epoch": 2.961453986382605, + "grad_norm": 1.9464856386184692, + "learning_rate": 5e-06, + "loss": 0.7381, + "mean_token_accuracy": 0.765055775642395, + "num_tokens": 697678623.0, + "step": 26967 + }, + { + "epoch": 2.961563804085219, + "grad_norm": 2.276319742202759, + "learning_rate": 5e-06, + "loss": 0.6681, + "mean_token_accuracy": 0.7729328870773315, + "num_tokens": 697700907.0, + "step": 26968 + }, + { + "epoch": 2.961673621787832, + "grad_norm": 2.151094913482666, + "learning_rate": 5e-06, + "loss": 0.645, + "mean_token_accuracy": 0.787048876285553, + "num_tokens": 697721986.0, + "step": 26969 + }, + { + "epoch": 2.961783439490446, + "grad_norm": 2.054919719696045, + "learning_rate": 5e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7404014468193054, + "num_tokens": 697748320.0, + "step": 26970 + }, + { + "epoch": 2.9618932571930596, + "grad_norm": 2.3416998386383057, + "learning_rate": 5e-06, + "loss": 0.6782, + "mean_token_accuracy": 0.7736390829086304, + "num_tokens": 697769396.0, + "step": 26971 + }, + { + "epoch": 2.962003074895673, + "grad_norm": 2.257530689239502, + "learning_rate": 5e-06, + "loss": 0.7046, + "mean_token_accuracy": 0.7647178173065186, + "num_tokens": 697791176.0, + "step": 26972 + }, + { + "epoch": 2.9621128925982867, + "grad_norm": 2.236637592315674, + "learning_rate": 5e-06, + "loss": 0.6671, + "mean_token_accuracy": 0.7847163677215576, + "num_tokens": 697813871.0, + "step": 26973 + }, + { + "epoch": 2.9622227103009005, + "grad_norm": 2.197861671447754, + "learning_rate": 5e-06, + "loss": 0.6861, + "mean_token_accuracy": 0.7789925336837769, + "num_tokens": 697836300.0, + "step": 26974 + }, + { + "epoch": 2.962332528003514, + "grad_norm": 1.9967615604400635, + "learning_rate": 5e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7383766174316406, + "num_tokens": 697863929.0, + "step": 26975 + }, + { + "epoch": 2.962442345706128, + "grad_norm": 2.00093674659729, + "learning_rate": 5e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7432559132575989, + "num_tokens": 697894403.0, + "step": 26976 + }, + { + "epoch": 2.9625521634087413, + "grad_norm": 2.283778667449951, + "learning_rate": 5e-06, + "loss": 0.6139, + "mean_token_accuracy": 0.7942128777503967, + "num_tokens": 697914255.0, + "step": 26977 + }, + { + "epoch": 2.962661981111355, + "grad_norm": 1.9677941799163818, + "learning_rate": 5e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.727251410484314, + "num_tokens": 697944363.0, + "step": 26978 + }, + { + "epoch": 2.962771798813969, + "grad_norm": 1.9382559061050415, + "learning_rate": 5e-06, + "loss": 0.7082, + "mean_token_accuracy": 0.7743058204650879, + "num_tokens": 697973417.0, + "step": 26979 + }, + { + "epoch": 2.9628816165165826, + "grad_norm": 2.0864644050598145, + "learning_rate": 5e-06, + "loss": 0.683, + "mean_token_accuracy": 0.7793529033660889, + "num_tokens": 698001650.0, + "step": 26980 + }, + { + "epoch": 2.9629914342191963, + "grad_norm": 2.255734443664551, + "learning_rate": 5e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7562078833580017, + "num_tokens": 698023437.0, + "step": 26981 + }, + { + "epoch": 2.9631012519218096, + "grad_norm": 1.8194432258605957, + "learning_rate": 5e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7483513951301575, + "num_tokens": 698055714.0, + "step": 26982 + }, + { + "epoch": 2.9632110696244234, + "grad_norm": 1.9327408075332642, + "learning_rate": 5e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7329246997833252, + "num_tokens": 698088078.0, + "step": 26983 + }, + { + "epoch": 2.963320887327037, + "grad_norm": 2.1664531230926514, + "learning_rate": 5e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7723422050476074, + "num_tokens": 698111549.0, + "step": 26984 + }, + { + "epoch": 2.963430705029651, + "grad_norm": 2.157550811767578, + "learning_rate": 5e-06, + "loss": 0.6899, + "mean_token_accuracy": 0.7737227082252502, + "num_tokens": 698134222.0, + "step": 26985 + }, + { + "epoch": 2.9635405227322646, + "grad_norm": 2.3977279663085938, + "learning_rate": 5e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7604718804359436, + "num_tokens": 698155512.0, + "step": 26986 + }, + { + "epoch": 2.963650340434878, + "grad_norm": 1.9921802282333374, + "learning_rate": 5e-06, + "loss": 0.7206, + "mean_token_accuracy": 0.7674612998962402, + "num_tokens": 698183894.0, + "step": 26987 + }, + { + "epoch": 2.9637601581374917, + "grad_norm": 2.2466588020324707, + "learning_rate": 5e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.7748923897743225, + "num_tokens": 698205334.0, + "step": 26988 + }, + { + "epoch": 2.9638699758401055, + "grad_norm": 2.2915496826171875, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7646304368972778, + "num_tokens": 698228867.0, + "step": 26989 + }, + { + "epoch": 2.9639797935427192, + "grad_norm": 1.8561367988586426, + "learning_rate": 5e-06, + "loss": 0.6659, + "mean_token_accuracy": 0.7746062278747559, + "num_tokens": 698257654.0, + "step": 26990 + }, + { + "epoch": 2.964089611245333, + "grad_norm": 2.1849076747894287, + "learning_rate": 5e-06, + "loss": 0.7039, + "mean_token_accuracy": 0.7588174343109131, + "num_tokens": 698281963.0, + "step": 26991 + }, + { + "epoch": 2.9641994289479463, + "grad_norm": 1.8795782327651978, + "learning_rate": 5e-06, + "loss": 0.7259, + "mean_token_accuracy": 0.7592416405677795, + "num_tokens": 698312409.0, + "step": 26992 + }, + { + "epoch": 2.96430924665056, + "grad_norm": 1.8980125188827515, + "learning_rate": 5e-06, + "loss": 0.7385, + "mean_token_accuracy": 0.7592160701751709, + "num_tokens": 698343311.0, + "step": 26993 + }, + { + "epoch": 2.964419064353174, + "grad_norm": 1.8765928745269775, + "learning_rate": 5e-06, + "loss": 0.7577, + "mean_token_accuracy": 0.7519588470458984, + "num_tokens": 698372936.0, + "step": 26994 + }, + { + "epoch": 2.964528882055787, + "grad_norm": 2.3022713661193848, + "learning_rate": 5e-06, + "loss": 0.6786, + "mean_token_accuracy": 0.7756168842315674, + "num_tokens": 698394775.0, + "step": 26995 + }, + { + "epoch": 2.9646386997584013, + "grad_norm": 1.952514410018921, + "learning_rate": 5e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7484889626502991, + "num_tokens": 698424011.0, + "step": 26996 + }, + { + "epoch": 2.9647485174610146, + "grad_norm": 2.1033365726470947, + "learning_rate": 5e-06, + "loss": 0.6783, + "mean_token_accuracy": 0.7761794328689575, + "num_tokens": 698448224.0, + "step": 26997 + }, + { + "epoch": 2.9648583351636284, + "grad_norm": 2.0252602100372314, + "learning_rate": 5e-06, + "loss": 0.7186, + "mean_token_accuracy": 0.7603808641433716, + "num_tokens": 698474054.0, + "step": 26998 + }, + { + "epoch": 2.964968152866242, + "grad_norm": 1.8257941007614136, + "learning_rate": 5e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7300927639007568, + "num_tokens": 698507719.0, + "step": 26999 + }, + { + "epoch": 2.9650779705688555, + "grad_norm": 1.90569269657135, + "learning_rate": 5e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7517258524894714, + "num_tokens": 698540375.0, + "step": 27000 + }, + { + "epoch": 2.9651877882714692, + "grad_norm": 1.9837687015533447, + "learning_rate": 5e-06, + "loss": 0.7012, + "mean_token_accuracy": 0.7679824233055115, + "num_tokens": 698566612.0, + "step": 27001 + }, + { + "epoch": 2.965297605974083, + "grad_norm": 2.0082061290740967, + "learning_rate": 5e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7311728000640869, + "num_tokens": 698594084.0, + "step": 27002 + }, + { + "epoch": 2.9654074236766967, + "grad_norm": 2.0604920387268066, + "learning_rate": 5e-06, + "loss": 0.6585, + "mean_token_accuracy": 0.7810398936271667, + "num_tokens": 698617302.0, + "step": 27003 + }, + { + "epoch": 2.9655172413793105, + "grad_norm": 1.5963730812072754, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7538129091262817, + "num_tokens": 698657199.0, + "step": 27004 + }, + { + "epoch": 2.965627059081924, + "grad_norm": 2.3140413761138916, + "learning_rate": 5e-06, + "loss": 0.7396, + "mean_token_accuracy": 0.7554877996444702, + "num_tokens": 698678831.0, + "step": 27005 + }, + { + "epoch": 2.9657368767845376, + "grad_norm": 1.9226433038711548, + "learning_rate": 5e-06, + "loss": 0.7363, + "mean_token_accuracy": 0.7660015225410461, + "num_tokens": 698707255.0, + "step": 27006 + }, + { + "epoch": 2.9658466944871513, + "grad_norm": 2.0111641883850098, + "learning_rate": 5e-06, + "loss": 0.7671, + "mean_token_accuracy": 0.7470904588699341, + "num_tokens": 698737233.0, + "step": 27007 + }, + { + "epoch": 2.965956512189765, + "grad_norm": 2.180389165878296, + "learning_rate": 5e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7664313316345215, + "num_tokens": 698761050.0, + "step": 27008 + }, + { + "epoch": 2.966066329892379, + "grad_norm": 2.1053614616394043, + "learning_rate": 5e-06, + "loss": 0.6994, + "mean_token_accuracy": 0.7651981115341187, + "num_tokens": 698785533.0, + "step": 27009 + }, + { + "epoch": 2.966176147594992, + "grad_norm": 2.1893627643585205, + "learning_rate": 5e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.7518199682235718, + "num_tokens": 698811727.0, + "step": 27010 + }, + { + "epoch": 2.966285965297606, + "grad_norm": 2.2702441215515137, + "learning_rate": 5e-06, + "loss": 0.7319, + "mean_token_accuracy": 0.7698510885238647, + "num_tokens": 698834523.0, + "step": 27011 + }, + { + "epoch": 2.9663957830002197, + "grad_norm": 2.00726318359375, + "learning_rate": 5e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7349910736083984, + "num_tokens": 698863117.0, + "step": 27012 + }, + { + "epoch": 2.9665056007028334, + "grad_norm": 1.674036979675293, + "learning_rate": 5e-06, + "loss": 0.725, + "mean_token_accuracy": 0.7626399397850037, + "num_tokens": 698901103.0, + "step": 27013 + }, + { + "epoch": 2.966615418405447, + "grad_norm": 2.1225640773773193, + "learning_rate": 5e-06, + "loss": 0.7886, + "mean_token_accuracy": 0.7441481351852417, + "num_tokens": 698927427.0, + "step": 27014 + }, + { + "epoch": 2.9667252361080605, + "grad_norm": 2.2464828491210938, + "learning_rate": 5e-06, + "loss": 0.6433, + "mean_token_accuracy": 0.7854037284851074, + "num_tokens": 698948290.0, + "step": 27015 + }, + { + "epoch": 2.9668350538106742, + "grad_norm": 1.9564871788024902, + "learning_rate": 5e-06, + "loss": 0.7655, + "mean_token_accuracy": 0.7507920265197754, + "num_tokens": 698974746.0, + "step": 27016 + }, + { + "epoch": 2.966944871513288, + "grad_norm": 1.9237942695617676, + "learning_rate": 5e-06, + "loss": 0.7568, + "mean_token_accuracy": 0.7604386806488037, + "num_tokens": 699003117.0, + "step": 27017 + }, + { + "epoch": 2.9670546892159018, + "grad_norm": 1.900851845741272, + "learning_rate": 5e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.760818600654602, + "num_tokens": 699031170.0, + "step": 27018 + }, + { + "epoch": 2.9671645069185155, + "grad_norm": 2.123811960220337, + "learning_rate": 5e-06, + "loss": 0.7445, + "mean_token_accuracy": 0.7652873992919922, + "num_tokens": 699054574.0, + "step": 27019 + }, + { + "epoch": 2.967274324621129, + "grad_norm": 1.9123811721801758, + "learning_rate": 5e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7264237403869629, + "num_tokens": 699084324.0, + "step": 27020 + }, + { + "epoch": 2.9673841423237426, + "grad_norm": 1.9813487529754639, + "learning_rate": 5e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7505955696105957, + "num_tokens": 699113521.0, + "step": 27021 + }, + { + "epoch": 2.9674939600263563, + "grad_norm": 2.0199482440948486, + "learning_rate": 5e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7460598945617676, + "num_tokens": 699138883.0, + "step": 27022 + }, + { + "epoch": 2.9676037777289697, + "grad_norm": 2.6517770290374756, + "learning_rate": 5e-06, + "loss": 0.681, + "mean_token_accuracy": 0.7799646854400635, + "num_tokens": 699155889.0, + "step": 27023 + }, + { + "epoch": 2.9677135954315834, + "grad_norm": 2.559147834777832, + "learning_rate": 5e-06, + "loss": 0.6848, + "mean_token_accuracy": 0.7742278575897217, + "num_tokens": 699174635.0, + "step": 27024 + }, + { + "epoch": 2.967823413134197, + "grad_norm": 1.8412691354751587, + "learning_rate": 5e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7333921194076538, + "num_tokens": 699205938.0, + "step": 27025 + }, + { + "epoch": 2.967933230836811, + "grad_norm": 2.1902687549591064, + "learning_rate": 5e-06, + "loss": 0.6999, + "mean_token_accuracy": 0.7751641273498535, + "num_tokens": 699228961.0, + "step": 27026 + }, + { + "epoch": 2.9680430485394247, + "grad_norm": 1.9603323936462402, + "learning_rate": 5e-06, + "loss": 0.7292, + "mean_token_accuracy": 0.7628339529037476, + "num_tokens": 699258232.0, + "step": 27027 + }, + { + "epoch": 2.968152866242038, + "grad_norm": 2.1440985202789307, + "learning_rate": 5e-06, + "loss": 0.7023, + "mean_token_accuracy": 0.7757290005683899, + "num_tokens": 699282404.0, + "step": 27028 + }, + { + "epoch": 2.9682626839446518, + "grad_norm": 1.9550139904022217, + "learning_rate": 5e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.757685661315918, + "num_tokens": 699310491.0, + "step": 27029 + }, + { + "epoch": 2.9683725016472655, + "grad_norm": 2.1517114639282227, + "learning_rate": 5e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7639971971511841, + "num_tokens": 699333471.0, + "step": 27030 + }, + { + "epoch": 2.9684823193498793, + "grad_norm": 2.242262601852417, + "learning_rate": 5e-06, + "loss": 0.6706, + "mean_token_accuracy": 0.7728146314620972, + "num_tokens": 699354546.0, + "step": 27031 + }, + { + "epoch": 2.968592137052493, + "grad_norm": 1.9971317052841187, + "learning_rate": 5e-06, + "loss": 0.713, + "mean_token_accuracy": 0.7662525177001953, + "num_tokens": 699382645.0, + "step": 27032 + }, + { + "epoch": 2.9687019547551063, + "grad_norm": 2.1029701232910156, + "learning_rate": 5e-06, + "loss": 0.7099, + "mean_token_accuracy": 0.774112343788147, + "num_tokens": 699406563.0, + "step": 27033 + }, + { + "epoch": 2.96881177245772, + "grad_norm": 2.0740559101104736, + "learning_rate": 5e-06, + "loss": 0.7261, + "mean_token_accuracy": 0.7646679878234863, + "num_tokens": 699432307.0, + "step": 27034 + }, + { + "epoch": 2.968921590160334, + "grad_norm": 2.166776418685913, + "learning_rate": 5e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7569743990898132, + "num_tokens": 699457761.0, + "step": 27035 + }, + { + "epoch": 2.9690314078629476, + "grad_norm": 2.0578906536102295, + "learning_rate": 5e-06, + "loss": 0.6943, + "mean_token_accuracy": 0.7702891826629639, + "num_tokens": 699486253.0, + "step": 27036 + }, + { + "epoch": 2.9691412255655614, + "grad_norm": 1.9527909755706787, + "learning_rate": 5e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7212930917739868, + "num_tokens": 699518574.0, + "step": 27037 + }, + { + "epoch": 2.9692510432681747, + "grad_norm": 1.8029329776763916, + "learning_rate": 5e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7488982081413269, + "num_tokens": 699547422.0, + "step": 27038 + }, + { + "epoch": 2.9693608609707884, + "grad_norm": 2.1115989685058594, + "learning_rate": 5e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7518117427825928, + "num_tokens": 699573601.0, + "step": 27039 + }, + { + "epoch": 2.969470678673402, + "grad_norm": 2.0360770225524902, + "learning_rate": 5e-06, + "loss": 0.6194, + "mean_token_accuracy": 0.794033408164978, + "num_tokens": 699597580.0, + "step": 27040 + }, + { + "epoch": 2.969580496376016, + "grad_norm": 2.1813011169433594, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7632719278335571, + "num_tokens": 699621437.0, + "step": 27041 + }, + { + "epoch": 2.9696903140786297, + "grad_norm": 2.0737102031707764, + "learning_rate": 5e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.7824016213417053, + "num_tokens": 699648283.0, + "step": 27042 + }, + { + "epoch": 2.969800131781243, + "grad_norm": 1.9727225303649902, + "learning_rate": 5e-06, + "loss": 0.737, + "mean_token_accuracy": 0.7601928114891052, + "num_tokens": 699673893.0, + "step": 27043 + }, + { + "epoch": 2.969909949483857, + "grad_norm": 2.1923744678497314, + "learning_rate": 5e-06, + "loss": 0.7675, + "mean_token_accuracy": 0.7577880620956421, + "num_tokens": 699697886.0, + "step": 27044 + }, + { + "epoch": 2.9700197671864705, + "grad_norm": 2.278350591659546, + "learning_rate": 5e-06, + "loss": 0.6689, + "mean_token_accuracy": 0.7837721705436707, + "num_tokens": 699720125.0, + "step": 27045 + }, + { + "epoch": 2.970129584889084, + "grad_norm": 1.7941011190414429, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7348073720932007, + "num_tokens": 699754310.0, + "step": 27046 + }, + { + "epoch": 2.970239402591698, + "grad_norm": 1.9687995910644531, + "learning_rate": 5e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.7383559346199036, + "num_tokens": 699783600.0, + "step": 27047 + }, + { + "epoch": 2.9703492202943114, + "grad_norm": 2.0152289867401123, + "learning_rate": 5e-06, + "loss": 0.694, + "mean_token_accuracy": 0.7750328779220581, + "num_tokens": 699808064.0, + "step": 27048 + }, + { + "epoch": 2.970459037996925, + "grad_norm": 1.8693112134933472, + "learning_rate": 5e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7568395733833313, + "num_tokens": 699837353.0, + "step": 27049 + }, + { + "epoch": 2.970568855699539, + "grad_norm": 1.8146394491195679, + "learning_rate": 5e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7420969009399414, + "num_tokens": 699871350.0, + "step": 27050 + }, + { + "epoch": 2.970678673402152, + "grad_norm": 2.001394271850586, + "learning_rate": 5e-06, + "loss": 0.7809, + "mean_token_accuracy": 0.7488685250282288, + "num_tokens": 699898336.0, + "step": 27051 + }, + { + "epoch": 2.970788491104766, + "grad_norm": 2.1930723190307617, + "learning_rate": 5e-06, + "loss": 0.7426, + "mean_token_accuracy": 0.7581310868263245, + "num_tokens": 699922496.0, + "step": 27052 + }, + { + "epoch": 2.9708983088073797, + "grad_norm": 1.8999061584472656, + "learning_rate": 5e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.749469518661499, + "num_tokens": 699952029.0, + "step": 27053 + }, + { + "epoch": 2.9710081265099935, + "grad_norm": 2.2137694358825684, + "learning_rate": 5e-06, + "loss": 0.6623, + "mean_token_accuracy": 0.7773808836936951, + "num_tokens": 699974610.0, + "step": 27054 + }, + { + "epoch": 2.971117944212607, + "grad_norm": 2.2811272144317627, + "learning_rate": 5e-06, + "loss": 0.6872, + "mean_token_accuracy": 0.7784656286239624, + "num_tokens": 699995965.0, + "step": 27055 + }, + { + "epoch": 2.9712277619152205, + "grad_norm": 1.764363169670105, + "learning_rate": 5e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7564135193824768, + "num_tokens": 700027123.0, + "step": 27056 + }, + { + "epoch": 2.9713375796178343, + "grad_norm": 2.0572621822357178, + "learning_rate": 5e-06, + "loss": 0.7507, + "mean_token_accuracy": 0.7548604011535645, + "num_tokens": 700054659.0, + "step": 27057 + }, + { + "epoch": 2.971447397320448, + "grad_norm": 2.044023275375366, + "learning_rate": 5e-06, + "loss": 0.767, + "mean_token_accuracy": 0.7493311166763306, + "num_tokens": 700080725.0, + "step": 27058 + }, + { + "epoch": 2.971557215023062, + "grad_norm": 1.8950371742248535, + "learning_rate": 5e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7494392395019531, + "num_tokens": 700110098.0, + "step": 27059 + }, + { + "epoch": 2.9716670327256756, + "grad_norm": 1.9420872926712036, + "learning_rate": 5e-06, + "loss": 0.7207, + "mean_token_accuracy": 0.7629915475845337, + "num_tokens": 700138884.0, + "step": 27060 + }, + { + "epoch": 2.971776850428289, + "grad_norm": 2.0616142749786377, + "learning_rate": 5e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7438250780105591, + "num_tokens": 700166311.0, + "step": 27061 + }, + { + "epoch": 2.9718866681309026, + "grad_norm": 2.1448967456817627, + "learning_rate": 5e-06, + "loss": 0.7231, + "mean_token_accuracy": 0.7698748707771301, + "num_tokens": 700190884.0, + "step": 27062 + }, + { + "epoch": 2.9719964858335164, + "grad_norm": 2.066833972930908, + "learning_rate": 5e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7554206848144531, + "num_tokens": 700216897.0, + "step": 27063 + }, + { + "epoch": 2.97210630353613, + "grad_norm": 2.2056691646575928, + "learning_rate": 5e-06, + "loss": 0.6971, + "mean_token_accuracy": 0.7728161811828613, + "num_tokens": 700238305.0, + "step": 27064 + }, + { + "epoch": 2.972216121238744, + "grad_norm": 2.171274423599243, + "learning_rate": 5e-06, + "loss": 0.706, + "mean_token_accuracy": 0.7664328813552856, + "num_tokens": 700262486.0, + "step": 27065 + }, + { + "epoch": 2.972325938941357, + "grad_norm": 2.2274374961853027, + "learning_rate": 5e-06, + "loss": 0.7612, + "mean_token_accuracy": 0.7608296871185303, + "num_tokens": 700286673.0, + "step": 27066 + }, + { + "epoch": 2.972435756643971, + "grad_norm": 1.9241514205932617, + "learning_rate": 5e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7367290258407593, + "num_tokens": 700319721.0, + "step": 27067 + }, + { + "epoch": 2.9725455743465847, + "grad_norm": 1.9706705808639526, + "learning_rate": 5e-06, + "loss": 0.7642, + "mean_token_accuracy": 0.7529561519622803, + "num_tokens": 700348120.0, + "step": 27068 + }, + { + "epoch": 2.9726553920491985, + "grad_norm": 2.0099730491638184, + "learning_rate": 5e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.751901388168335, + "num_tokens": 700375563.0, + "step": 27069 + }, + { + "epoch": 2.9727652097518122, + "grad_norm": 2.1284468173980713, + "learning_rate": 5e-06, + "loss": 0.7645, + "mean_token_accuracy": 0.7533568143844604, + "num_tokens": 700401674.0, + "step": 27070 + }, + { + "epoch": 2.9728750274544256, + "grad_norm": 1.9960623979568481, + "learning_rate": 5e-06, + "loss": 0.7525, + "mean_token_accuracy": 0.7525498867034912, + "num_tokens": 700429570.0, + "step": 27071 + }, + { + "epoch": 2.9729848451570393, + "grad_norm": 2.018781900405884, + "learning_rate": 5e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7325630187988281, + "num_tokens": 700454970.0, + "step": 27072 + }, + { + "epoch": 2.973094662859653, + "grad_norm": 1.9799656867980957, + "learning_rate": 5e-06, + "loss": 0.7187, + "mean_token_accuracy": 0.7675343155860901, + "num_tokens": 700481703.0, + "step": 27073 + }, + { + "epoch": 2.9732044805622664, + "grad_norm": 1.9796984195709229, + "learning_rate": 5e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7496969699859619, + "num_tokens": 700510514.0, + "step": 27074 + }, + { + "epoch": 2.97331429826488, + "grad_norm": 1.9335455894470215, + "learning_rate": 5e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.726497232913971, + "num_tokens": 700541430.0, + "step": 27075 + }, + { + "epoch": 2.973424115967494, + "grad_norm": 2.223788261413574, + "learning_rate": 5e-06, + "loss": 0.7482, + "mean_token_accuracy": 0.7519456148147583, + "num_tokens": 700565285.0, + "step": 27076 + }, + { + "epoch": 2.9735339336701077, + "grad_norm": 1.9989607334136963, + "learning_rate": 5e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7474474906921387, + "num_tokens": 700593653.0, + "step": 27077 + }, + { + "epoch": 2.9736437513727214, + "grad_norm": 2.003324031829834, + "learning_rate": 5e-06, + "loss": 0.6976, + "mean_token_accuracy": 0.7738258838653564, + "num_tokens": 700620909.0, + "step": 27078 + }, + { + "epoch": 2.9737535690753347, + "grad_norm": 1.9058547019958496, + "learning_rate": 5e-06, + "loss": 0.7332, + "mean_token_accuracy": 0.7605751752853394, + "num_tokens": 700650284.0, + "step": 27079 + }, + { + "epoch": 2.9738633867779485, + "grad_norm": 2.108828544616699, + "learning_rate": 5e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7608788013458252, + "num_tokens": 700675571.0, + "step": 27080 + }, + { + "epoch": 2.9739732044805622, + "grad_norm": 2.3909411430358887, + "learning_rate": 5e-06, + "loss": 0.7333, + "mean_token_accuracy": 0.7591875791549683, + "num_tokens": 700696321.0, + "step": 27081 + }, + { + "epoch": 2.974083022183176, + "grad_norm": 2.181835174560547, + "learning_rate": 5e-06, + "loss": 0.6727, + "mean_token_accuracy": 0.773838460445404, + "num_tokens": 700719048.0, + "step": 27082 + }, + { + "epoch": 2.9741928398857898, + "grad_norm": 2.1640522480010986, + "learning_rate": 5e-06, + "loss": 0.7395, + "mean_token_accuracy": 0.7553441524505615, + "num_tokens": 700738890.0, + "step": 27083 + }, + { + "epoch": 2.974302657588403, + "grad_norm": 2.134181022644043, + "learning_rate": 5e-06, + "loss": 0.6965, + "mean_token_accuracy": 0.7715543508529663, + "num_tokens": 700763680.0, + "step": 27084 + }, + { + "epoch": 2.974412475291017, + "grad_norm": 2.327378034591675, + "learning_rate": 5e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7605143785476685, + "num_tokens": 700786292.0, + "step": 27085 + }, + { + "epoch": 2.9745222929936306, + "grad_norm": 2.099116563796997, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7635714411735535, + "num_tokens": 700811795.0, + "step": 27086 + }, + { + "epoch": 2.9746321106962443, + "grad_norm": 2.0654265880584717, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7546854019165039, + "num_tokens": 700840426.0, + "step": 27087 + }, + { + "epoch": 2.974741928398858, + "grad_norm": 2.148594379425049, + "learning_rate": 5e-06, + "loss": 0.7506, + "mean_token_accuracy": 0.7536888718605042, + "num_tokens": 700866966.0, + "step": 27088 + }, + { + "epoch": 2.9748517461014714, + "grad_norm": 2.0703341960906982, + "learning_rate": 5e-06, + "loss": 0.724, + "mean_token_accuracy": 0.7602386474609375, + "num_tokens": 700892869.0, + "step": 27089 + }, + { + "epoch": 2.974961563804085, + "grad_norm": 2.003143548965454, + "learning_rate": 5e-06, + "loss": 0.7606, + "mean_token_accuracy": 0.7562717199325562, + "num_tokens": 700918763.0, + "step": 27090 + }, + { + "epoch": 2.975071381506699, + "grad_norm": 2.05425763130188, + "learning_rate": 5e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7551957964897156, + "num_tokens": 700946988.0, + "step": 27091 + }, + { + "epoch": 2.9751811992093127, + "grad_norm": 1.8444684743881226, + "learning_rate": 5e-06, + "loss": 0.7828, + "mean_token_accuracy": 0.7534648180007935, + "num_tokens": 700978691.0, + "step": 27092 + }, + { + "epoch": 2.9752910169119264, + "grad_norm": 2.235727071762085, + "learning_rate": 5e-06, + "loss": 0.7319, + "mean_token_accuracy": 0.7669500112533569, + "num_tokens": 701001138.0, + "step": 27093 + }, + { + "epoch": 2.9754008346145397, + "grad_norm": 2.0179381370544434, + "learning_rate": 5e-06, + "loss": 0.7572, + "mean_token_accuracy": 0.7568233013153076, + "num_tokens": 701027948.0, + "step": 27094 + }, + { + "epoch": 2.9755106523171535, + "grad_norm": 1.929787039756775, + "learning_rate": 5e-06, + "loss": 0.7438, + "mean_token_accuracy": 0.7562170028686523, + "num_tokens": 701056121.0, + "step": 27095 + }, + { + "epoch": 2.9756204700197673, + "grad_norm": 2.0201334953308105, + "learning_rate": 5e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7440119981765747, + "num_tokens": 701082666.0, + "step": 27096 + }, + { + "epoch": 2.9757302877223806, + "grad_norm": 2.1287524700164795, + "learning_rate": 5e-06, + "loss": 0.7889, + "mean_token_accuracy": 0.751482367515564, + "num_tokens": 701106763.0, + "step": 27097 + }, + { + "epoch": 2.9758401054249948, + "grad_norm": 2.1391026973724365, + "learning_rate": 5e-06, + "loss": 0.7391, + "mean_token_accuracy": 0.7568744421005249, + "num_tokens": 701131048.0, + "step": 27098 + }, + { + "epoch": 2.975949923127608, + "grad_norm": 2.0196428298950195, + "learning_rate": 5e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7528734803199768, + "num_tokens": 701157629.0, + "step": 27099 + }, + { + "epoch": 2.976059740830222, + "grad_norm": 1.9692710638046265, + "learning_rate": 5e-06, + "loss": 0.6812, + "mean_token_accuracy": 0.7747727632522583, + "num_tokens": 701183900.0, + "step": 27100 + }, + { + "epoch": 2.9761695585328356, + "grad_norm": 2.1304450035095215, + "learning_rate": 5e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7574701309204102, + "num_tokens": 701208663.0, + "step": 27101 + }, + { + "epoch": 2.976279376235449, + "grad_norm": 2.1916589736938477, + "learning_rate": 5e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7315006852149963, + "num_tokens": 701234552.0, + "step": 27102 + }, + { + "epoch": 2.9763891939380627, + "grad_norm": 2.1216957569122314, + "learning_rate": 5e-06, + "loss": 0.6926, + "mean_token_accuracy": 0.7647953033447266, + "num_tokens": 701258110.0, + "step": 27103 + }, + { + "epoch": 2.9764990116406764, + "grad_norm": 1.7722585201263428, + "learning_rate": 5e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.775175929069519, + "num_tokens": 701290100.0, + "step": 27104 + }, + { + "epoch": 2.97660882934329, + "grad_norm": 2.2018864154815674, + "learning_rate": 5e-06, + "loss": 0.7098, + "mean_token_accuracy": 0.765841007232666, + "num_tokens": 701313345.0, + "step": 27105 + }, + { + "epoch": 2.976718647045904, + "grad_norm": 1.9239553213119507, + "learning_rate": 5e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.7518360614776611, + "num_tokens": 701342917.0, + "step": 27106 + }, + { + "epoch": 2.9768284647485173, + "grad_norm": 1.8946486711502075, + "learning_rate": 5e-06, + "loss": 0.697, + "mean_token_accuracy": 0.766069769859314, + "num_tokens": 701370151.0, + "step": 27107 + }, + { + "epoch": 2.976938282451131, + "grad_norm": 1.9663437604904175, + "learning_rate": 5e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7312474846839905, + "num_tokens": 701400265.0, + "step": 27108 + }, + { + "epoch": 2.9770481001537448, + "grad_norm": 2.0477757453918457, + "learning_rate": 5e-06, + "loss": 0.7676, + "mean_token_accuracy": 0.7555281519889832, + "num_tokens": 701427186.0, + "step": 27109 + }, + { + "epoch": 2.9771579178563585, + "grad_norm": 2.4720253944396973, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7545377016067505, + "num_tokens": 701447766.0, + "step": 27110 + }, + { + "epoch": 2.9772677355589723, + "grad_norm": 1.8471513986587524, + "learning_rate": 5e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7268657684326172, + "num_tokens": 701484209.0, + "step": 27111 + }, + { + "epoch": 2.9773775532615856, + "grad_norm": 2.275289297103882, + "learning_rate": 5e-06, + "loss": 0.7686, + "mean_token_accuracy": 0.7556084990501404, + "num_tokens": 701509005.0, + "step": 27112 + }, + { + "epoch": 2.9774873709641994, + "grad_norm": 2.1662237644195557, + "learning_rate": 5e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7388207912445068, + "num_tokens": 701535068.0, + "step": 27113 + }, + { + "epoch": 2.977597188666813, + "grad_norm": 2.142929792404175, + "learning_rate": 5e-06, + "loss": 0.7149, + "mean_token_accuracy": 0.7690955400466919, + "num_tokens": 701559202.0, + "step": 27114 + }, + { + "epoch": 2.977707006369427, + "grad_norm": 2.1479406356811523, + "learning_rate": 5e-06, + "loss": 0.6233, + "mean_token_accuracy": 0.7895199060440063, + "num_tokens": 701580893.0, + "step": 27115 + }, + { + "epoch": 2.9778168240720406, + "grad_norm": 2.0251097679138184, + "learning_rate": 5e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7666681408882141, + "num_tokens": 701608805.0, + "step": 27116 + }, + { + "epoch": 2.977926641774654, + "grad_norm": 1.984592080116272, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7652870416641235, + "num_tokens": 701636118.0, + "step": 27117 + }, + { + "epoch": 2.9780364594772677, + "grad_norm": 2.2665069103240967, + "learning_rate": 5e-06, + "loss": 0.662, + "mean_token_accuracy": 0.7784823179244995, + "num_tokens": 701657995.0, + "step": 27118 + }, + { + "epoch": 2.9781462771798815, + "grad_norm": 2.419893503189087, + "learning_rate": 5e-06, + "loss": 0.682, + "mean_token_accuracy": 0.7768039107322693, + "num_tokens": 701677789.0, + "step": 27119 + }, + { + "epoch": 2.978256094882495, + "grad_norm": 1.9291497468948364, + "learning_rate": 5e-06, + "loss": 0.7456, + "mean_token_accuracy": 0.7589594721794128, + "num_tokens": 701704754.0, + "step": 27120 + }, + { + "epoch": 2.978365912585109, + "grad_norm": 2.0234625339508057, + "learning_rate": 5e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7485381364822388, + "num_tokens": 701731439.0, + "step": 27121 + }, + { + "epoch": 2.9784757302877223, + "grad_norm": 1.7886302471160889, + "learning_rate": 5e-06, + "loss": 0.7136, + "mean_token_accuracy": 0.7676935791969299, + "num_tokens": 701763776.0, + "step": 27122 + }, + { + "epoch": 2.978585547990336, + "grad_norm": 1.9285964965820312, + "learning_rate": 5e-06, + "loss": 0.7496, + "mean_token_accuracy": 0.7637736201286316, + "num_tokens": 701792336.0, + "step": 27123 + }, + { + "epoch": 2.97869536569295, + "grad_norm": 2.38531756401062, + "learning_rate": 5e-06, + "loss": 0.6805, + "mean_token_accuracy": 0.7781360745429993, + "num_tokens": 701811804.0, + "step": 27124 + }, + { + "epoch": 2.978805183395563, + "grad_norm": 2.1725358963012695, + "learning_rate": 5e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7433443665504456, + "num_tokens": 701836505.0, + "step": 27125 + }, + { + "epoch": 2.978915001098177, + "grad_norm": 1.999627947807312, + "learning_rate": 5e-06, + "loss": 0.6802, + "mean_token_accuracy": 0.7782177329063416, + "num_tokens": 701860509.0, + "step": 27126 + }, + { + "epoch": 2.9790248188007906, + "grad_norm": 2.1292035579681396, + "learning_rate": 5e-06, + "loss": 0.6645, + "mean_token_accuracy": 0.7788438200950623, + "num_tokens": 701883929.0, + "step": 27127 + }, + { + "epoch": 2.9791346365034044, + "grad_norm": 2.013197183609009, + "learning_rate": 5e-06, + "loss": 0.6546, + "mean_token_accuracy": 0.7810341715812683, + "num_tokens": 701909602.0, + "step": 27128 + }, + { + "epoch": 2.979244454206018, + "grad_norm": 2.1369011402130127, + "learning_rate": 5e-06, + "loss": 0.7171, + "mean_token_accuracy": 0.7734395265579224, + "num_tokens": 701933913.0, + "step": 27129 + }, + { + "epoch": 2.9793542719086314, + "grad_norm": 1.8096674680709839, + "learning_rate": 5e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7446979284286499, + "num_tokens": 701964682.0, + "step": 27130 + }, + { + "epoch": 2.979464089611245, + "grad_norm": 2.2154791355133057, + "learning_rate": 5e-06, + "loss": 0.669, + "mean_token_accuracy": 0.7764456272125244, + "num_tokens": 701990029.0, + "step": 27131 + }, + { + "epoch": 2.979573907313859, + "grad_norm": 2.059483766555786, + "learning_rate": 5e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7431584000587463, + "num_tokens": 702017046.0, + "step": 27132 + }, + { + "epoch": 2.9796837250164727, + "grad_norm": 1.9957976341247559, + "learning_rate": 5e-06, + "loss": 0.6231, + "mean_token_accuracy": 0.7844551205635071, + "num_tokens": 702041712.0, + "step": 27133 + }, + { + "epoch": 2.9797935427190865, + "grad_norm": 2.230375289916992, + "learning_rate": 5e-06, + "loss": 0.7146, + "mean_token_accuracy": 0.7689576148986816, + "num_tokens": 702065171.0, + "step": 27134 + }, + { + "epoch": 2.9799033604217, + "grad_norm": 1.8166111707687378, + "learning_rate": 5e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7467924356460571, + "num_tokens": 702097824.0, + "step": 27135 + }, + { + "epoch": 2.9800131781243135, + "grad_norm": 1.8204131126403809, + "learning_rate": 5e-06, + "loss": 0.7886, + "mean_token_accuracy": 0.746313214302063, + "num_tokens": 702131816.0, + "step": 27136 + }, + { + "epoch": 2.9801229958269273, + "grad_norm": 1.908202052116394, + "learning_rate": 5e-06, + "loss": 0.773, + "mean_token_accuracy": 0.7472954988479614, + "num_tokens": 702161119.0, + "step": 27137 + }, + { + "epoch": 2.980232813529541, + "grad_norm": 2.198357582092285, + "learning_rate": 5e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7479391098022461, + "num_tokens": 702186122.0, + "step": 27138 + }, + { + "epoch": 2.980342631232155, + "grad_norm": 2.193528890609741, + "learning_rate": 5e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7543177604675293, + "num_tokens": 702208834.0, + "step": 27139 + }, + { + "epoch": 2.980452448934768, + "grad_norm": 1.963413953781128, + "learning_rate": 5e-06, + "loss": 0.7383, + "mean_token_accuracy": 0.7554246187210083, + "num_tokens": 702235059.0, + "step": 27140 + }, + { + "epoch": 2.980562266637382, + "grad_norm": 1.8807498216629028, + "learning_rate": 5e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7472419142723083, + "num_tokens": 702264816.0, + "step": 27141 + }, + { + "epoch": 2.9806720843399956, + "grad_norm": 1.6763485670089722, + "learning_rate": 5e-06, + "loss": 0.728, + "mean_token_accuracy": 0.7716208696365356, + "num_tokens": 702297593.0, + "step": 27142 + }, + { + "epoch": 2.9807819020426094, + "grad_norm": 2.235888957977295, + "learning_rate": 5e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7741875648498535, + "num_tokens": 702318489.0, + "step": 27143 + }, + { + "epoch": 2.980891719745223, + "grad_norm": 2.139869451522827, + "learning_rate": 5e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7728232741355896, + "num_tokens": 702341223.0, + "step": 27144 + }, + { + "epoch": 2.9810015374478365, + "grad_norm": 2.1331989765167236, + "learning_rate": 5e-06, + "loss": 0.6877, + "mean_token_accuracy": 0.7729363441467285, + "num_tokens": 702366831.0, + "step": 27145 + }, + { + "epoch": 2.9811113551504502, + "grad_norm": 2.254009962081909, + "learning_rate": 5e-06, + "loss": 0.6745, + "mean_token_accuracy": 0.7761301398277283, + "num_tokens": 702389551.0, + "step": 27146 + }, + { + "epoch": 2.981221172853064, + "grad_norm": 2.171766757965088, + "learning_rate": 5e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7625067234039307, + "num_tokens": 702413401.0, + "step": 27147 + }, + { + "epoch": 2.9813309905556777, + "grad_norm": 1.9431817531585693, + "learning_rate": 5e-06, + "loss": 0.6584, + "mean_token_accuracy": 0.7770564556121826, + "num_tokens": 702439525.0, + "step": 27148 + }, + { + "epoch": 2.9814408082582915, + "grad_norm": 1.9152182340621948, + "learning_rate": 5e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7489480376243591, + "num_tokens": 702469639.0, + "step": 27149 + }, + { + "epoch": 2.981550625960905, + "grad_norm": 2.076939344406128, + "learning_rate": 5e-06, + "loss": 0.707, + "mean_token_accuracy": 0.7663286328315735, + "num_tokens": 702493623.0, + "step": 27150 + }, + { + "epoch": 2.9816604436635186, + "grad_norm": 2.059016227722168, + "learning_rate": 5e-06, + "loss": 0.73, + "mean_token_accuracy": 0.76114422082901, + "num_tokens": 702517662.0, + "step": 27151 + }, + { + "epoch": 2.9817702613661323, + "grad_norm": 1.9915025234222412, + "learning_rate": 5e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7302210330963135, + "num_tokens": 702546295.0, + "step": 27152 + }, + { + "epoch": 2.9818800790687456, + "grad_norm": 2.127483606338501, + "learning_rate": 5e-06, + "loss": 0.7326, + "mean_token_accuracy": 0.7616314888000488, + "num_tokens": 702571510.0, + "step": 27153 + }, + { + "epoch": 2.9819898967713594, + "grad_norm": 1.9279916286468506, + "learning_rate": 5e-06, + "loss": 0.721, + "mean_token_accuracy": 0.7620598077774048, + "num_tokens": 702599698.0, + "step": 27154 + }, + { + "epoch": 2.982099714473973, + "grad_norm": 2.198343515396118, + "learning_rate": 5e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7641558051109314, + "num_tokens": 702623212.0, + "step": 27155 + }, + { + "epoch": 2.982209532176587, + "grad_norm": 1.8227078914642334, + "learning_rate": 5e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7464514970779419, + "num_tokens": 702652341.0, + "step": 27156 + }, + { + "epoch": 2.9823193498792007, + "grad_norm": 2.177884340286255, + "learning_rate": 5e-06, + "loss": 0.6401, + "mean_token_accuracy": 0.7853357791900635, + "num_tokens": 702674211.0, + "step": 27157 + }, + { + "epoch": 2.982429167581814, + "grad_norm": 1.8961409330368042, + "learning_rate": 5e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7580593824386597, + "num_tokens": 702702732.0, + "step": 27158 + }, + { + "epoch": 2.9825389852844277, + "grad_norm": 1.9655039310455322, + "learning_rate": 5e-06, + "loss": 0.7436, + "mean_token_accuracy": 0.7582334280014038, + "num_tokens": 702732067.0, + "step": 27159 + }, + { + "epoch": 2.9826488029870415, + "grad_norm": 1.7587472200393677, + "learning_rate": 5e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7506470680236816, + "num_tokens": 702767214.0, + "step": 27160 + }, + { + "epoch": 2.9827586206896552, + "grad_norm": 2.2420902252197266, + "learning_rate": 5e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7505297064781189, + "num_tokens": 702791666.0, + "step": 27161 + }, + { + "epoch": 2.982868438392269, + "grad_norm": 1.9009188413619995, + "learning_rate": 5e-06, + "loss": 0.7189, + "mean_token_accuracy": 0.7690889835357666, + "num_tokens": 702820795.0, + "step": 27162 + }, + { + "epoch": 2.9829782560948823, + "grad_norm": 1.9528082609176636, + "learning_rate": 5e-06, + "loss": 0.6911, + "mean_token_accuracy": 0.7779634594917297, + "num_tokens": 702845687.0, + "step": 27163 + }, + { + "epoch": 2.983088073797496, + "grad_norm": 2.502408266067505, + "learning_rate": 5e-06, + "loss": 0.6666, + "mean_token_accuracy": 0.7745718359947205, + "num_tokens": 702864587.0, + "step": 27164 + }, + { + "epoch": 2.98319789150011, + "grad_norm": 2.2707433700561523, + "learning_rate": 5e-06, + "loss": 0.7386, + "mean_token_accuracy": 0.7568204402923584, + "num_tokens": 702886981.0, + "step": 27165 + }, + { + "epoch": 2.9833077092027236, + "grad_norm": 2.072587728500366, + "learning_rate": 5e-06, + "loss": 0.7228, + "mean_token_accuracy": 0.7616703510284424, + "num_tokens": 702912380.0, + "step": 27166 + }, + { + "epoch": 2.9834175269053373, + "grad_norm": 2.0476150512695312, + "learning_rate": 5e-06, + "loss": 0.6596, + "mean_token_accuracy": 0.7813716530799866, + "num_tokens": 702936228.0, + "step": 27167 + }, + { + "epoch": 2.9835273446079507, + "grad_norm": 2.011667490005493, + "learning_rate": 5e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7679638862609863, + "num_tokens": 702960343.0, + "step": 27168 + }, + { + "epoch": 2.9836371623105644, + "grad_norm": 2.1001875400543213, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7677419185638428, + "num_tokens": 702983553.0, + "step": 27169 + }, + { + "epoch": 2.983746980013178, + "grad_norm": 2.3013217449188232, + "learning_rate": 5e-06, + "loss": 0.6892, + "mean_token_accuracy": 0.7734529376029968, + "num_tokens": 703006097.0, + "step": 27170 + }, + { + "epoch": 2.983856797715792, + "grad_norm": 2.3306915760040283, + "learning_rate": 5e-06, + "loss": 0.6909, + "mean_token_accuracy": 0.7883353233337402, + "num_tokens": 703024493.0, + "step": 27171 + }, + { + "epoch": 2.9839666154184057, + "grad_norm": 2.040519952774048, + "learning_rate": 5e-06, + "loss": 0.7492, + "mean_token_accuracy": 0.7578049302101135, + "num_tokens": 703051933.0, + "step": 27172 + }, + { + "epoch": 2.984076433121019, + "grad_norm": 2.1957597732543945, + "learning_rate": 5e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7582173943519592, + "num_tokens": 703075539.0, + "step": 27173 + }, + { + "epoch": 2.9841862508236328, + "grad_norm": 1.863752841949463, + "learning_rate": 5e-06, + "loss": 0.7461, + "mean_token_accuracy": 0.7576528191566467, + "num_tokens": 703104822.0, + "step": 27174 + }, + { + "epoch": 2.9842960685262465, + "grad_norm": 1.9644606113433838, + "learning_rate": 5e-06, + "loss": 0.7364, + "mean_token_accuracy": 0.7609843015670776, + "num_tokens": 703132176.0, + "step": 27175 + }, + { + "epoch": 2.98440588622886, + "grad_norm": 2.1700191497802734, + "learning_rate": 5e-06, + "loss": 0.6581, + "mean_token_accuracy": 0.7831766605377197, + "num_tokens": 703153399.0, + "step": 27176 + }, + { + "epoch": 2.984515703931474, + "grad_norm": 2.229109287261963, + "learning_rate": 5e-06, + "loss": 0.5958, + "mean_token_accuracy": 0.7995840311050415, + "num_tokens": 703172648.0, + "step": 27177 + }, + { + "epoch": 2.9846255216340873, + "grad_norm": 2.2125954627990723, + "learning_rate": 5e-06, + "loss": 0.7217, + "mean_token_accuracy": 0.7701269388198853, + "num_tokens": 703197397.0, + "step": 27178 + }, + { + "epoch": 2.984735339336701, + "grad_norm": 2.272653102874756, + "learning_rate": 5e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7785575985908508, + "num_tokens": 703217275.0, + "step": 27179 + }, + { + "epoch": 2.984845157039315, + "grad_norm": 1.899300456047058, + "learning_rate": 5e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7499327063560486, + "num_tokens": 703248075.0, + "step": 27180 + }, + { + "epoch": 2.984954974741928, + "grad_norm": 2.1563150882720947, + "learning_rate": 5e-06, + "loss": 0.7093, + "mean_token_accuracy": 0.7613628506660461, + "num_tokens": 703272200.0, + "step": 27181 + }, + { + "epoch": 2.985064792444542, + "grad_norm": 2.3334431648254395, + "learning_rate": 5e-06, + "loss": 0.6932, + "mean_token_accuracy": 0.7702757716178894, + "num_tokens": 703294272.0, + "step": 27182 + }, + { + "epoch": 2.9851746101471557, + "grad_norm": 1.8068938255310059, + "learning_rate": 5e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7462736368179321, + "num_tokens": 703325861.0, + "step": 27183 + }, + { + "epoch": 2.9852844278497694, + "grad_norm": 2.1011359691619873, + "learning_rate": 5e-06, + "loss": 0.7179, + "mean_token_accuracy": 0.7642280459403992, + "num_tokens": 703352581.0, + "step": 27184 + }, + { + "epoch": 2.985394245552383, + "grad_norm": 1.966099739074707, + "learning_rate": 5e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7435729503631592, + "num_tokens": 703382102.0, + "step": 27185 + }, + { + "epoch": 2.9855040632549965, + "grad_norm": 2.0241076946258545, + "learning_rate": 5e-06, + "loss": 0.7728, + "mean_token_accuracy": 0.7528478503227234, + "num_tokens": 703408860.0, + "step": 27186 + }, + { + "epoch": 2.9856138809576103, + "grad_norm": 1.9297376871109009, + "learning_rate": 5e-06, + "loss": 0.7736, + "mean_token_accuracy": 0.7429432272911072, + "num_tokens": 703437317.0, + "step": 27187 + }, + { + "epoch": 2.985723698660224, + "grad_norm": 1.8476101160049438, + "learning_rate": 5e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7282931804656982, + "num_tokens": 703466943.0, + "step": 27188 + }, + { + "epoch": 2.985833516362838, + "grad_norm": 2.119135856628418, + "learning_rate": 5e-06, + "loss": 0.7262, + "mean_token_accuracy": 0.7648259401321411, + "num_tokens": 703491671.0, + "step": 27189 + }, + { + "epoch": 2.9859433340654515, + "grad_norm": 2.017667055130005, + "learning_rate": 5e-06, + "loss": 0.78, + "mean_token_accuracy": 0.757783830165863, + "num_tokens": 703519776.0, + "step": 27190 + }, + { + "epoch": 2.986053151768065, + "grad_norm": 1.8940210342407227, + "learning_rate": 5e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7458692789077759, + "num_tokens": 703550089.0, + "step": 27191 + }, + { + "epoch": 2.9861629694706786, + "grad_norm": 2.0544373989105225, + "learning_rate": 5e-06, + "loss": 0.6141, + "mean_token_accuracy": 0.7971017360687256, + "num_tokens": 703573049.0, + "step": 27192 + }, + { + "epoch": 2.9862727871732924, + "grad_norm": 2.1398308277130127, + "learning_rate": 5e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7518090009689331, + "num_tokens": 703599799.0, + "step": 27193 + }, + { + "epoch": 2.986382604875906, + "grad_norm": 2.1674880981445312, + "learning_rate": 5e-06, + "loss": 0.7352, + "mean_token_accuracy": 0.761676549911499, + "num_tokens": 703623253.0, + "step": 27194 + }, + { + "epoch": 2.98649242257852, + "grad_norm": 1.739173173904419, + "learning_rate": 5e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7440975904464722, + "num_tokens": 703658679.0, + "step": 27195 + }, + { + "epoch": 2.986602240281133, + "grad_norm": 2.102893352508545, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7669521570205688, + "num_tokens": 703683186.0, + "step": 27196 + }, + { + "epoch": 2.986712057983747, + "grad_norm": 2.0266833305358887, + "learning_rate": 5e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7566920518875122, + "num_tokens": 703710677.0, + "step": 27197 + }, + { + "epoch": 2.9868218756863607, + "grad_norm": 2.3029775619506836, + "learning_rate": 5e-06, + "loss": 0.6174, + "mean_token_accuracy": 0.7908950448036194, + "num_tokens": 703731631.0, + "step": 27198 + }, + { + "epoch": 2.9869316933889745, + "grad_norm": 2.143888235092163, + "learning_rate": 5e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7667539119720459, + "num_tokens": 703755955.0, + "step": 27199 + }, + { + "epoch": 2.987041511091588, + "grad_norm": 2.040875196456909, + "learning_rate": 5e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7598901987075806, + "num_tokens": 703784337.0, + "step": 27200 + }, + { + "epoch": 2.9871513287942015, + "grad_norm": 2.134685754776001, + "learning_rate": 5e-06, + "loss": 0.7161, + "mean_token_accuracy": 0.7685558199882507, + "num_tokens": 703808678.0, + "step": 27201 + }, + { + "epoch": 2.9872611464968153, + "grad_norm": 1.9525071382522583, + "learning_rate": 5e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7457132339477539, + "num_tokens": 703839160.0, + "step": 27202 + }, + { + "epoch": 2.987370964199429, + "grad_norm": 2.075857400894165, + "learning_rate": 5e-06, + "loss": 0.6475, + "mean_token_accuracy": 0.7783927917480469, + "num_tokens": 703864778.0, + "step": 27203 + }, + { + "epoch": 2.9874807819020424, + "grad_norm": 2.011563301086426, + "learning_rate": 5e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7590968012809753, + "num_tokens": 703890861.0, + "step": 27204 + }, + { + "epoch": 2.987590599604656, + "grad_norm": 2.0497992038726807, + "learning_rate": 5e-06, + "loss": 0.6255, + "mean_token_accuracy": 0.7983752489089966, + "num_tokens": 703915134.0, + "step": 27205 + }, + { + "epoch": 2.98770041730727, + "grad_norm": 2.2768232822418213, + "learning_rate": 5e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7460607290267944, + "num_tokens": 703939118.0, + "step": 27206 + }, + { + "epoch": 2.9878102350098836, + "grad_norm": 1.9805220365524292, + "learning_rate": 5e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7520835399627686, + "num_tokens": 703966913.0, + "step": 27207 + }, + { + "epoch": 2.9879200527124974, + "grad_norm": 1.9925585985183716, + "learning_rate": 5e-06, + "loss": 0.7025, + "mean_token_accuracy": 0.7669262886047363, + "num_tokens": 703994862.0, + "step": 27208 + }, + { + "epoch": 2.9880298704151107, + "grad_norm": 2.1903576850891113, + "learning_rate": 5e-06, + "loss": 0.7168, + "mean_token_accuracy": 0.7585593461990356, + "num_tokens": 704017481.0, + "step": 27209 + }, + { + "epoch": 2.9881396881177245, + "grad_norm": 1.7695767879486084, + "learning_rate": 5e-06, + "loss": 0.7054, + "mean_token_accuracy": 0.7674912810325623, + "num_tokens": 704050595.0, + "step": 27210 + }, + { + "epoch": 2.988249505820338, + "grad_norm": 2.2060375213623047, + "learning_rate": 5e-06, + "loss": 0.7254, + "mean_token_accuracy": 0.7653603553771973, + "num_tokens": 704072889.0, + "step": 27211 + }, + { + "epoch": 2.988359323522952, + "grad_norm": 1.8497542142868042, + "learning_rate": 5e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7577003240585327, + "num_tokens": 704101648.0, + "step": 27212 + }, + { + "epoch": 2.9884691412255657, + "grad_norm": 2.2998111248016357, + "learning_rate": 5e-06, + "loss": 0.6647, + "mean_token_accuracy": 0.7804442644119263, + "num_tokens": 704121964.0, + "step": 27213 + }, + { + "epoch": 2.988578958928179, + "grad_norm": 2.287057638168335, + "learning_rate": 5e-06, + "loss": 0.7271, + "mean_token_accuracy": 0.7612866163253784, + "num_tokens": 704143675.0, + "step": 27214 + }, + { + "epoch": 2.988688776630793, + "grad_norm": 2.1539740562438965, + "learning_rate": 5e-06, + "loss": 0.6561, + "mean_token_accuracy": 0.7792551517486572, + "num_tokens": 704166534.0, + "step": 27215 + }, + { + "epoch": 2.9887985943334066, + "grad_norm": 2.1774001121520996, + "learning_rate": 5e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7406942844390869, + "num_tokens": 704189603.0, + "step": 27216 + }, + { + "epoch": 2.9889084120360203, + "grad_norm": 1.8578195571899414, + "learning_rate": 5e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7489118576049805, + "num_tokens": 704222123.0, + "step": 27217 + }, + { + "epoch": 2.989018229738634, + "grad_norm": 1.9282829761505127, + "learning_rate": 5e-06, + "loss": 0.7515, + "mean_token_accuracy": 0.7544881701469421, + "num_tokens": 704250474.0, + "step": 27218 + }, + { + "epoch": 2.9891280474412474, + "grad_norm": 2.244403600692749, + "learning_rate": 5e-06, + "loss": 0.6772, + "mean_token_accuracy": 0.7747241258621216, + "num_tokens": 704275244.0, + "step": 27219 + }, + { + "epoch": 2.989237865143861, + "grad_norm": 2.3661081790924072, + "learning_rate": 5e-06, + "loss": 0.6479, + "mean_token_accuracy": 0.7809774875640869, + "num_tokens": 704294930.0, + "step": 27220 + }, + { + "epoch": 2.989347682846475, + "grad_norm": 1.9139549732208252, + "learning_rate": 5e-06, + "loss": 0.7182, + "mean_token_accuracy": 0.7701719999313354, + "num_tokens": 704322572.0, + "step": 27221 + }, + { + "epoch": 2.9894575005490887, + "grad_norm": 2.026303291320801, + "learning_rate": 5e-06, + "loss": 0.6856, + "mean_token_accuracy": 0.7757582664489746, + "num_tokens": 704348303.0, + "step": 27222 + }, + { + "epoch": 2.9895673182517024, + "grad_norm": 2.0675597190856934, + "learning_rate": 5e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7578806281089783, + "num_tokens": 704372934.0, + "step": 27223 + }, + { + "epoch": 2.9896771359543157, + "grad_norm": 1.9688184261322021, + "learning_rate": 5e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.7664205431938171, + "num_tokens": 704400777.0, + "step": 27224 + }, + { + "epoch": 2.9897869536569295, + "grad_norm": 1.9304364919662476, + "learning_rate": 5e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7681739926338196, + "num_tokens": 704425611.0, + "step": 27225 + }, + { + "epoch": 2.9898967713595432, + "grad_norm": 1.7416802644729614, + "learning_rate": 5e-06, + "loss": 0.6723, + "mean_token_accuracy": 0.7789955735206604, + "num_tokens": 704459547.0, + "step": 27226 + }, + { + "epoch": 2.9900065890621566, + "grad_norm": 2.039015531539917, + "learning_rate": 5e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7562178373336792, + "num_tokens": 704489237.0, + "step": 27227 + }, + { + "epoch": 2.9901164067647708, + "grad_norm": 2.033597707748413, + "learning_rate": 5e-06, + "loss": 0.6595, + "mean_token_accuracy": 0.7818505764007568, + "num_tokens": 704514930.0, + "step": 27228 + }, + { + "epoch": 2.990226224467384, + "grad_norm": 1.967936396598816, + "learning_rate": 5e-06, + "loss": 0.7131, + "mean_token_accuracy": 0.7643559575080872, + "num_tokens": 704543725.0, + "step": 27229 + }, + { + "epoch": 2.990336042169998, + "grad_norm": 2.0190482139587402, + "learning_rate": 5e-06, + "loss": 0.7106, + "mean_token_accuracy": 0.7629109621047974, + "num_tokens": 704569814.0, + "step": 27230 + }, + { + "epoch": 2.9904458598726116, + "grad_norm": 2.0050275325775146, + "learning_rate": 5e-06, + "loss": 0.7478, + "mean_token_accuracy": 0.7582281231880188, + "num_tokens": 704596126.0, + "step": 27231 + }, + { + "epoch": 2.990555677575225, + "grad_norm": 2.2097887992858887, + "learning_rate": 5e-06, + "loss": 0.7355, + "mean_token_accuracy": 0.7580123543739319, + "num_tokens": 704619326.0, + "step": 27232 + }, + { + "epoch": 2.9906654952778386, + "grad_norm": 1.890609860420227, + "learning_rate": 5e-06, + "loss": 0.6835, + "mean_token_accuracy": 0.7697599530220032, + "num_tokens": 704648745.0, + "step": 27233 + }, + { + "epoch": 2.9907753129804524, + "grad_norm": 2.1076836585998535, + "learning_rate": 5e-06, + "loss": 0.7139, + "mean_token_accuracy": 0.763696014881134, + "num_tokens": 704672991.0, + "step": 27234 + }, + { + "epoch": 2.990885130683066, + "grad_norm": 1.6957039833068848, + "learning_rate": 5e-06, + "loss": 0.7144, + "mean_token_accuracy": 0.7661174535751343, + "num_tokens": 704704559.0, + "step": 27235 + }, + { + "epoch": 2.99099494838568, + "grad_norm": 1.944464087486267, + "learning_rate": 5e-06, + "loss": 0.6748, + "mean_token_accuracy": 0.7800989151000977, + "num_tokens": 704729698.0, + "step": 27236 + }, + { + "epoch": 2.9911047660882932, + "grad_norm": 1.880913496017456, + "learning_rate": 5e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7325568795204163, + "num_tokens": 704761732.0, + "step": 27237 + }, + { + "epoch": 2.991214583790907, + "grad_norm": 1.9457627534866333, + "learning_rate": 5e-06, + "loss": 0.6869, + "mean_token_accuracy": 0.7736061811447144, + "num_tokens": 704788266.0, + "step": 27238 + }, + { + "epoch": 2.9913244014935207, + "grad_norm": 2.261552333831787, + "learning_rate": 5e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7356557846069336, + "num_tokens": 704812709.0, + "step": 27239 + }, + { + "epoch": 2.9914342191961345, + "grad_norm": 2.4836623668670654, + "learning_rate": 5e-06, + "loss": 0.7101, + "mean_token_accuracy": 0.7676601409912109, + "num_tokens": 704833006.0, + "step": 27240 + }, + { + "epoch": 2.9915440368987483, + "grad_norm": 2.1358907222747803, + "learning_rate": 5e-06, + "loss": 0.7412, + "mean_token_accuracy": 0.7615803480148315, + "num_tokens": 704855813.0, + "step": 27241 + }, + { + "epoch": 2.9916538546013616, + "grad_norm": 1.8726813793182373, + "learning_rate": 5e-06, + "loss": 0.755, + "mean_token_accuracy": 0.7551174163818359, + "num_tokens": 704887152.0, + "step": 27242 + }, + { + "epoch": 2.9917636723039753, + "grad_norm": 2.2567577362060547, + "learning_rate": 5e-06, + "loss": 0.6084, + "mean_token_accuracy": 0.7923287153244019, + "num_tokens": 704908843.0, + "step": 27243 + }, + { + "epoch": 2.991873490006589, + "grad_norm": 1.972037672996521, + "learning_rate": 5e-06, + "loss": 0.7447, + "mean_token_accuracy": 0.7557575702667236, + "num_tokens": 704936221.0, + "step": 27244 + }, + { + "epoch": 2.991983307709203, + "grad_norm": 2.2703685760498047, + "learning_rate": 5e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7646374702453613, + "num_tokens": 704959020.0, + "step": 27245 + }, + { + "epoch": 2.9920931254118166, + "grad_norm": 2.2234745025634766, + "learning_rate": 5e-06, + "loss": 0.6746, + "mean_token_accuracy": 0.7714724540710449, + "num_tokens": 704983571.0, + "step": 27246 + }, + { + "epoch": 2.99220294311443, + "grad_norm": 2.027804374694824, + "learning_rate": 5e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7568819522857666, + "num_tokens": 705009127.0, + "step": 27247 + }, + { + "epoch": 2.9923127608170437, + "grad_norm": 1.951701045036316, + "learning_rate": 5e-06, + "loss": 0.738, + "mean_token_accuracy": 0.7645905017852783, + "num_tokens": 705039732.0, + "step": 27248 + }, + { + "epoch": 2.9924225785196574, + "grad_norm": 2.0835092067718506, + "learning_rate": 5e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7363581657409668, + "num_tokens": 705066845.0, + "step": 27249 + }, + { + "epoch": 2.992532396222271, + "grad_norm": 2.135129690170288, + "learning_rate": 5e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7536630034446716, + "num_tokens": 705091079.0, + "step": 27250 + }, + { + "epoch": 2.992642213924885, + "grad_norm": 1.877798080444336, + "learning_rate": 5e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7252389788627625, + "num_tokens": 705123847.0, + "step": 27251 + }, + { + "epoch": 2.9927520316274983, + "grad_norm": 1.8830511569976807, + "learning_rate": 5e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7392324209213257, + "num_tokens": 705154027.0, + "step": 27252 + }, + { + "epoch": 2.992861849330112, + "grad_norm": 2.2784881591796875, + "learning_rate": 5e-06, + "loss": 0.7247, + "mean_token_accuracy": 0.7579680681228638, + "num_tokens": 705175246.0, + "step": 27253 + }, + { + "epoch": 2.9929716670327258, + "grad_norm": 2.096236228942871, + "learning_rate": 5e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7479017376899719, + "num_tokens": 705201350.0, + "step": 27254 + }, + { + "epoch": 2.993081484735339, + "grad_norm": 2.105806589126587, + "learning_rate": 5e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7382412552833557, + "num_tokens": 705227701.0, + "step": 27255 + }, + { + "epoch": 2.993191302437953, + "grad_norm": 2.442216396331787, + "learning_rate": 5e-06, + "loss": 0.662, + "mean_token_accuracy": 0.778060257434845, + "num_tokens": 705246684.0, + "step": 27256 + }, + { + "epoch": 2.9933011201405666, + "grad_norm": 2.3334476947784424, + "learning_rate": 5e-06, + "loss": 0.7016, + "mean_token_accuracy": 0.7686840295791626, + "num_tokens": 705269029.0, + "step": 27257 + }, + { + "epoch": 2.9934109378431804, + "grad_norm": 2.144840955734253, + "learning_rate": 5e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.739534854888916, + "num_tokens": 705294415.0, + "step": 27258 + }, + { + "epoch": 2.993520755545794, + "grad_norm": 1.851082682609558, + "learning_rate": 5e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7299950122833252, + "num_tokens": 705327170.0, + "step": 27259 + }, + { + "epoch": 2.9936305732484074, + "grad_norm": 1.900270700454712, + "learning_rate": 5e-06, + "loss": 0.7451, + "mean_token_accuracy": 0.7543474435806274, + "num_tokens": 705355967.0, + "step": 27260 + }, + { + "epoch": 2.993740390951021, + "grad_norm": 1.9991860389709473, + "learning_rate": 5e-06, + "loss": 0.6517, + "mean_token_accuracy": 0.7857105731964111, + "num_tokens": 705380331.0, + "step": 27261 + }, + { + "epoch": 2.993850208653635, + "grad_norm": 2.167222261428833, + "learning_rate": 5e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7486938238143921, + "num_tokens": 705405301.0, + "step": 27262 + }, + { + "epoch": 2.9939600263562487, + "grad_norm": 2.167529821395874, + "learning_rate": 5e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7598354816436768, + "num_tokens": 705431363.0, + "step": 27263 + }, + { + "epoch": 2.9940698440588625, + "grad_norm": 2.0742130279541016, + "learning_rate": 5e-06, + "loss": 0.7152, + "mean_token_accuracy": 0.7668465971946716, + "num_tokens": 705455756.0, + "step": 27264 + }, + { + "epoch": 2.9941796617614758, + "grad_norm": 1.9098591804504395, + "learning_rate": 5e-06, + "loss": 0.7192, + "mean_token_accuracy": 0.7655229568481445, + "num_tokens": 705482895.0, + "step": 27265 + }, + { + "epoch": 2.9942894794640895, + "grad_norm": 1.9870567321777344, + "learning_rate": 5e-06, + "loss": 0.6975, + "mean_token_accuracy": 0.7671196460723877, + "num_tokens": 705509489.0, + "step": 27266 + }, + { + "epoch": 2.9943992971667033, + "grad_norm": 2.177856683731079, + "learning_rate": 5e-06, + "loss": 0.6914, + "mean_token_accuracy": 0.7741771340370178, + "num_tokens": 705532498.0, + "step": 27267 + }, + { + "epoch": 2.994509114869317, + "grad_norm": 2.0967190265655518, + "learning_rate": 5e-06, + "loss": 0.709, + "mean_token_accuracy": 0.7638257741928101, + "num_tokens": 705556449.0, + "step": 27268 + }, + { + "epoch": 2.994618932571931, + "grad_norm": 2.1840016841888428, + "learning_rate": 5e-06, + "loss": 0.7129, + "mean_token_accuracy": 0.7631098628044128, + "num_tokens": 705578429.0, + "step": 27269 + }, + { + "epoch": 2.994728750274544, + "grad_norm": 2.176767110824585, + "learning_rate": 5e-06, + "loss": 0.6996, + "mean_token_accuracy": 0.7683010101318359, + "num_tokens": 705601185.0, + "step": 27270 + }, + { + "epoch": 2.994838567977158, + "grad_norm": 1.9403417110443115, + "learning_rate": 5e-06, + "loss": 0.7089, + "mean_token_accuracy": 0.7663772702217102, + "num_tokens": 705628920.0, + "step": 27271 + }, + { + "epoch": 2.9949483856797716, + "grad_norm": 2.016296863555908, + "learning_rate": 5e-06, + "loss": 0.7441, + "mean_token_accuracy": 0.7694900035858154, + "num_tokens": 705656685.0, + "step": 27272 + }, + { + "epoch": 2.9950582033823854, + "grad_norm": 2.3780059814453125, + "learning_rate": 5e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.7782062292098999, + "num_tokens": 705677036.0, + "step": 27273 + }, + { + "epoch": 2.995168021084999, + "grad_norm": 1.9948005676269531, + "learning_rate": 5e-06, + "loss": 0.7311, + "mean_token_accuracy": 0.761532187461853, + "num_tokens": 705703171.0, + "step": 27274 + }, + { + "epoch": 2.9952778387876124, + "grad_norm": 2.073336601257324, + "learning_rate": 5e-06, + "loss": 0.6857, + "mean_token_accuracy": 0.7769038677215576, + "num_tokens": 705728764.0, + "step": 27275 + }, + { + "epoch": 2.995387656490226, + "grad_norm": 2.080009698867798, + "learning_rate": 5e-06, + "loss": 0.6491, + "mean_token_accuracy": 0.7749124765396118, + "num_tokens": 705753211.0, + "step": 27276 + }, + { + "epoch": 2.99549747419284, + "grad_norm": 2.1181581020355225, + "learning_rate": 5e-06, + "loss": 0.7061, + "mean_token_accuracy": 0.7669095396995544, + "num_tokens": 705778116.0, + "step": 27277 + }, + { + "epoch": 2.9956072918954533, + "grad_norm": 2.038081407546997, + "learning_rate": 5e-06, + "loss": 0.7301, + "mean_token_accuracy": 0.7594162225723267, + "num_tokens": 705802592.0, + "step": 27278 + }, + { + "epoch": 2.9957171095980675, + "grad_norm": 1.973935842514038, + "learning_rate": 5e-06, + "loss": 0.7178, + "mean_token_accuracy": 0.7630077004432678, + "num_tokens": 705830357.0, + "step": 27279 + }, + { + "epoch": 2.995826927300681, + "grad_norm": 2.2716221809387207, + "learning_rate": 5e-06, + "loss": 0.6921, + "mean_token_accuracy": 0.7719821929931641, + "num_tokens": 705852265.0, + "step": 27280 + }, + { + "epoch": 2.9959367450032945, + "grad_norm": 1.9674999713897705, + "learning_rate": 5e-06, + "loss": 0.7268, + "mean_token_accuracy": 0.7647334337234497, + "num_tokens": 705882432.0, + "step": 27281 + }, + { + "epoch": 2.9960465627059083, + "grad_norm": 2.0092644691467285, + "learning_rate": 5e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7431639432907104, + "num_tokens": 705909407.0, + "step": 27282 + }, + { + "epoch": 2.9961563804085216, + "grad_norm": 1.943157434463501, + "learning_rate": 5e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7528206706047058, + "num_tokens": 705937743.0, + "step": 27283 + }, + { + "epoch": 2.9962661981111354, + "grad_norm": 1.7069743871688843, + "learning_rate": 5e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7299611568450928, + "num_tokens": 705970976.0, + "step": 27284 + }, + { + "epoch": 2.996376015813749, + "grad_norm": 2.089383125305176, + "learning_rate": 5e-06, + "loss": 0.6836, + "mean_token_accuracy": 0.7745874524116516, + "num_tokens": 705995286.0, + "step": 27285 + }, + { + "epoch": 2.996485833516363, + "grad_norm": 2.1308720111846924, + "learning_rate": 5e-06, + "loss": 0.7439, + "mean_token_accuracy": 0.7558733820915222, + "num_tokens": 706018552.0, + "step": 27286 + }, + { + "epoch": 2.9965956512189766, + "grad_norm": 1.9605742692947388, + "learning_rate": 5e-06, + "loss": 0.683, + "mean_token_accuracy": 0.7751322388648987, + "num_tokens": 706044621.0, + "step": 27287 + }, + { + "epoch": 2.99670546892159, + "grad_norm": 1.9749577045440674, + "learning_rate": 5e-06, + "loss": 0.7165, + "mean_token_accuracy": 0.7697262763977051, + "num_tokens": 706071681.0, + "step": 27288 + }, + { + "epoch": 2.9968152866242037, + "grad_norm": 2.132744550704956, + "learning_rate": 5e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7502104043960571, + "num_tokens": 706098893.0, + "step": 27289 + }, + { + "epoch": 2.9969251043268175, + "grad_norm": 2.1983625888824463, + "learning_rate": 5e-06, + "loss": 0.6806, + "mean_token_accuracy": 0.7764120101928711, + "num_tokens": 706122056.0, + "step": 27290 + }, + { + "epoch": 2.9970349220294312, + "grad_norm": 2.3415846824645996, + "learning_rate": 5e-06, + "loss": 0.7183, + "mean_token_accuracy": 0.7748573422431946, + "num_tokens": 706140802.0, + "step": 27291 + }, + { + "epoch": 2.997144739732045, + "grad_norm": 1.9530916213989258, + "learning_rate": 5e-06, + "loss": 0.7079, + "mean_token_accuracy": 0.7680386900901794, + "num_tokens": 706169687.0, + "step": 27292 + }, + { + "epoch": 2.9972545574346583, + "grad_norm": 2.193337917327881, + "learning_rate": 5e-06, + "loss": 0.7071, + "mean_token_accuracy": 0.7689529657363892, + "num_tokens": 706193485.0, + "step": 27293 + }, + { + "epoch": 2.997364375137272, + "grad_norm": 1.9493427276611328, + "learning_rate": 5e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7614431381225586, + "num_tokens": 706220167.0, + "step": 27294 + }, + { + "epoch": 2.997474192839886, + "grad_norm": 1.8780348300933838, + "learning_rate": 5e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7397910356521606, + "num_tokens": 706252506.0, + "step": 27295 + }, + { + "epoch": 2.9975840105424996, + "grad_norm": 1.9839593172073364, + "learning_rate": 5e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7507234811782837, + "num_tokens": 706280827.0, + "step": 27296 + }, + { + "epoch": 2.9976938282451133, + "grad_norm": 2.183288097381592, + "learning_rate": 5e-06, + "loss": 0.7196, + "mean_token_accuracy": 0.7712891101837158, + "num_tokens": 706303872.0, + "step": 27297 + }, + { + "epoch": 2.9978036459477266, + "grad_norm": 2.0923264026641846, + "learning_rate": 5e-06, + "loss": 0.7463, + "mean_token_accuracy": 0.750248372554779, + "num_tokens": 706331428.0, + "step": 27298 + }, + { + "epoch": 2.9979134636503404, + "grad_norm": 1.9250670671463013, + "learning_rate": 5e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7601655125617981, + "num_tokens": 706358303.0, + "step": 27299 + }, + { + "epoch": 2.998023281352954, + "grad_norm": 1.965684175491333, + "learning_rate": 5e-06, + "loss": 0.7677, + "mean_token_accuracy": 0.7511122226715088, + "num_tokens": 706385112.0, + "step": 27300 + }, + { + "epoch": 2.998133099055568, + "grad_norm": 2.1881635189056396, + "learning_rate": 5e-06, + "loss": 0.7325, + "mean_token_accuracy": 0.7671431303024292, + "num_tokens": 706410867.0, + "step": 27301 + }, + { + "epoch": 2.9982429167581817, + "grad_norm": 2.012791395187378, + "learning_rate": 5e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.727428674697876, + "num_tokens": 706440412.0, + "step": 27302 + }, + { + "epoch": 2.998352734460795, + "grad_norm": 2.098051071166992, + "learning_rate": 5e-06, + "loss": 0.6688, + "mean_token_accuracy": 0.7748852372169495, + "num_tokens": 706464810.0, + "step": 27303 + }, + { + "epoch": 2.9984625521634087, + "grad_norm": 2.3029706478118896, + "learning_rate": 5e-06, + "loss": 0.6676, + "mean_token_accuracy": 0.7731438875198364, + "num_tokens": 706486446.0, + "step": 27304 + }, + { + "epoch": 2.9985723698660225, + "grad_norm": 1.9850438833236694, + "learning_rate": 5e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7470749616622925, + "num_tokens": 706518703.0, + "step": 27305 + }, + { + "epoch": 2.998682187568636, + "grad_norm": 1.957452416419983, + "learning_rate": 5e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7442525625228882, + "num_tokens": 706548453.0, + "step": 27306 + }, + { + "epoch": 2.9987920052712496, + "grad_norm": 1.9954606294631958, + "learning_rate": 5e-06, + "loss": 0.7274, + "mean_token_accuracy": 0.7640524506568909, + "num_tokens": 706575018.0, + "step": 27307 + }, + { + "epoch": 2.9989018229738633, + "grad_norm": 2.049574851989746, + "learning_rate": 5e-06, + "loss": 0.717, + "mean_token_accuracy": 0.764019250869751, + "num_tokens": 706602553.0, + "step": 27308 + }, + { + "epoch": 2.999011640676477, + "grad_norm": 2.281806230545044, + "learning_rate": 5e-06, + "loss": 0.624, + "mean_token_accuracy": 0.782729983329773, + "num_tokens": 706623616.0, + "step": 27309 + }, + { + "epoch": 2.999121458379091, + "grad_norm": 2.265284776687622, + "learning_rate": 5e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7472152709960938, + "num_tokens": 706646192.0, + "step": 27310 + }, + { + "epoch": 2.999231276081704, + "grad_norm": 2.195244312286377, + "learning_rate": 5e-06, + "loss": 0.6417, + "mean_token_accuracy": 0.7842955589294434, + "num_tokens": 706667274.0, + "step": 27311 + }, + { + "epoch": 2.999341093784318, + "grad_norm": 2.350149631500244, + "learning_rate": 5e-06, + "loss": 0.6992, + "mean_token_accuracy": 0.7615613341331482, + "num_tokens": 706689972.0, + "step": 27312 + }, + { + "epoch": 2.9994509114869317, + "grad_norm": 2.0877652168273926, + "learning_rate": 5e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7465739846229553, + "num_tokens": 706717776.0, + "step": 27313 + }, + { + "epoch": 2.9995607291895454, + "grad_norm": 2.0660107135772705, + "learning_rate": 5e-06, + "loss": 0.7328, + "mean_token_accuracy": 0.7572689056396484, + "num_tokens": 706742820.0, + "step": 27314 + }, + { + "epoch": 2.999670546892159, + "grad_norm": 1.9235029220581055, + "learning_rate": 5e-06, + "loss": 0.6815, + "mean_token_accuracy": 0.7699867486953735, + "num_tokens": 706774108.0, + "step": 27315 + }, + { + "epoch": 2.9997803645947725, + "grad_norm": 1.809798240661621, + "learning_rate": 5e-06, + "loss": 0.7362, + "mean_token_accuracy": 0.7536214590072632, + "num_tokens": 706804721.0, + "step": 27316 + }, + { + "epoch": 2.9998901822973862, + "grad_norm": 1.9827907085418701, + "learning_rate": 5e-06, + "loss": 0.6961, + "mean_token_accuracy": 0.7710881233215332, + "num_tokens": 706832128.0, + "step": 27317 + }, + { + "epoch": 3.0, + "grad_norm": 2.134927988052368, + "learning_rate": 5e-06, + "loss": 0.689, + "mean_token_accuracy": 0.7714504599571228, + "num_tokens": 706855928.0, + "step": 27318 + }, + { + "epoch": 3.0, + "step": 27318, + "total_flos": 3.182940295913485e+19, + "train_loss": 0.8493060141035994, + "train_runtime": 32137.5108, + "train_samples_per_second": 13.6, + "train_steps_per_second": 0.85 + } + ], + "logging_steps": 1, + "max_steps": 27318, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 13659, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.182940295913485e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..9bce34b --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c9a01f09e4a3303e4003d160fb65893744485efbebf27a9ede1568524888d9 +size 13329